## Import

In [267]:
# crawling
import requests as req
from user_agent import generate_navigator # 랜덤 헤더 생성 모듈
from bs4 import BeautifulSoup as bs

# to sign in
from accounts import jp
import json


# to dataframe
import pandas as pd


# async
import asyncio
import aiohttp


In [268]:
# 변수 선언부
JOBPLANET_URL = 'https://www.jobplanet.co.kr'
JOBPLANET_LOGIN_URL = JOBPLANET_URL + '/users/sign_in'
JOBPLANET_SEARCH_URL = JOBPLANET_URL + '/search?query={keyword}'
JOBPLANET_REVIEW_URL = JOBPLANET_URL + '/companies/{jp_comp_uid}/reviews?page={p}'
#SAVE_PATH = '/app/data/reviews/{comp_name}_job_planet.csv'

## code start

- login

In [269]:
# make request datas
def get_login_session():
    """
    잡플래닛 로그인을 시도해, request session 객체와 cookie 값을 반환합니다.
    
    returns ]
        session : request.sessions.Session  - 로그인 된 세션
        cookies : dict                      - 로그인 완료된 쿠키값
    """
    # ==============================
    # Header
    # ==============================
    # 랜덤으로 돌립니다.
    while True:
        HEADERS = generate_navigator(device_type="desktop", os=('mac', 'linux', 'win'))
        if None not in HEADERS.values():
            break
        
    # 나머지 헤더값 넣어주기
    HEADERS['Referer'] = 'https://www.jobplanet.co.kr/users/sign_in?_nav=gb'
    HEADERS['Origin'] = 'https://www.jobplanet.co.kr'
    HEADERS['Content-Type'] = 'application/json'
    HEADERS['Accpet'] = '*/*'
    HEADERS['Accpet-Encoding'] = 'gzip, deflate, br'
    HEADERS['Accept-Language'] = 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7'
    
    # ==============================
    # Payload
    # ==============================
    PAYLOAD = {
        'user': {
            'email': jp.ID,
            'password': jp.PW,
            'remember_me': False  
        }
    }
    
    # ==============================
    # make session to use cookie
    # ==============================
    session = req.Session()
    login_cookie = session.post(JOBPLANET_LOGIN_URL, data=json.dumps(PAYLOAD), headers=HEADERS).cookies.get_dict()
    # update cookies & headers
    session.cookies.update(login_cookie)
    session.headers.update(HEADERS)
    
    return session, session.cookies.get_dict() # 쿠키 따로 반환

In [270]:
# 확인용
session.cookies.get_dict()

{'__cf_bm': 'ORkqvvBwB5w91LOi_m2e88PX90Qzqpdoke3QV_uWxyE-1692851549-0-AX/kkze385t0YoQOUeqDnyVWg2iJBn95hjLr10YV4j8MpmxAM6kfOnU3BMdLL2CuRQ9OsP9h/tuJPMfVO8tfr3g=',
 '_intween_x2o_net_session': 'SW5hdGhjeEtoMDRKVExUKzJUdnJZS1orSlc0aWtaWUhkK05EODZvUUhFanU0b1prOFZOL3huZTdCYnZvZVAwY2tvSjVJRVVMa0xnU2pMd2xweWdWNlkvNHkyYlhsMFExNU1wZ3pJdmJUS044dkxzeGRoRmtacXUvMzBpZndpVTdnRUptWWp5ZjNzVVMxdmc5cmw2TXByT1Y2S240V0Z5cXJ5WHVUZkxLdjFFPS0tQ3VDMHptdmRnTDdHdkVQNE02eURldz09--40663360bc1aab642fb8a2f54fa542d4569f6fc1',
 '_jp_traffic_source': '%7B%22utm_campaign%22%3Anull%2C%22utm_medium%22%3Anull%2C%22utm_source%22%3Anull%7D',
 '_jp_visit_short_token': '1692851548879-7f78660e-e4b7-4a4b-a325-7898693d30f4',
 '_jp_visit_token': 'bdf8ea17-92b9-48f0-a052-fb897e489f99',
 '_jp_visitor_token': 'e28de7a4-5aef-40af-bfd6-bda9d5a5da09',
 'request_method': 'POST'}

In [271]:
def get_jobplanet_uid(headers, keyword):
    # 크롤링 준비
    res = req.get(JOBPLANET_SEARCH_URL.format(keyword=keyword), headers=headers)
    soup = bs(res.text, 'lxml')

    # 잡플래닛 내부 회사 ID 크롤링
    # b 태그 갖고 있는 a 태그만 추출 : 정확도 높은게 볼드체 처리됨
    a_tag = [el for el in soup.select('div.is_company_card a') if el.select_one('b')][0]

    # href format : /companies/{잡플래닛_회사ID}/info/{회사이름}?_rs_act=index&_rs_con=search&_rs_element=federated_search
    return a_tag.attrs['href'].split('/')[2]

In [272]:
def get_links_to_keyword(headers, jp_comp_uid):
    """
    제목까지 받아와야 하는 경우, 해당 함수의 주석과 get_news_list 함수의 주석을 해제 후 사용해야 합니다.
    parameter ]
        headers : dict  - header 정보
        keyword : str   - 해당 단어로 다음 검색(입력시 global 변수로 저장)
        
    return ]
        links   : list[str] - 리뷰 페이지들 링크
    """
    review_list = []
    p = 1
    
    for p in range(99):
        now_url = JOBPLANET_REVIEW_URL.format(jp_comp_uid=jp_comp_uid, p=p+1)
    
        res = req.get(now_url, headers=headers)
        soup = bs(res.text, 'lxml')
        
        # 리뷰 없는 경우
        if soup.select_one('article.no_result > .txt'): # '리뷰가 없습니다' 태그
            break
        
        # 통과한 경우 : 리뷰 있음, 리스트에 추가
        review_list.append(now_url)
    
    return review_list

In [259]:
def get_review_df(session, url):
    # 크롤링 준비
    # DataFrame 생성 list들
    position_list = []
    is_office_list = []
    review_date_list = []
    review_rate_list = []
    review_cont_list = []
    
    res = session.get(JOBPLANET_REVIEW_URL.format(jp_comp_uid=jp_comp_uid, v_page=page), headers=HEADERS)
    soup = bs(res.text, 'lxml')
    
    # =========================
    # 작성자 정보 크롤링
    # =========================
    reviewer_info_list = []
    # span.txt1 : [직무, (전직원|현직원), 지역, 작성 날짜] 의 연속으로 구성됨
    [reviewer_info_list.append(el.text.strip()) for el in soup.select('span.txt1')]

        
        
    # filtering from reviewer_info_list
    # 장점 / 단점 각각 정보가 들어가야 하기 때문에 두 번씩 들어가게 한다.
    for _ in range(2):
        position_list.extend(reviewer_info_list[::4])
        is_office_list.extend([False if el == '전직원' else True for el in reviewer_info_list[1::4]])
        # YYYY. MM -> YYYYMM
        review_date_list.extend([el.replace('. ', '') for el in reviewer_info_list[3::4]])
        
    # =========================
    # 리뷰 크롤링
    # =========================
    # 별점
    # width:{percent}% -> {percent} -> 1~5점 사이로 format
    # 장점 / 단점 각각 정보가 들어가야 하기 때문에 두 번씩 들어가게 한다.
    for _ in range(2):
        review_rate_list.extend([int(int(el.attrs['style'][6:-2]) / 20) for el in soup.select('div.star_score')])
    
    # 리뷰 내용 : react로 불러오기 때문에 데이터만 불러와 줍니다.
    review_cont_list.extend([el for el in soup.select('dt.merit+dd > span')]) # 장점
    review_cont_list.extend([el for el in soup.select('dt.disadvantages+dd > span')]) # 단점
    
    
    return position_list, is_office_list,  review_date_list, review_rate_list, review_cont_list

In [None]:
def filter_reviewer_info(one_reviewer_info):
    """
    한 리뷰어의 정보를 list로 받아, split해주는 함수
    parameters ]
        one_reviewer_info : list    - 리뷰어 한명의 정보
        
    return ]
        position    : str   - 직무
        is_office   : str   - 현직원/전직원 여부
        review_date : str   - 리뷰 작성 날짜
    """
    position = one_reviewer_info[0]
    is_office = False if one_reviewer_info[1] == '전직원' else True
    review_date = one_reviewer_info[3].replace('. ', '')
    
    return position, is_office, review_date

In [299]:
# 함수만 테스트
url = r'https://www.jobplanet.co.kr/companies/366153/reviews/%EB%A0%88%EB%AA%AC?page=14'

# 크롤링 준비
res = req_session.get(urls[0], headers=req_session.headers)
soup = bs(res.text, 'lxml')

# =========================
# 작성자 정보 크롤링
# =========================
reviewer_info_list = []
# span.txt1 : [직무, (전직원|현직원), 지역, 작성 날짜] 의 연속으로 구성됨
[reviewer_info_list.append(el.text.strip()) for el in soup.select('span.txt1')]


[['기획/경영', '전직원', '서울', '2023. 07'],
 ['미디어/홍보', '현직원', '서울', '2023. 06'],
 ['금융/재무', '현직원', '서울', '2023. 02'],
 ['개발', '현직원', '서울', '2023. 01'],
 ['개발', '현직원', '서울', '2023. 01']]

In [167]:
print('review_cont_list :::::', len(review_pos_list))
print('review_cont_list :::::', len(review_neg_list))
print('review_rate_list :::::', len(review_rate_list))
print('is_office_list :::::', len(is_office_list))
print('review_date_list :::::', len(review_date_list))
print('position_list :::::', len(position_list))


review_cont_list ::::: 5
review_cont_list ::::: 5
review_rate_list ::::: 136
is_office_list ::::: 136
review_date_list ::::: 136
position_list ::::: 136


In [257]:
async def get_content_list(cookies, urls):
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(ssl=False)
    ) as session:
        session.cookie_jar.update_cookies(cookies, response_url=None)
        result = await asyncio.gather(*[get_content_to_link(session, url) for url in urls]) # wrapping도 내부 처리
    
    return result

In [274]:
# 테스트
keyword = '지쿱'
req_session, req_cookies = get_login_session()
jp_comp_uid = get_jobplanet_uid(req_session.headers, keyword)
urls = get_links_to_keyword(req_session.headers, jp_comp_uid)

  for attr in list(attrs.keys()):


In [260]:
# 데이터 크롤링
result = asyncio.run(get_content_list(req_cookies, urls))

RuntimeError: asyncio.run() cannot be called from a running event loop