# Web Crawler

In [1]:
import math
import time

# A compendium of commonly-used regular expressions.
import re

# The fundamental package for scientific computing with Python.
import numpy as np

# Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
import pandas as pd

In [2]:
# A Fast, Extensible Progress Bar for Python and CLI
from tqdm import tqdm

# A simple, yet elegant HTTP library.
import requests

# A browser automation framework and ecosystem.
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Beautiful Soup is a library that makes it easy to scrape information from web pages.
from bs4 import BeautifulSoup

In [3]:
# The old socket.ssl() support for TLS over sockets is being superseded in Python 2.6 by a new ‘ssl’ module. This package brings that module to older Python releases, 2.3.5 and up (it may also work on older versions of 2.3, but we haven’t tried it).
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver',options=options)
driver =webdriver.Chrome('chromedriver',options=options)

In [4]:


params = {
    'section': 'qna', # kin, qna, ency
    'period': '1w', # 1w, 1m, 2002.01.01.%7C2020.12.09.
    'page': 1,
    'query': '여자친구+선물',
}
params['query'] = re.sub(' ', '+', params['query'])

endpoint = 'https://kin.naver.com/search/list.nhn?section={}&period={}&page={}&query={}'\
    .format(params['section'], params['period'], params['page'], params['query'])

driver.get(endpoint)

In [5]:

WebDriverWait(driver, 10)

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

In [6]:
print(soup.select_one('h2 span.number'))

<span class="number">(<em>1-10/33</em>)</span>


In [7]:

# paging
page_tag = soup.select_one('h2 span.number').get_text(strip=True)
page_txt = re.sub('\(|\)', '', page_tag).split('/')
num_of_rows = int(page_txt[0].split('-')[1])
total_count = int(page_txt[1])
total_page = math.ceil(total_count / num_of_rows)

In [8]:

# series
dataset = []

for page_no in tqdm(np.arange(1, total_page+1)):
    try:

        params.update({'page': page_no})

        endpoint = 'https://kin.naver.com/search/list.nhn?section={}&period={}&page={}&query={}'\
            .format(params['section'], params['period'], params['page'], params['query'])

        driver.get(endpoint)
        

        # parse
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        menuitem = soup.select('#s_content ul.basic1 li')

        for item in menuitem:
            try:

                element = item.select_one('._searchListTitleAnchor')
                text = element.get_text(strip=True)
                href = element.get('href')

                data = {}
                data.update({ 'title':text, 'href':href })

                for arr in re.sub('.*\?', '', href).split('&'):
                    txt = arr.split('=')
                    data.update({ txt[0]:txt[1] })

                dataset.append(data)

            except Exception as e:
                print(e)

    except Exception as e:
        print(page_no, e)

# quit() is a webdriver command which calls the driver.dispose method, which in turn closes all the browser windows and terminates the WebDriver session. If we do not use quit() at the end of program, the WebDriver session will not be closed properly and the files will not be cleared off memory. This may result in memory leak errors.
driver.quit()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.07it/s]


In [9]:

# dataframe
df1 = pd.DataFrame(dataset)

In [10]:
df1

Unnamed: 0,title,href,d1id,dirId,docId,qb,enc,section,rank,search_sort,spq
0,저렴한곳 추천좀 (여자친구선물해줄거임),https://kin.naver.com/qna/detail.naver?d1id=5&...,5,5010601,437814039,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,1,0,0
1,여자친구 선물,https://kin.naver.com/qna/detail.naver?d1id=5&...,5,5040302,437181994,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,2,0,0
2,20대여자친구 선물,https://kin.naver.com/qna/detail.naver?d1id=5&...,5,50402,437767955,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,3,0,0
3,여자친구 선물,https://kin.naver.com/qna/detail.naver?d1id=8&...,8,80101,437543960,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,4,0,0
4,여자친구 선물,https://kin.naver.com/qna/detail.naver?d1id=8&...,8,80101,437420908,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,5,0,0
5,"여자친구 선물을 미리 말하게된다면,",https://kin.naver.com/qna/detail.naver?d1id=8&...,8,80101,437789447,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,6,0,0
6,중2여자친구 선물,https://kin.naver.com/qna/detail.naver?d1id=8&...,8,8030205,437591042,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,7,0,0
7,20대 후반 여자 선물 골라주세요!,https://kin.naver.com/qna/detail.naver?d1id=8&...,8,80101,437852283,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,8,0,0
8,나이키 에어포스,https://kin.naver.com/qna/detail.naver?d1id=8&...,8,8040202,437573724,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,9,0,0
9,컨버스 척 70 로우 사이즈 5업 해도...,https://kin.naver.com/qna/detail.naver?d1id=8&...,8,8040202,437702098,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,10,0,0


In [14]:
# the old driver has been quit, so we need to init a new one

from selenium.webdriver.common.by import By

driver =webdriver.Chrome('chromedriver',options=options)
# series
dataset = []

for page_no in tqdm(df1.index):
    try:
        endpoint = df1['href'][page_no]
        
        print(endpoint)
        
        driver.get(endpoint)

        # parse
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # readmore
        num_of_more = int(soup.select_one('#nextPageButton ._currentAnswerCount').get_text(strip=True))
        total_more_count = int(soup.select_one('#nextPageButton ._totalAnswerCount').get_text(strip=True))
        total_more = math.floor(total_more_count / num_of_more)
        
        
        time.sleep(5)
        
        for idx in np.arange(0, total_more):
            driver.find_element(By.CSS_SELECTOR,"#nextPageButton").click()
            time.sleep(1)

        # data
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        menuitem = soup.select('.answer-content__list .answer-content__item')

        for item in menuitem:
            try:

                element = item.select_one('._endContentsText')
                content = element.get_text(strip=True)

                data = {}
                data.update({ 'content':content })

                for arr in re.sub('.*\?', '', endpoint).split('&'):
                    txt = arr.split('=')
                    data.update({ txt[0]:txt[1] })

                dataset.append(data)

            except Exception as e:
                print(e)

    except Exception as e:
        print(page_no, e)

# quit() is a webdriver command which calls the driver.dispose method, which in turn closes all the browser windows and terminates the WebDriver session. If we do not use quit() at the end of program, the WebDriver session will not be closed properly and the files will not be cleared off memory. This may result in memory leak errors.
driver.quit()

# dataframe
df2 = pd.DataFrame(dataset)

  0%|                                                                                                                                                                                          | 0/33 [00:00<?, ?it/s]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=5010601&docId=437814039&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=1&search_sort=0&spq=0


  3%|█████▍                                                                                                                                                                            | 1/33 [00:07<04:06,  7.72s/it]

0 Message: element not interactable
  (Session info: headless chrome=109.0.5414.119)
Stacktrace:
0   chromedriver                        0x000000010080f0fc chromedriver + 4223228
1   chromedriver                        0x0000000100797284 chromedriver + 3732100
2   chromedriver                        0x000000010044b448 chromedriver + 275528
3   chromedriver                        0x0000000100488660 chromedriver + 525920
4   chromedriver                        0x000000010047b824 chromedriver + 473124
5   chromedriver                        0x000000010047affc chromedriver + 471036
6   chromedriver                        0x00000001004bf360 chromedriver + 750432
7   chromedriver                        0x0000000100479748 chromedriver + 464712
8   chromedriver                        0x000000010047a7f0 chromedriver + 468976
9   chromedriver                        0x00000001007dfe08 chromedriver + 4029960
10  chromedriver                        0x00000001007e3698 chromedriver + 4044440
11  chro

  6%|██████████▊                                                                                                                                                                       | 2/33 [00:16<04:13,  8.19s/it]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=50402&docId=437767955&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=3&search_sort=0&spq=0


  9%|████████████████▏                                                                                                                                                                 | 3/33 [00:21<03:30,  7.00s/it]

2 Message: element not interactable
  (Session info: headless chrome=109.0.5414.119)
Stacktrace:
0   chromedriver                        0x000000010080f0fc chromedriver + 4223228
1   chromedriver                        0x0000000100797284 chromedriver + 3732100
2   chromedriver                        0x000000010044b448 chromedriver + 275528
3   chromedriver                        0x0000000100488660 chromedriver + 525920
4   chromedriver                        0x000000010047b824 chromedriver + 473124
5   chromedriver                        0x000000010047affc chromedriver + 471036
6   chromedriver                        0x00000001004bf360 chromedriver + 750432
7   chromedriver                        0x0000000100479748 chromedriver + 464712
8   chromedriver                        0x000000010047a7f0 chromedriver + 468976
9   chromedriver                        0x00000001007dfe08 chromedriver + 4029960
10  chromedriver                        0x00000001007e3698 chromedriver + 4044440
11  chro

 12%|█████████████████████▌                                                                                                                                                            | 4/33 [00:29<03:26,  7.10s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80101&docId=437420908&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=5&search_sort=0&spq=0


 15%|██████████████████████████▉                                                                                                                                                       | 5/33 [00:35<03:15,  6.97s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80101&docId=437789447&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=6&search_sort=0&spq=0


 18%|████████████████████████████████▎                                                                                                                                                 | 6/33 [00:41<02:59,  6.64s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=8030205&docId=437591042&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=7&search_sort=0&spq=0


 21%|█████████████████████████████████████▊                                                                                                                                            | 7/33 [00:47<02:48,  6.47s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80101&docId=437852283&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=8&search_sort=0&spq=0


 24%|███████████████████████████████████████████▏                                                                                                                                      | 8/33 [00:53<02:37,  6.29s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=8040202&docId=437573724&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=9&search_sort=0&spq=0


 27%|████████████████████████████████████████████████▌                                                                                                                                 | 9/33 [00:59<02:27,  6.13s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=8040202&docId=437702098&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=10&search_sort=0&spq=0


 30%|█████████████████████████████████████████████████████▋                                                                                                                           | 10/33 [01:05<02:16,  5.95s/it]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=5040107&docId=437667296&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=11&search_sort=0&spq=0


 33%|███████████████████████████████████████████████████████████                                                                                                                      | 11/33 [01:10<02:08,  5.85s/it]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=5040107&docId=437667130&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=12&search_sort=0&spq=0


 36%|████████████████████████████████████████████████████████████████▎                                                                                                                | 12/33 [01:16<02:01,  5.77s/it]

https://kin.naver.com/qna/detail.naver?d1id=2&dirId=20902&docId=437448833&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=13&search_sort=0&spq=0


 39%|█████████████████████████████████████████████████████████████████████▋                                                                                                           | 13/33 [01:21<01:54,  5.71s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80107&docId=437900876&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=14&search_sort=0&spq=0


 42%|███████████████████████████████████████████████████████████████████████████                                                                                                      | 14/33 [01:27<01:49,  5.76s/it]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=5010601&docId=437847491&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=15&search_sort=0&spq=0


 45%|████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 15/33 [01:33<01:42,  5.70s/it]

https://kin.naver.com/qna/detail.naver?d1id=2&dirId=20208&docId=437920640&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=16&search_sort=0&spq=0


 48%|█████████████████████████████████████████████████████████████████████████████████████▊                                                                                           | 16/33 [01:38<01:35,  5.63s/it]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=50401&docId=436946728&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=17&search_sort=0&spq=0


 52%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                     | 17/33 [01:44<01:29,  5.62s/it]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=50302&docId=437602260&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=18&search_sort=0&spq=0


 55%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                | 18/33 [01:50<01:26,  5.77s/it]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=50402&docId=437423767&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=19&search_sort=0&spq=0


 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 19/33 [01:56<01:21,  5.85s/it]

18 Message: element not interactable
  (Session info: headless chrome=109.0.5414.119)
Stacktrace:
0   chromedriver                        0x000000010080f0fc chromedriver + 4223228
1   chromedriver                        0x0000000100797284 chromedriver + 3732100
2   chromedriver                        0x000000010044b448 chromedriver + 275528
3   chromedriver                        0x0000000100488660 chromedriver + 525920
4   chromedriver                        0x000000010047b824 chromedriver + 473124
5   chromedriver                        0x000000010047affc chromedriver + 471036
6   chromedriver                        0x00000001004bf360 chromedriver + 750432
7   chromedriver                        0x0000000100479748 chromedriver + 464712
8   chromedriver                        0x000000010047a7f0 chromedriver + 468976
9   chromedriver                        0x00000001007dfe08 chromedriver + 4029960
10  chromedriver                        0x00000001007e3698 chromedriver + 4044440
11  chr

 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 20/33 [02:02<01:14,  5.75s/it]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=5040302&docId=437386020&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=21&search_sort=0&spq=0


 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 21/33 [02:07<01:08,  5.74s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80101&docId=437586188&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=22&search_sort=0&spq=0


 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 22/33 [02:15<01:07,  6.16s/it]

https://kin.naver.com/qna/detail.naver?d1id=5&dirId=5040201&docId=437316911&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=23&search_sort=0&spq=0


 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 23/33 [02:20<00:59,  5.99s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=8020306&docId=437615253&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=24&search_sort=0&spq=0


 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 24/33 [02:26<00:52,  5.87s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=8040204&docId=437617221&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=25&search_sort=0&spq=0


 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 25/33 [02:31<00:46,  5.80s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80107&docId=437796760&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=26&search_sort=0&spq=0


 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 26/33 [02:39<00:43,  6.22s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80107&docId=437726720&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=27&search_sort=0&spq=0


 82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 27/33 [02:46<00:39,  6.53s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80101&docId=437741360&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=28&search_sort=0&spq=0


 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                          | 28/33 [02:53<00:33,  6.69s/it]

'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
https://kin.naver.com/qna/detail.naver?d1id=8&dirId=8040202&docId=436568033&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=29&search_sort=0&spq=0


 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 29/33 [02:59<00:25,  6.40s/it]

28 Message: element not interactable
  (Session info: headless chrome=109.0.5414.119)
Stacktrace:
0   chromedriver                        0x000000010080f0fc chromedriver + 4223228
1   chromedriver                        0x0000000100797284 chromedriver + 3732100
2   chromedriver                        0x000000010044b448 chromedriver + 275528
3   chromedriver                        0x0000000100488660 chromedriver + 525920
4   chromedriver                        0x000000010047b824 chromedriver + 473124
5   chromedriver                        0x000000010047affc chromedriver + 471036
6   chromedriver                        0x00000001004bf360 chromedriver + 750432
7   chromedriver                        0x0000000100479748 chromedriver + 464712
8   chromedriver                        0x000000010047a7f0 chromedriver + 468976
9   chromedriver                        0x00000001007dfe08 chromedriver + 4029960
10  chromedriver                        0x00000001007e3698 chromedriver + 4044440
11  chr

 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 30/33 [03:04<00:18,  6.18s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80107&docId=437791300&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=31&search_sort=0&spq=0


 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 31/33 [03:11<00:13,  6.50s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80603&docId=437433089&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=32&search_sort=0&spq=0


 97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 32/33 [03:17<00:06,  6.33s/it]

https://kin.naver.com/qna/detail.naver?d1id=8&dirId=80101&docId=437635617&qb=7Jes7J6Q7Lmc6rWsIOyEoOusvA==&enc=utf8&section=kin.qna&rank=33&search_sort=0&spq=0


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 33/33 [03:23<00:00,  6.17s/it]


In [12]:
df2

Unnamed: 0,content,d1id,dirId,docId,qb,enc,section,rank,search_sort,spq
0,서프라이즈로 하세요그게 감동 두배입니다,8,80101,437789447,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,6,0,0
1,당연히 서프라이즈해야죠.안그럼..그날 또 다른 무언가를 사야해요...ㅎㅎㅎㅎ,8,80101,437789447,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,6,0,0
2,오면 주는게 좋죠 ㅎㅎ 서프라이즈,8,80101,437789447,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,6,0,0
3,안녕하세요!연애 전문 어썸미 상담소입니다 :)​미리 상의하지 않고 준비하신 선물이라...,8,80101,437789447,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,6,0,0
4,중학생입장에서 좀 부담스러울 수도 있을 것 같아요 향수나 핸드크림은 어떤가요,8,8030205,437591042,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,7,0,0
5,안녕하세요​좀 부담스럽게 느낄수 있을꺼 같네요​목걸이를 선물 해보시는건 어떨까요?​...,8,8030205,437591042,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,7,0,0
6,​​​​예쁜 악세사리 선물은 어떠신가요?​​​​준샵 목걸이 2가지 추천합니다.​​​...,8,8030205,437591042,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,7,0,0
7,파이토젠 퀸플러스 좋아요​여자친구가 생리통이 심해서 걱정하며사줬던 영양제 선물인데생...,8,80101,437852283,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,8,0,0
8,같은 제품이지만 하단에 있는 제품명 앞에(W)는 여성용입니다.그러므로 하단의 제품으...,8,8040202,437573724,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,9,0,0
9,넵 컨버스는 다른 신발들 보다 작게 나왔다고 느꼈고​그래서 최소 반업은 하시는 것을...,8,8040202,437702098,7Jes7J6Q7Lmc6rWsIOyEoOusvA,utf8,kin.qna,10,0,0


In [13]:
# export
df = pd.merge(df1, df2, on='docId', how='right')
df = df[['docId', 'title', 'content', 'href']]
df

Unnamed: 0,docId,title,content,href
0,437789447,"여자친구 선물을 미리 말하게된다면,",서프라이즈로 하세요그게 감동 두배입니다,https://kin.naver.com/qna/detail.naver?d1id=8&...
1,437789447,"여자친구 선물을 미리 말하게된다면,",당연히 서프라이즈해야죠.안그럼..그날 또 다른 무언가를 사야해요...ㅎㅎㅎㅎ,https://kin.naver.com/qna/detail.naver?d1id=8&...
2,437789447,"여자친구 선물을 미리 말하게된다면,",오면 주는게 좋죠 ㅎㅎ 서프라이즈,https://kin.naver.com/qna/detail.naver?d1id=8&...
3,437789447,"여자친구 선물을 미리 말하게된다면,",안녕하세요!연애 전문 어썸미 상담소입니다 :)​미리 상의하지 않고 준비하신 선물이라...,https://kin.naver.com/qna/detail.naver?d1id=8&...
4,437591042,중2여자친구 선물,중학생입장에서 좀 부담스러울 수도 있을 것 같아요 향수나 핸드크림은 어떤가요,https://kin.naver.com/qna/detail.naver?d1id=8&...
5,437591042,중2여자친구 선물,안녕하세요​좀 부담스럽게 느낄수 있을꺼 같네요​목걸이를 선물 해보시는건 어떨까요?​...,https://kin.naver.com/qna/detail.naver?d1id=8&...
6,437591042,중2여자친구 선물,​​​​예쁜 악세사리 선물은 어떠신가요?​​​​준샵 목걸이 2가지 추천합니다.​​​...,https://kin.naver.com/qna/detail.naver?d1id=8&...
7,437852283,20대 후반 여자 선물 골라주세요!,파이토젠 퀸플러스 좋아요​여자친구가 생리통이 심해서 걱정하며사줬던 영양제 선물인데생...,https://kin.naver.com/qna/detail.naver?d1id=8&...
8,437573724,나이키 에어포스,같은 제품이지만 하단에 있는 제품명 앞에(W)는 여성용입니다.그러므로 하단의 제품으...,https://kin.naver.com/qna/detail.naver?d1id=8&...
9,437702098,컨버스 척 70 로우 사이즈 5업 해도...,넵 컨버스는 다른 신발들 보다 작게 나왔다고 느꼈고​그래서 최소 반업은 하시는 것을...,https://kin.naver.com/qna/detail.naver?d1id=8&...
