# Import Libraries

In [1]:
# Import the beautifulsoup 
# and request libraries of python.

import os

import requests
import bs4

from datetime import datetime
from time import sleep
from random import randint

import pandas as pd
import numpy as np

# functions which is special for this notebook
from string_operation import StringOperation

# Scrapping Section

We will scrap a search result from google for 500 content

In [2]:
input_text = 'linux'
search_text = input_text.replace(' ','+')

# define content_id
n_content = 500
# n content in google search per page is 10 contents
content_ids = np.arange(0, n_content, 10)

list_soup = []

# get start time
start_time = datetime.now()
for i, start_id in enumerate(content_ids):
    url = f'https://www.google.com/search?q={search_text}&start={start_id}'

    # Fetch the URL data
    request_result=requests.get( url )
    
    print(f'scrapping for page {i+1}, request status: {request_result} ...')

    # Creating soup from the fetched request
    soup = bs4.BeautifulSoup(request_result.text,
                             "html.parser")
    
    # append result per peage
    list_soup.append(soup)
    
    # pause search for evry iteration with random number
    # the more number the more delay
    # it is usefull to avoid security issue
    sleep(randint(2,10))
    
# get finish time
finish_time = datetime.now()
print('\nexecution time completed:', finish_time-start_time)

scrapping for page 1, request status: <Response [200]> ...
scrapping for page 2, request status: <Response [200]> ...
scrapping for page 3, request status: <Response [200]> ...
scrapping for page 4, request status: <Response [200]> ...
scrapping for page 5, request status: <Response [200]> ...
scrapping for page 6, request status: <Response [200]> ...
scrapping for page 7, request status: <Response [200]> ...
scrapping for page 8, request status: <Response [200]> ...
scrapping for page 9, request status: <Response [200]> ...
scrapping for page 10, request status: <Response [200]> ...
scrapping for page 11, request status: <Response [200]> ...
scrapping for page 12, request status: <Response [200]> ...
scrapping for page 13, request status: <Response [200]> ...
scrapping for page 14, request status: <Response [200]> ...
scrapping for page 15, request status: <Response [200]> ...
scrapping for page 16, request status: <Response [200]> ...
scrapping for page 17, request status: <Response 

all responses code are 200, so the get request is complete for all pages

# Dataframe from Scrapping Result

In [3]:
title_list = []
url_list = []
page_list = []
rank_list = []

# initial page
web_page = 1
rank_item = 1

# loop for all result
for i, soup in enumerate(list_soup):

    # get links in website page
    links = soup.find_all("a")

    # get url and website title
    for link in links:
        link_href = link.get('href')
        
        if ("url?q=" in link_href) and (not "webcache" in link_href):
            # get content title
            title = link.find_all('h3')
            
            if len(title) > 0:
                url_web = link.get('href').split("?q=")[1].split("&sa=U")[0]
                title_web = title[0].getText()
                
                title_list.extend([title_web])
                url_list.extend([url_web])
                page_list.extend([web_page])
                rank_list.extend([rank_item])
                
                rank_item+=1
    web_page+=1

In [4]:
# create dataframe
df_result = pd.DataFrame({'page number':page_list,
                   'rank':rank_list,
                   'url':url_list,
                   "original title":title_list})

# insert url doman
df_result.insert(loc=df_result.columns.get_loc('url'), column='domain', 
                 value=df_result['url'].str.split('/').str[2])

# translate title into english
df_result['english title'] = StringOperation.word_translation(df_result['original title'], src_lang='id', dest_lang='en')

display(df_result)

Unnamed: 0,page number,rank,domain,url,original title,english title
0,1,1,www.linux.org,https://www.linux.org/,Linux.org,Linux.org
1,1,2,id.wikipedia.org,https://id.wikipedia.org/wiki/Linux,"Linux - Wikipedia bahasa Indonesia, ensikloped...","Linux - Wikipedia Indonesian, the free encyclo..."
2,1,3,tekno.kompas.com,https://tekno.kompas.com/read/2022/07/27/12150...,"Apa Itu Linux? Mengenal Fungsi, Sejarah, serta...","What Is Linux? Get to know the Functions, Hist..."
3,1,4,diskominfo.kedirikab.go.id,https://diskominfo.kedirikab.go.id/baca/apa-it...,Apa itu Linux? - Kominfo Kab Kediri,What is Linux? - Kominfo Kediri District
4,1,5,ubuntu.com,https://ubuntu.com/,Ubuntu: Enterprise Open Source and Linux,Ubuntu: Enterprise Open Source and Linux
...,...,...,...,...,...,...
212,22,213,www.filemagz.com,https://www.filemagz.com/os-linux-terbaik/,"OS Linux Terbaik untuk Kamu Coba, Apa Saja Ya?...","The Best Linux OS for You to Try, What Are The..."
213,23,214,commons.wikimedia.org,https://commons.wikimedia.org/wiki/File:Linux_...,File:Linux Mint 20.3 (Una) Cinnamon.png - Wiki...,MYMEMORY WARNING: YOU USED ALL AVAILABLE FREE ...
214,23,215,www.linuxadictos.com,https://www.linuxadictos.com/en/linux-mint-20-...,"Linux Mint 20.3 now available, with Linux 5.4 ...",MYMEMORY WARNING: YOU USED ALL AVAILABLE FREE ...
215,23,216,www.plex.tv,https://www.plex.tv/media-server-downloads/,Media Server Downloads - Plex,MYMEMORY WARNING: YOU USED ALL AVAILABLE FREE ...


In [None]:
df_result.loc[216, 'english title']



## Save Result

In [5]:
# save data
file_path = os.path.join('dataset', "scrapping_result.csv")
df_result.to_csv(file_path, sep=',', index=False, quoting=1)