# Scraping Google News

We can use Google News search results to see what is being reported about a given keyword.
For example, this can be used to vet a potential merchant before doing business with them.

In [1]:
import crawler as cw
import pandas as pd
from bs4 import BeautifulSoup as bs

import time

In [2]:
# Create the single driver object that every scraping cell below reuses.
# NOTE(review): `cw` is a project-local helper module; what it configures is not visible here.
DRIVER = cw.initiate_crawler()

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36


In [3]:
def get_soup_news(driver, query):
    """Fetch the first Google News results page for ``query`` and parse it.

    Parameters
    ----------
    driver
        Browser driver exposing ``set_page_load_timeout``, ``get`` and
        ``page_source`` (presumably the Selenium WebDriver returned by
        ``cw.initiate_crawler()`` -- confirm against that helper).
    query
        Keyword or phrase to search for. It is converted with ``str`` and
        searched as an exact phrase (wrapped in double quotes).

    Returns
    -------
    BeautifulSoup
        The results page parsed with the ``lxml`` parser.
    """
    from urllib.parse import quote_plus

    # Encode the quoted phrase so characters such as '&', '#' or '+' in the
    # keyword cannot break out of the ``q`` parameter.
    encoded_query = quote_plus('"' + str(query) + '"')
    news_search_url = 'https://www.google.com/search?q=' + encoded_query + '&tbm=nws'

    # The timeout must be configured before navigating; setting it after
    # ``get`` would only affect later page loads.
    driver.set_page_load_timeout(10)
    driver.get(news_search_url)

    return bs(driver.page_source, 'lxml')

In [4]:
def extract_info(soup):
    """Extract title, description and link for each news result in ``soup``.

    A result is every ``<a class="WlydOe" href=...>`` element. The title is
    read from the first matching ``<div>`` among the known title class
    strings, and the description from the ``<div>`` with the description
    class.

    Parameters
    ----------
    soup
        Parsed results page (a BeautifulSoup object, as returned by
        ``get_soup_news``).

    Returns
    -------
    pandas.DataFrame
        One row per result with columns ``title``, ``desc`` and ``link``.
        A missing title becomes ``""`` and a missing description becomes
        ``"none"``. A page without results gives an empty frame that still
        has the three columns.
    """
    # Google has served the title div under both of these class strings.
    title_classes = ["mCBkyc y355M JQe2Ld nDgy9d", "mCBkyc y355M nDgy9d"]
    desc_class = "GI74Re nDgy9d"

    rows = []
    for anchor in soup.find_all('a', attrs={'class': 'WlydOe'}, href=True):
        # Try each known title class in order; stop at the first hit.
        title_node = None
        for candidate in title_classes:
            title_node = anchor.find('div', attrs={'class': candidate})
            if title_node is not None:
                break

        desc_node = anchor.find('div', attrs={'class': desc_class})

        rows.append({
            # Previously an unmatched title raised AttributeError on None and
            # aborted the whole page; fall back to an empty title instead.
            'title': title_node.get_text() if title_node is not None else "",
            'desc': desc_node.get_text() if desc_node is not None else "none",
            'link': anchor['href'],
        })

    # Build the frame once; explicit columns keep the schema for empty pages.
    return pd.DataFrame(rows, columns=['title', 'desc', 'link'])

## Testing

In [5]:
# Fetch and parse the first results page for a sample query.
news_soup = get_soup_news(DRIVER, "motogp mandalika")

In [6]:
# Turn the parsed page into a title/desc/link table and display it.
extract_info(news_soup)

Unnamed: 0,title,desc,link
0,5 Keunggulan Sirkuit Mandalika dari Negara Lai...,KEUNGGULAN Sirkuit Mandalika dari negara lain ...,https://sports.okezone.com/read/2022/03/18/38/...
1,"MotoGP Mandalika, Marc Marquez Ungkap Permasal...",Marc Marquez mengungkapkan permasalahan yang d...,https://www.kompas.com/motogp/read/2022/03/19/...
2,Hasil FP3 MotoGP Mandalika: Marc Marquez Kuasa...,Marc Marquez menguasai Free Practice 3 Pertami...,https://sport.detik.com/moto-gp/d-5990775/hasi...
3,Kecelakaan Parah di Latihan Bebas 2 MotoGP Man...,Marc Marquez mengalami kecelakaan parah saat m...,https://sports.okezone.com/read/2022/03/19/38/...
4,VIDEO: Motor Quartararo Mendadak Mogok di FP2 ...,"Pembalap Yamaha, Fabio Quartararo mengalami mo...",https://www.cnnindonesia.com/olahraga/20220319...
5,Kritik Tajam Jelang Kualifikasi MotoGP Mandali...,Alex Rins menyebut Sirkuit Mandalika yang meru...,https://sports.okezone.com/read/2022/03/18/38/...
6,Nonton Live Streaming MotoGP Mandalika 2022 Gr...,Ini dia link untuk nonton live streaming MotoG...,https://www.motorplus-online.com/read/25319206...
7,Kisah Pawang Hujan MotoGP Mandalika 2022,Untuk mengendalikan hujan dan panas dalam ajan...,https://www.cnnindonesia.com/olahraga/20220318...
8,Menilik Trofi MotoGP Mandalika Karya Anak Bang...,Menilik trofi-trofi MotoGP Mandalika 2022 yang...,https://www.bola.net/otomotif/menilik-trofi-mo...
9,Awas! Calo Tiket MotoGP Mandalika Berkeliaran ...,none,https://otomotif.tempo.co/read/1572602/awas-ca...


This looks okay, but it only returns the first page of results. Let's handle multiple pages next, starting with the normal scenario.

## Handling multiple pages 

In [7]:
def get_soup_multiple(driver, query, pages):
    """Fetch and parse several consecutive Google News result pages.

    Page ``i`` (0-based) is requested with ``start=i*10``.

    Parameters
    ----------
    driver
        Browser driver exposing ``set_page_load_timeout``, ``get`` and
        ``page_source`` (presumably the Selenium WebDriver returned by
        ``cw.initiate_crawler()`` -- confirm against that helper).
    query
        Keyword or phrase to search for. It is converted with ``str`` and
        searched as an exact phrase (wrapped in double quotes).
    pages : int
        Number of result pages to fetch.

    Returns
    -------
    list
        One BeautifulSoup object per fetched page, in page order.
    """
    from urllib.parse import quote_plus

    # Encode the quoted phrase once so characters such as '&', '#' or '+'
    # in the keyword cannot break out of the ``q`` parameter.
    encoded_query = quote_plus('"' + str(query) + '"')

    # The timeout must be configured before navigating; it then applies to
    # every page load in the loop.
    driver.set_page_load_timeout(10)

    soups = []
    for i in range(pages):
        search = ('https://www.google.com/search?q=' + encoded_query
                  + '&tbm=nws&start=' + str(i * 10))
        driver.get(search)

        # Throttle: a longer pause after every fifth page, a short one
        # otherwise. The original tested ``pages % 5`` (the loop-invariant
        # total), so every iteration took the same branch.
        if (i + 1) % 5 == 0:
            time.sleep(2)
        else:
            time.sleep(0.1)

        soups.append(bs(driver.page_source, 'lxml'))

    return soups

In [8]:
def extract_info_multiple(soup):
    """Extract title, description and link for each news result in ``soup``.

    Same contract as the single-page extractor, but the title is located
    with a CSS substring selector because the full title class string keeps
    changing.

    Parameters
    ----------
    soup
        Parsed results page (a BeautifulSoup object, e.g. one element of the
        list returned by ``get_soup_multiple``).

    Returns
    -------
    pandas.DataFrame
        One row per result with columns ``title``, ``desc`` and ``link``.
        A missing title becomes ``""`` and a missing description becomes
        ``"none"``. A page without results gives an empty frame that still
        has the three columns.
    """
    desc_class = "GI74Re nDgy9d"

    rows = []
    for anchor in soup.find_all('a', attrs={'class': 'WlydOe'}, href=True):
        # ``select_one`` returns None when nothing matches. The original
        # indexed ``select(...)[0]``, which raised IndexError before the
        # empty-title fallback could ever run.
        title_node = anchor.select_one('div[class*="mCBkyc"]')
        desc_node = anchor.find('div', attrs={'class': desc_class})

        rows.append({
            'title': title_node.get_text() if title_node is not None else "",
            'desc': desc_node.get_text() if desc_node is not None else "none",
            'link': anchor['href'],
        })

    # Build the frame once; explicit columns keep the schema for empty pages.
    return pd.DataFrame(rows, columns=['title', 'desc', 'link'])

In [9]:
# Fetch 10 result pages (start offsets 0, 10, ..., 90) for the same sample query.
soups = get_soup_multiple(DRIVER, 'motogp mandalika', 10)

In [10]:
# Parse every fetched page into its own table ...
list_info = [extract_info_multiple(page_soup) for page_soup in soups]

# ... then stack the per-page tables into one frame with a fresh 0..n-1 index.
all_result = pd.concat(list_info, ignore_index=True)

In [11]:
# Display the combined table (the rendered output elides the middle rows).
all_result

Unnamed: 0,title,desc,link
0,5 Keunggulan Sirkuit Mandalika dari Negara Lai...,KEUNGGULAN Sirkuit Mandalika dari negara lain ...,https://sports.okezone.com/read/2022/03/18/38/...
1,"MotoGP Mandalika, Marc Marquez Ungkap Permasal...",Marc Marquez mengungkapkan permasalahan yang d...,https://www.kompas.com/motogp/read/2022/03/19/...
2,Hasil FP3 MotoGP Mandalika: Marc Marquez Kuasa...,Marc Marquez menguasai Free Practice 3 Pertami...,https://sport.detik.com/moto-gp/d-5990775/hasi...
3,Kecelakaan Parah di Latihan Bebas 2 MotoGP Man...,Marc Marquez mengalami kecelakaan parah saat m...,https://sports.okezone.com/read/2022/03/19/38/...
4,VIDEO: Motor Quartararo Mendadak Mogok di FP2 ...,"Pembalap Yamaha, Fabio Quartararo mengalami mo...",https://www.cnnindonesia.com/olahraga/20220319...
...,...,...,...
97,"Tiket MotoGP Mandalika Ludes Terjual, Tinggal ...","Tiket MotoGP Mandalika Ludes Terjual, Tinggal ...",https://www.bola.com/moto-gp/read/4913285/tike...
98,Jadwal MotoGP Mandalika 2022 Indonesia: Jadila...,MotoGP Mandalika 2022 ini bisa disaksikan live...,https://www.bola.net/otomotif/jadwal-motogp-ma...
99,Jadwal Lengkap MotoGP Mandalika 2022: Akhir Pe...,Jadwal Lengkap MotoGP Mandalika 2022: Akhir Pe...,https://www.liputan6.com/bola/read/4912399/jad...
100,Jadwal Siaran & Jam Tayang MotoGP Mandalika 20...,Jadwal Live Streaming MotoGP Mandalika 2022 vi...,https://banjarmasin.tribunnews.com/2022/03/16/...


In [12]:
# Hand the driver back to the crawler helper for shutdown once scraping is finished.
cw.shutdown_crawler(DRIVER)

### Cautions

After many consecutive requests, Google may flag the traffic as suspicious and ask for CAPTCHA verification. If this happens, stop scraping for a while or use a proxy.