# Goal: Find data source

### Requests and BeautifulSoup
https://stackoverflow.com/questions/61064420/how-to-print-the-number-of-google-search-results-beautifulsoup

In [70]:
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup

# Read the notebook configuration from settings.yml
with open(r'../settings.yml') as settings_file:
    # full_load parses the YAML document into plain Python objects
    # (nested dictionaries here, as the lookups below show)
    settings = yaml.full_load(settings_file)

# The file only needs to stay open while parsing; pick out the values afterwards
PROJECT_DIR = settings['project']['root_dir']

query_settings = settings['query']
KEYWORDS    = query_settings['keywords']
USER_AGENT  = query_settings['user_agent']
GOOGLE_URL  = query_settings['google_url']

print("Project dir\t{}\nKeywords\t{}".format(PROJECT_DIR, KEYWORDS))

Project dir	C:/Users/phili/Google Drive/Projekter/Google keywords/google_results_count
Keywords	['ESG', 'sustainable finance', 'responsible investment', 'impact investing', 'green finance', 'sustainable investment', 'socially responsible investment']


In [55]:
def get_search_urls(keyword_list, url="https://www.google.com/search?q="):
    """Compose one search URL per keyword.

    Every keyword is encoded as a query-string value: spaces become '+' and
    reserved characters such as '&', '#' or '+' are percent-encoded, so they
    cannot break or truncate the query.

    Parameters
    ----------
    keyword_list : iterable of str
        Search phrases, e.g. ``['ESG', 'green finance']``.
    url : str, optional
        Base search URL that each encoded keyword is appended to.

    Returns
    -------
    list of str
        ``url`` followed by the encoded keyword, in the order of
        ``keyword_list``.
    """
    return [url + quote_plus(kw) for kw in keyword_list]
    
def get_results_count(keyword, user_agent, timeout=10):
    """Request a search results page and return the reported number of hits.

    Parameters
    ----------
    keyword : str
        The full search URL to request. Despite the name, this value is
        passed unchanged to ``requests.get``.
    user_agent : dict
        Passed as the ``headers`` argument of ``requests.get``
        (presumably ``{'User-Agent': ...}`` -- confirm against settings.yml).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    int
        The number found in the page's ``result-stats`` element, e.g.
        ``1410000000`` for 'About 1,410,000,000 results'.

    Raises
    ------
    requests.RequestException
        If the request fails, times out or returns an HTTP error status.
    ValueError
        If the page has no ``result-stats`` element or it holds no digits
        (for instance when a consent or block page is served instead).
    """
    result = requests.get(keyword, headers=user_agent, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to parse an error page
    result.raise_for_status()
    soup = BeautifulSoup(result.content, 'html.parser')

    stats = soup.find("div", {"id": "result-stats"})
    if stats is None:
        raise ValueError("No 'result-stats' element found at {}".format(keyword))

    # First direct text node of the element, e.g. 'About 1,410,000,000 results';
    # recursive=False skips text nested in child tags
    total_results_text = stats.find(string=True, recursive=False)

    # Keep only the digits (drops words and thousands separators)
    digits = ''.join(ch for ch in (total_results_text or '') if ch.isdigit())
    if not digits:
        raise ValueError("No results count found in {!r} at {}".format(total_results_text, keyword))

    return int(digits)

def df_build_results_count(keyword_list, user_agent, url="https://www.google.com/search?q="):
    """Query every keyword and collect the result counts in a DataFrame.

    Parameters
    ----------
    keyword_list : list of str
        Search phrases to look up.
    user_agent : dict
        Forwarded to ``get_results_count`` for each request.
    url : str, optional
        Base search URL; forwarded to ``get_search_urls`` so that a custom
        value is actually used when building the queries.

    Returns
    -------
    pandas.DataFrame
        One row per keyword with the columns ``keyword``, ``results_count``,
        ``search_url`` and ``query_timestamp``. The timestamp is taken once,
        after all requests have finished, and is the same on every row.
    """
    search_urls = get_search_urls(keyword_list, url=url)
    # Distinct loop name so the `url` parameter is not shadowed
    result_count = [get_results_count(search_url, user_agent) for search_url in search_urls]
    timestamp = datetime.now()

    df = pd.DataFrame({'keyword': keyword_list,
                       'results_count': result_count,
                       'search_url': search_urls,
                       'query_timestamp': timestamp})

    return df

# Look up every configured keyword and keep the counts in a single DataFrame;
# GOOGLE_URL from settings.yml is passed as the base search URL.
df = df_build_results_count(KEYWORDS, USER_AGENT, url=GOOGLE_URL)

# Dump

### google-search

https://github.com/anthonyhseb/googlesearch

In [None]:
from googlesearch.googlesearch import GoogleSearch

# Search for a single keyword with the google-search package
response = GoogleSearch().search("ESG", num_results = 1000)

for result in response.results:
    print("Title: " + result.title)

# `total` is an attribute of the response, not of each result, and it is a
# number, so it is formatted rather than concatenated to a string.
print("Count: {}".format(response.total))

In [67]:
# One row per search result, one column per instance attribute of the result object
pd.DataFrame([vars(result) for result in response.results])

Unnamed: 0,title,url,_SearchResult__text,_SearchResult__markup
0,"Environmental, social and corporate governance...","https://en.wikipedia.org/wiki/Environmental,_s...","\n\n\n\n\n\n\n\n\nEnvironmental, social and co...","b'<!DOCTYPE html>\n<html class=""client-nojs"" l..."
1,"Environmental, Social, and Governance (ESG) Cr...",https://www.investopedia.com/terms/e/environme...,\n\n\n\n\n\n\n \n\n \n\n \n\n \n\n\n\n\n\n\n \...,"b'<!DOCTYPE html>\n<html id=""termscTemplate_1-..."
2,ESG framework | McKinsey,https://www.mckinsey.com/business-functions/st...,\n\n\n\n\n\n\n\nWe use cookies essential for t...,"b'<!DOCTYPE html>\r\n<html dir="""" lang=""en""\r\..."
3,Startseite - ESG - Evangelische Studierendenge...,https://esgberlin.de/,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"b'<!DOCTYPE html>\n<html lang=""de"">\n<head>\n<..."
4,ESG Investing and Analysis - CFA Institute,https://www.cfainstitute.org/en/research/esg-i...,"\n\n\n\n\n\n\nWe’re using cookies, but you can...","b'\r\n<!DOCTYPE html>\r\n<html lang=""en-US"" xm..."
5,What is ESG - MSCI,https://www.msci.com/what-is-esg,\n\n\n \n\nNavigation\n\nSkip to Content\n\n\...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
6,ESG-Kriterien • Definition | Gabler Wirtschaft...,https://wirtschaftslexikon.gabler.de/definitio...,\n\n\n\n Direkt zum Inhalt\n \nGoogle ...,"b'<!DOCTYPE html>\n<html lang=""de"" dir=""ltr"" x..."
7,Definition: Was bedeutet ESG (Environmental So...,https://www.euramco-asset.de/glossar/environme...,\n\n\n ​\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"b'<!DOCTYPE html>\n<html lang=""de"">\n<head>\n ..."
8,The Remarkable Rise Of ESG - Forbes,https://www.forbes.com/sites/georgkell/2018/07...,ExploreBillionairesAll BillionairesWorld's Bil...,"b'<!DOCTYPE html><html lang=""en""><head><title>..."


In [71]:
# Total stored on the search response as a whole (not on the individual results)
response.total

78

## Future extensions

#### Result texts: create a content analysis



In [63]:
# Text of the first search result -- presumably the scraped page text; TODO confirm.
# Candidate input for the content analysis mentioned in the header above.
response.results[0].get_text()

