# Scraping Project

## Connecting to Google Trends
This code connects to Google Trends and defines a function that converts search terms to topic ids, so that a topic search is performed whenever possible.

In [2]:
#These lines connect to Google Trends
from pytrends.request import TrendReq
import pandas as pd
import re
from itertools import product
import requests
import time

#Creates the pytrends client that the functions in this notebook use for every request.
#hl='en-US' is passed as the host-language setting.
pytrends = TrendReq(hl='en-US')
#function takes as input a keyword and returns topic id
def getTopicID(word):
    """Return the topic id ("mid") that matches ``word``, or ``word`` itself.

    The suggestions returned by ``pytrends.suggestions(word)`` are scanned in
    order. The first suggestion of type "Topic" whose title equals ``word``
    (case-insensitively), or whose title followed by one of the suffixes
    "s", "es" or "os" equals ``word``, supplies the id.

    Parameters:
        word: the search term to look up.

    Returns:
        The matching suggestion's "mid" value, or ``word`` unchanged when no
        suggestion matches, so the caller falls back to a plain term search.
    """
    target = word.lower()
    for suggestion in pytrends.suggestions(word):
        if suggestion.get("type") != "Topic":
            continue
        #a suggestion without a title cannot be compared, so it is skipped
        title = (suggestion.get("title") or "").lower()
        if not title:
            continue
        #the title is escaped because it is data, not a regex, and fullmatch
        #keeps the suffix rule from accepting arbitrary longer words
        plural_pattern = re.escape(title) + "(s|es|os)"
        if title == target or re.fullmatch(plural_pattern, target):
            return suggestion.get("mid")
    #no topic matched: return the term itself
    return word
     

Here we choose the payload parameters for the request sent to the server. The payload takes five inputs, mirroring the options available on the Google Trends website. The request is wrapped in a function that builds a single table covering all topics and comparisons.

In [4]:
#function to get interest over time depending on the region
def get_interest_over_time(kw_list, kw_comp_list, timeframes, ct, geo, gprop, sleep_seconds=0):
    """Build one table of interest over time for a single location.

    Every keyword is first converted with ``getTopicID``. One request is then
    sent for each keyword in ``kw_list`` on its own, and one for each
    (comparison keyword, keyword) pair.

    Column naming:
        * single-keyword request: the column is named after the keyword;
        * pair request: each of the two columns is named
          ``<own keyword>_<other keyword>``.

    Parameters:
        kw_list: keywords to query individually.
        kw_comp_list: keywords that each entry of ``kw_list`` is paired with.
        timeframes: timeframe string passed to ``build_payload``.
        ct: category passed to ``build_payload`` as ``cat``.
        geo: location code passed to ``build_payload``.
        gprop: property filter passed to ``build_payload``.
        sleep_seconds: pause after each request; the default of 0 keeps the
            previous behaviour.

    Returns:
        A DataFrame with one column per series. A DatetimeIndex is converted
        to monthly periods. The frame is empty when no request returned data.
    """
    #converts kw to ids to do topic look up
    topicID_list = [getTopicID(kw) for kw in kw_list]
    comp_topicID = [getTopicID(kw) for kw in kw_comp_list]
    #keyword dictionary used to rename columns
    #keys are ids, values are keywords
    kw_dict = dict(zip(topicID_list, kw_list))
    kw_dict.update(zip(comp_topicID, kw_comp_list))
    #each query is a list of one id (single) or two ids (comparison pair)
    queries = [[topic] for topic in topicID_list]
    queries += [list(pair) for pair in product(comp_topicID, topicID_list)]
    frames = []
    for term in queries:
        try:
            pytrends.build_payload(
                kw_list=term,
                cat=ct,
                timeframe=timeframes,
                geo=geo,
                gprop=gprop,
            )
            partial_data = pytrends.interest_over_time() #data per query
        except requests.exceptions.Timeout:
            #skip this query: reading data after a failed payload would
            #return the result of the previous query instead
            print("Timeout ocurred")
            continue
        finally:
            #wait before sending the next request
            #for a larger number of requests it should wait 60s
            time.sleep(sleep_seconds)
        #an empty result has no "isPartial" column, hence errors="ignore"
        partial_data = partial_data.drop(columns="isPartial", errors="ignore")
        if partial_data.empty:
            continue
        #ids -> keywords
        partial_data = partial_data.rename(columns=kw_dict)
        if len(partial_data.columns) > 1:
            first, second = partial_data.columns[0], partial_data.columns[1]
            partial_data = partial_data.rename(
                columns={first: first + "_" + second, second: second + "_" + first}
            )
        frames.append(partial_data)
    if not frames:
        return pd.DataFrame()
    objective_df = pd.concat(frames, axis=1)
    #changes formatting of date
    if isinstance(objective_df.index, pd.DatetimeIndex):
        objective_df.index = objective_df.index.to_period("M")
    return objective_df
#Search parameters for the Italy requests; they are passed as arguments
#to the functions called in the cells below.

#keywords whose interest is tracked
kw_list = [
    "Global warming",
    "climate change",
    "greenhouse gas",
    "renewable energy",
    "sustainability",
    "Climate disaster",
    "green energy",
    "green investment",
    "green production",
]
#keywords each entry of kw_list is compared against
kw_comp_list = [
    "Job",
    "health",
    "education",
    "drugs",
]
#time frame, written as "<start date> <end date>"
timeframes = '2010-01-01 2021-12-31'
ct = 0      #category: 0 means all categories
geo = 'IT'  #geographical location: Italy
gprop = ''  #empty string means web search


Gets national data for Italy.

In [None]:
#first request uses whatever value `timeframes` holds when this cell runs
#(the parameters cell sets it to 2010 - 2021)
italy_2010_monthly_df = get_interest_over_time(kw_list, kw_comp_list, timeframes, ct, geo, gprop)
#italy monthly 2011 - 2021
#NOTE: this rebinds the global `timeframes`; re-running this cell on its own
#would therefore request 2011 - 2021 for both frames
timeframes = '2011-01-01 2021-12-31' 
italy_2011_monthly_df = get_interest_over_time(kw_list, kw_comp_list, timeframes, ct, geo, gprop)



Now that the data frames have been created, write them to CSV files.

In [None]:
#write the national monthly frames to csv files, one file per start year
#index=False is not passed, so the frame's index is written as the first column
italy_2010_monthly_df.to_csv("italy_2010_monthly_interest_over_time.csv")
italy_2011_monthly_df.to_csv("italy_2011_monthly_interest_over_time.csv")


Creates quarterly data

In [None]:
def to_quartertly(df):
    """Average a frame by quarter.

    The rows of ``df`` are grouped on its index at quarterly frequency and
    the mean of every column is taken within each quarter.

    Parameters:
        df: frame whose index can be grouped by ``pd.Grouper(freq='Q')``.

    Returns:
        A new frame with one row per quarter; the quarter is moved out of the
        index into a regular column by ``reset_index()``.
    """
    quarter_key = pd.Grouper(freq='Q')
    quarterly_means = df.groupby([quarter_key]).mean()
    return quarterly_means.reset_index()
#quarterly averages of the two national monthly frames
italy_2010_quarterly_df = to_quartertly(italy_2010_monthly_df)
italy_2011_quarterly_df = to_quartertly(italy_2011_monthly_df)


Write quarterly data into csv file.

In [None]:
#index=False: to_quartertly already moved the quarter into a regular column,
#so the remaining index is not written
italy_2010_quarterly_df.to_csv("italy_2010_quarterly_interest_over_time.csv", index=False)
italy_2011_quarterly_df.to_csv("italy_2011_quarterly_interest_over_time.csv", index=False)

Now we get the same data for each region.

In [None]:
def get_regional_data(kw_list, kw_comp_list, timeframes, ct, geo, gprop):
    """Collect interest over time for every region reported for ``geo``.

    The region list is obtained with one ``interest_by_region`` request for
    the first keyword. ``get_interest_over_time`` is then called once per
    region code and the results are stacked row-wise.

    Parameters:
        kw_list: keywords to query; the first one is used to list the regions.
        kw_comp_list: comparison keywords forwarded to ``get_interest_over_time``.
        timeframes: timeframe string passed to every request.
        ct: category passed to ``build_payload`` as ``cat``.
        geo: location whose regions are listed.
        gprop: property filter passed to every request.

    Returns:
        A DataFrame holding the rows of every region, with a leading "Region"
        column containing the index label of that region in the region table.
        The frame is empty when no region is returned.
    """
    #get regions
    pytrends.build_payload(
        kw_list=[kw_list[0]],
        cat=ct,
        timeframe=timeframes,
        geo=geo,
        gprop=gprop,
    )
    #returns region ISO code
    regions = pytrends.interest_by_region(inc_geo_code=True)
    #extract region ISO code from df; the Series index holds the region labels
    geos = regions.geoCode
    #gets interest over time for each region and then stacks them together
    regional_frames = []
    for region_name, region_code in geos.items():
        region_data = get_interest_over_time(kw_list, kw_comp_list, timeframes, ct, region_code, gprop)
        region_data.insert(loc=0, column="Region", value=region_name)
        regional_frames.append(region_data)
    #pd.concat rejects an empty list, so keep returning an empty frame
    if not regional_frames:
        return pd.DataFrame()
    #DataFrame.append was removed in pandas 2.0; one concat gives the same rows
    return pd.concat(regional_frames)


Interest over time for every Italian region, starting from 2010.

In [None]:
#set the timeframe back to 2010 - 2021 (an earlier cell rebinds it to 2011 - 2021)
timeframes = '2010-01-01 2021-12-31' 
#regional monthly data: one block of rows per region
italy_region_2010_monthly_interest_over_time = get_regional_data(kw_list, kw_comp_list, timeframes, ct, geo, gprop)


Swaps columns to place region before date.

In [None]:
#work on a copy and add the index values as a 'date' column at position 1,
#i.e. right after the "Region" column that get_regional_data inserts at position 0
copy_italy_region_2010 = italy_region_2010_monthly_interest_over_time.copy()
copy_italy_region_2010.insert(1, 'date', copy_italy_region_2010.index)
#reset_index returns a new frame that is not assigned, only shown as the cell output;
#copy_italy_region_2010 keeps its original index
copy_italy_region_2010.reset_index(drop = True)



In [None]:
#writes result to csv file
#index=False leaves the index out; the dates are still written through the 'date' column
copy_italy_region_2010.to_csv("italy_region_2010_monthly_interest_over_time.csv", index=False)

In [None]:
#rebinds the global timeframe to 2011 - 2021 for the second regional request
timeframes = '2011-01-01 2021-12-31' 
#regional monthly data: one block of rows per region
italy_region_2011_monthly_interest_over_time = get_regional_data(kw_list, kw_comp_list, timeframes, ct, geo, gprop)

Swaps columns to place date after region.

In [None]:
#work on a copy so the frame returned by get_regional_data stays unchanged
copy_italy_region_2011 = italy_region_2011_monthly_interest_over_time.copy()
copy_italy_region_2011.insert(1, 'date', copy_italy_region_2011.index) #inserts date column at index 1
#reset_index returns a new frame that is not assigned, only shown as the cell output;
#copy_italy_region_2011 keeps its original index
copy_italy_region_2011.reset_index(drop = True)


In [None]:
#writes result to csv file
#index=False leaves the index out; the dates are still written through the 'date' column
copy_italy_region_2011.to_csv("italy_region_2011_monthly_interest_over_time.csv", index=False)

Convert data to quarterly. 

In [None]:
#converts data to quarter and writes in csv file
def to_quarter(df):
    """Average a regional frame by region and quarter.

    Rows are grouped by the "Region" column together with the frame's index
    at quarterly frequency, and the numeric columns are averaged per group.

    Parameters:
        df: frame with a "Region" column and an index that can be grouped by
            ``pd.Grouper(freq='Q')``.

    Returns:
        A new frame with one row per (region, quarter); both grouping keys are
        moved into regular columns by ``reset_index()``.
    """
    grouper = df.groupby(["Region", pd.Grouper(freq='Q')])
    #numeric_only=True: the callers add a non-numeric 'date' column, which
    #cannot be averaged and makes a plain mean() raise on pandas 2.x
    region_df_quarter = grouper.mean(numeric_only=True).reset_index()
    return region_df_quarter
#quarterly averages per region, written straight to csv
#index=False: to_quarter already moved the grouping keys into regular columns
to_quarter(copy_italy_region_2010).to_csv("italy_region_2010_quarterly_interest_over_time.csv", index=False)
to_quarter(copy_italy_region_2011).to_csv("italy_region_2011_quarterly_interest_over_time.csv", index=False)

## Notes


In [None]:
#Rebuilds, at notebook level, the id -> keyword mapping that get_interest_over_time
#builds internally, so the cells below can report how each keyword was searched.
#getTopicID calls pytrends.suggestions, so this cell performs one lookup per keyword.
topicID_list = [getTopicID(kw) for kw in kw_list]
comp_topicID = [getTopicID(kw) for kw in kw_comp_list] 
#keyword dictionary used to rename columns
#keys are ids, values are keywords
#(getTopicID returns the keyword itself when no topic matches, so such a key equals its value)
kw_dict = dict(zip(topicID_list, kw_list))
kw_dict.update(dict(zip(comp_topicID, kw_comp_list)))



Term search was done on these:

In [9]:
#a key that equals its own value was never replaced by a topic id,
#so that keyword was searched as a plain term
for key in kw_dict.keys():
    if kw_dict[key] != key:
        continue
    print(key)

Climate disaster
green investment
green production


Topic search was done on these terms:

In [11]:
#a key that differs from its value is a topic id,
#so the keyword stored under it was searched as a topic
for key in kw_dict.keys():
    if kw_dict[key] == key:
        continue
    print(kw_dict[key])

Global warming
climate change
greenhouse gas
renewable energy
sustainability
green energy
Job
health
education
drugs
