In [30]:
import pandas as pd
import numpy as np
import os


# import helper_functions.py
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../src/data')
import helper_functions as h

In [70]:
# load metadata and googletrends search interest
metadata_files = h.get_files('../data/raw', name_contains='*metadata*')
gtrends_files = h.get_files('../data/raw', name_contains='*gtrends*')

gtrends_column_names = ['date', 'keyword', 'search_interest']

list_metadata_df = [pd.read_csv(i) for i in metadata_files]
list_gtrends_df = [pd.read_csv(i, header=None, names=gtrends_column_names) for i in gtrends_files]

# consider only first query
df_metadata = list_metadata_df[0]
df = list_gtrends_df[0]

# Data preprocessing

The metadata and Google Trends query data will be used to do some checks and preprocess the data. The metadata serves as input for the query. Therefore, I use it as a ground truth to validate what the query returns since its output potentially suffers from connection errors or other exceptions.

The first step before preprocessing is testing the input data (*df_metadata*) against the output of the Google Trends API (*df*). This is called **validation** and compares how the ideal data should look like as defined through `metadata` and what the query returned in `gtrends` data. 

After **validation** comes **preprocessing** where we obtain a clean dataset.

## Validation

Validating the data centers around what the query received as input (a batch of five keywords stored in `metadata`) and what it returned. The `metadata` serves as a baseline and allows to compare how many rows there *should* be or how many unique keywords exist. 

### 1. How many keywords are there in the metadata and in total?

In [71]:
print(df_metadata.columns,'\n')

n_keywords_meta = df_metadata.keyword.nunique()
n_keywords_actual = df.keyword.nunique()
n_keywords_delta = n_keywords_meta - n_keywords_actual

print("{} unqiue keywords in metadata".format(n_keywords_meta))
print("{} unqiue keywords in actual data".format(n_keywords_actual))
print("==> {} keywords have no search interest data".format(n_keywords_delta))

Index(['topic', 'positive', 'date_define_topic', 'ticker', 'firm_name_raw',
       'sector', 'firm_name_processed', 'date_get_firmname', 'keyword',
       'date_construct_keyword', 'date_query_googletrends'],
      dtype='object') 

3835 unqiue keywords in metadata
3825 unqiue keywords in actual data
==> 10 keywords have no search interest data


#### List the 10 keywords that have no search interest data 
Calculate the set difference between the series with `pd.concat([df1,df2,df2])` and applying `drop_duplicates(keep=False)`

In [72]:
print(pd.concat([df_metadata.keyword, df.keyword, df.keyword]).drop_duplicates(keep=False))

1125         lawsuit Alexion Pharmaceuticals
1126          unfair Alexion Pharmaceuticals
1127             bad Alexion Pharmaceuticals
1128         problem Alexion Pharmaceuticals
1129            hate Alexion Pharmaceuticals
1975            issue American International
1976    controversial American International
1977           strike American International
1978             scam American International
1979          trouble American International
Name: keyword, dtype: object


### 2. How many rows does the `gtrends` data have ideally vs. actually?

In [73]:
entries_per_keyword = df.groupby('keyword').count().date.max()
n_ideal = n_keywords_meta * entries_per_keyword
n_actual = df.shape[0]
print("Each query returns {} entries per keyword".format(entries_per_keyword))
print("Ideal: {} * {} = {} rows. Actual: {}. ".format(entries_per_keyword, n_keywords_meta, n_ideal, n_actual))
print("==> Delta of {} rows (ideal-actual)".format(n_ideal-n_actual))

Each query returns 261 entries per keyword
Ideal: 261 * 3835 = 1000935 rows. Actual: 998325. 
==> Delta of 2610 rows (ideal-actual)


### 3. Does each keyword have the same number of entries?

In [74]:
entries_per_keyword_actual = df.groupby('keyword').count().date.value_counts().index[0]
print(f"Actual nr of entries per keyword: {entries_per_keyword_actual}")
if entries_per_keyword_actual == entries_per_keyword:
    print("Success: All keywords have the same number of entries")
else:
    print("Error: There is an issue with one of the keywords")

Actual nr of entries per keyword: 261
Success: All keywords have the same number of entries


## Preprocessing

Preprocessing continues in two steps. First, the date entries where 0 search interest was returned should be replaced with a correct date series of a query with positive search interest. Secondly, a test merge of  `gtrends` and `metadata` is done to ensure the datasets can be joined. 

### 1. for the *date* column replace 0 with the time series

In [75]:
def series_duplicate(s, times=2, axis=0, reset_index=True):
    """Create a dataframe by repeating series multiple times
    
    :param s: pandas series
    :param times: how often the series should be repeated 
    :param axis: repeat over rows (axis=0) or over columns (=1)
    :param reset_index: reset index to consecutive integer
    
    :return : pandas DataFrame or Series with repeated Series from args 
    """
    df_dup = pd.concat([s] * times, axis=axis)
    
    if reset_index:
        df_dup = df_dup.reset_index(drop=True)
    
    return df_dup 

# sequence of dates were inserted as placeholder, use .isin() to get them
date_sequence = [str(i) for i in range(261)]
date_sequence.append('0.0')

# get date series of a successful query
date_complete = pd.Series(df.date[~df.date.isin(date_sequence)].unique())

# for each keyword with date series == 0.0, replace series with date_complete
# simply expand date_complete to match n_rows of df with date == '0.0'

df_zerodate = df[df.date.isin(date_sequence)]
n_repeat = df_zerodate.keyword.unique().shape[0]

date_duplicate_series = series_duplicate(date_complete, n_repeat)
# ensure same index as df_zerodate
date_duplicate_series.index = df_zerodate.index
# replace date column
df_zerodate['date'] = date_duplicate_series
# insert back to original df
df.loc[df_zerodate.index,:] = df_zerodate

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [76]:
# unique dates should be 261
print("Unique dates:",df.date.nunique())

Unique dates: 261


### 2. take a sample and make a test merge of the main dataset and its metadata.  

Each row in `metadata` (*df_metadata*) contains a keyword. In contrast, each row in `gtrends` (*df*) contains search interest per week for a keyword which repeats across dates. Thus, we have to populate `metadata` as many times as there are unique dates for each keyword which is $261$.

To reduce computational expense, I take a sample of 10 keywords which corresponts to $10*261=2610$ rows in `gtrends`.

In [9]:
n_keywords = 10
n_dates = 261

sample = n_keywords * n_dates

df_sample = df.iloc[:sample,:].set_index('keyword')
df_metadata_sample = df_metadata.iloc[:n_keywords,:].set_index('keyword')

# join df and df_meta 
df_all_sample = df_sample.join(df_metadata_sample, on='keyword', how='left').reset_index()
df_all_sample.head(2)

Unnamed: 0,keyword,date,search_interest,topic,positive,date_define_topic,ticker,firm_name_raw,sector,firm_name_processed,date_get_firmname,date_construct_keyword,date_query_googletrends
0,scandal 3M,2015-10-18,0.0,scandal,0,2020-10-17,MMM,3M Company,Industrials,3M,2020-10-17,2020-10-17,2020-10-17
1,scandal 3M,2015-10-25,35.0,scandal,0,2020-10-17,MMM,3M Company,Industrials,3M,2020-10-17,2020-10-17,2020-10-17


Now, we can pack it into a function and apply it to the full dataset.

In [10]:
def join_query_meta(df_query, df_meta, id_col):
    """Left join of df_query on df_meta, where df_meta is the input to the query
    
    :param df_query: pandas DataFrame with query results
    :param df_meta: pandas DataFrame with input data for query
    :param id_col: string that specifies the identifying column common to both dataframes
    :return : DataFrame of joined datasets
    """
    # take id as index for both
    df_query_idcol = df_query.set_index(id_col)
    df_meta_idcol = df_meta.set_index(id_col)
    
    # join query and meta
    df_joined = df_query_idcol.join(df_meta_idcol, on=id_col, how='left').reset_index()
    
    return df_joined

In [11]:
df_all = join_query_meta(df, df_metadata, id_col='keyword')

In [12]:
h.make_csv(df_all, 'merged_gtrends_meta.csv', '../data/processed', header=True)

Path created: ../data/processed/merged_gtrends_meta.csv


In [13]:
# check correct storage
df = pd.read_csv('../data/processed/merged_gtrends_meta.csv')

### Final check

In [16]:
h.inspect_core_specifications(df, descriptives=True)
h.inspect_missings(df)

----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998325 entries, 0 to 998324
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   keyword                  998325 non-null  object 
 1   date                     998325 non-null  object 
 2   search_interest          998325 non-null  float64
 3   topic                    998325 non-null  object 
 4   positive                 998325 non-null  int64  
 5   date_define_topic        998325 non-null  object 
 6   ticker                   998325 non-null  object 
 7   firm_name_raw            998325 non-null  object 
 8   sector                   998325 non-null  object 
 9   firm_name_processed      998325 non-null  object 
 10  date_get_firmname        998325 non-null  object 
 11  date_construct_keyword   998325 non-null  object 
 12  date_query_googletrends  998325 non-null  object 
dtypes: float64(1), int

UnboundLocalError: local variable 'ds_colmiss' referenced before assignment

# Helper functions for data validation and preprocessing

*Note*: Other helper functions accessed by `h.` reside in `../src/data/`

In [122]:
def series_set_difference(ds1,ds2):
    """Obtain the set difference of two Series
    
    :param ds1: Pandas Series 1
    :param ds2: Pandas Series 2
    :return : set_difference
    """
    
    set_difference = pd.concat([ds1, ds2, ds2]).drop_duplicates(keep=False)
    
    return set_difference

def series_duplicate(s, times=2, axis=0, reset_index=True):
    """Create a dataframe by repeating series multiple times
    
    :param s: pandas series
    :param times: how often the series should be repeated 
    :param axis: repeat over rows (axis=0) or over columns (=1)
    :param reset_index: reset index to consecutive integer
    
    :return : pandas DataFrame or Series with repeated Series from args 
    """
    df_dup = pd.concat([s] * times, axis=axis)
    
    if reset_index:
        df_dup = df_dup.reset_index(drop=True)
    
    return df_dup 

In [72]:
series_set_difference(df_metadata.keyword, df.keyword)

1125         lawsuit Alexion Pharmaceuticals
1126          unfair Alexion Pharmaceuticals
1127             bad Alexion Pharmaceuticals
1128         problem Alexion Pharmaceuticals
1129            hate Alexion Pharmaceuticals
1975            issue American International
1976    controversial American International
1977           strike American International
1978             scam American International
1979          trouble American International
Name: keyword, dtype: object