# Data exploration

Questions to ask: 

1. How do values distribute for the main variable *search_interest*?
1. What are keywords with high search interest? 
2. What is the average search interest ...
    1. for a keyword?
    1. for a keyword that has at least 1 entry > 0?
    1. for a keyword that has at least 1 entry > 50?
    1. for a keyword that has at least 1 entry == 100?
3. How is search interest ...
    1. correlated with positive or negative ESG classification?
    1. correlated and distributed across industries?
    1. correlated and distributed across ESG classification (positive/negative)?

In [6]:
import seaborn as sns 
sns.set_context('talk')
sns.set_style('whitegrid')

import pandas as pd
import numpy as np
import boto3
import s3fs

import matplotlib.pyplot as plt
%matplotlib inline

# import helper_functions.py
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../src/data')
import helper_functions as h

# set configuration
conf = {'region_name': 'us-west-2',
        'bucket_name': 'esg-analytics',
        'input_prefix': 'raw/',
        'output_prefix': 'processed/',
        'select_files': ['20201017-191627gtrends_preprocessed.csv',
                         '20201017-191627gtrends_metadata.csv'],
        'export_files': ['search_interest',
                         'search_interest_meta',
                         'missing_search_interest']}

s3 = boto3.resource('s3')
# read the bucket name from conf so the listing and the s3:// read paths cannot diverge
bucket = s3.Bucket(conf['bucket_name'])

# collect every object with .csv format stored under output_prefix
load_files = [obj.key
              for obj in bucket.objects.filter(Prefix=conf['output_prefix'])
              if obj.key.endswith('.csv')]

print('LOAD FILES\n'+'-'*40, *load_files, sep='\n')


def read_export(export_name):
    """Load all .csv parts of one export folder below output_prefix

    Files are selected by folder name rather than by their position in the
    listing, so the result does not depend on the order in which S3 returns
    the keys or on how many part files an export was split into.

    :param export_name: string with the folder name directly below conf['output_prefix']
    :return : DataFrame with the rows of every part file, indexed 0..n-1
    :raises FileNotFoundError: if no .csv file was listed for the export
    """
    # the trailing slash keeps 'search_interest' from matching 'search_interest_meta'
    folder = f"{conf['output_prefix']}{export_name}/"
    part_keys = [key for key in load_files if key.startswith(folder)]
    if not part_keys:
        raise FileNotFoundError(
            f"no .csv file found under s3://{conf['bucket_name']}/{folder}")

    parts = [pd.read_csv(f"s3://{conf['bucket_name']}/{key}") for key in part_keys]
    return pd.concat(parts, ignore_index=True)


df_missing = read_export('missing_search_interest')
df = read_export('search_interest')
df_meta = read_export('search_interest_meta')

LOAD FILES
----------------------------------------
processed/missing_search_interest/part-00000-7907bd4a-f8a5-412d-b638-4c9efcc6be19-c000.csv
processed/search_interest/part-00000-a119d364-b63e-47a0-b874-5e7771e8bbe8-c000.csv
processed/search_interest_meta/part-00000-3a16f3f1-266e-4037-98c4-e8837529daf3-c000.csv


# Data Quality checks (with Great Expectations) 

In [9]:
import great_expectations as ge

# wrap each pandas frame as a Great Expectations dataset
ge_df, ge_meta, ge_missing = (
    ge.from_pandas(frame) for frame in (df, df_meta, df_missing)
)

In [25]:
# schema check: the table must contain exactly these columns, in this order
ge_df.expect_table_columns_to_match_ordered_list(['date', 'keyword', 'search_interest'])
# range check on search_interest; being the last expression, its result is the cell output
# NOTE(review): max_value=99 flags every search_interest of 100 as unexpected (see the
# failing result below), while later cells filter on search_interest == 100 as the
# maximum - confirm whether the upper bound was meant to be 100
ge_df.expect_column_values_to_be_between('search_interest', min_value=0, max_value=99)

{
  "meta": {},
  "success": false,
  "result": {
    "element_count": 998325,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 252,
    "unexpected_percent": 0.025242280820374125,
    "unexpected_percent_nonmissing": 0.025242280820374125,
    "partial_unexpected_list": [
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100,
      100
    ]
  },
  "exception_info": null
}

In [26]:
# show the expectations collected on ge_df so far; discard_failed_expectations=False
# keeps the ones that did not pass (the failed range check is listed in the output)
ge_df.get_expectation_suite(discard_failed_expectations=False)

{
  "meta": {
    "great_expectations_version": "0.12.9"
  },
  "data_asset_type": "Dataset",
  "expectations": [
    {
      "meta": {},
      "expectation_type": "expect_table_columns_to_match_ordered_list",
      "kwargs": {
        "column_list": [
          "date",
          "keyword",
          "search_interest"
        ]
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "search_interest",
        "min_value": 0,
        "max_value": 99
      }
    }
  ],
  "expectation_suite_name": "default"
}

## Merge of the API query and its metadata

Each row in `metadata` (*df_meta*) describes one keyword. In contrast, `gtrends` (*df*) holds the weekly search interest per keyword, so every keyword repeats across dates. To merge the two, each keyword's metadata row therefore has to be replicated once per unique date of that keyword, i.e. $261$ times.

In [None]:
def join_query_meta(df_query, df_meta, id_col):
    """Attach the query's input metadata to every row of the query results

    Both frames are re-indexed by ``id_col`` and left-joined, so each row of
    df_query is kept and receives the df_meta columns of its matching id.

    :param df_query: pandas DataFrame with query results
    :param df_meta: pandas DataFrame with the input data of the query
    :param id_col: string naming the identifying column shared by both dataframes
    :return : DataFrame of the joined datasets, with the index reset
    """
    # key both sides by the shared identifier
    query_by_id = df_query.set_index(id_col)
    meta_by_id = df_meta.set_index(id_col)

    # the left join keeps every query row
    joined = query_by_id.join(meta_by_id, on=id_col, how='left')

    return joined.reset_index()

# attach the metadata of its keyword to every search-interest row
df_all = join_query_meta(df_query=df, df_meta=df_meta, id_col='keyword')

# presumably writes df_all to ../data/processed/merged_gtrends_meta.csv - verify against helper_functions
h.make_csv(df_all, 'merged_gtrends_meta.csv', '../data/processed', header=True)

# check correct storage
# NOTE(review): this rebinds `df` from the raw query results to the merged file read back
# from disk; the cells below use its metadata columns (firm_name_processed, positive), and
# re-running this cell would pass the merged frame into join_query_meta - confirm intended
df = pd.read_csv('../data/processed/merged_gtrends_meta.csv')

## Distribution of search_interest

In [None]:
# (values to plot, figure title): all rows first, then only rows with interest above zero
distributions = [
    (df.search_interest, 'Search interest'),
    (df.search_interest[df.search_interest > 0], 'Search interest > 0'),
]

for values, title in distributions:
    g = sns.displot(values)
    # displot returns a grid; label its first (only) axes
    ax = g.axes.flatten()[0]
    ax.set_title(title)
    ax.set_xlabel('Search interest across all dates')
    plt.show()

In [None]:
# central tendency of search interest over all keywords and dates
interest = df['search_interest']
avg_search_interest = interest.mean()
median_search_interest = interest.median()

print("Search interest", '-'*40, sep='\n')
print(f'Average: {avg_search_interest} \nMedian: {median_search_interest}')

## Keywords and firms with highest search interest

highest average per keyword 

In [None]:
top_n = 10

# Average only the search_interest column. Calling .mean() on the whole grouped frame
# also tries to average the non-numeric columns (e.g. date), which raises a TypeError
# in pandas >= 2.0 and does needless work in older versions.
top_keywords = (
    df.groupby("keyword")
    .search_interest
    .mean()
    .sort_values(ascending=False)
    .head(top_n)
)
print(f"{top_n} highest average search interest per keyword:\n", top_keywords)

Most searched firms

In [None]:
# rank firms by their mean search interest over all rows
firm_means = df.groupby('firm_name_processed').search_interest.mean()
top_firms = firm_means.sort_values(ascending=False)[:top_n]
print(f"{top_n} highest average search interest per firm:\n", top_firms)

Search interest == 100 across whole timespan

In [None]:
# per keyword, count the rows on which search interest equals 100
peak_rows = df[df.search_interest == 100]
peak_counts = peak_rows.keyword.value_counts()
print("How often search interest reached maximum across the whole timespan:\n", peak_counts[:5])

What has been searched most recently?

In [None]:
# the dates of the last five rows define the "recent" window
# (assumes the final rows of df carry the latest dates - TODO confirm row order)
date_recent = df.date.tail().values
first_date, last_date = date_recent[0], date_recent[-1]

# filter once, reuse for both rankings
recent_rows = df[df.date.isin(date_recent)]

print("Recent average search activity\n", '-'*40)

# (headline, grouping column) of each ranking, in print order
rankings = [
    ("What keywords have been searched recently from {} to {}:", 'keyword'),
    ("Which firms have been searched recently from {} to {}:", 'firm_name_processed'),
]

for position, (headline, group_col) in enumerate(rankings):
    if position > 0:
        # separator between consecutive rankings
        print('-'*40)
    print(headline.format(first_date, last_date))
    recent_means = recent_rows.groupby(group_col).search_interest.mean()
    print(recent_means.sort_values(ascending=False)[:top_n])

## Correlation

In [None]:
print("correlation between search interest and ESG classification (positive=1/negative=0)")
# Correlate only the two columns of interest: df.corr() builds the full correlation
# matrix and, from pandas 2.0 on, raises on the frame's non-numeric columns (e.g. keyword).
# The cast makes the 1/0 flag numeric regardless of how it was parsed from the csv.
esg_corr = df['search_interest'].corr(df['positive'].astype(float))
print(round(esg_corr, 3))