### Import the required libraries

In [32]:
import os
import time

import numpy as np
import pandas as pd
from pytrends.request import TrendReq

# Google Trends client shared by every request made in the retrieval loop.
# hl is the host language; tz is the timezone offset in minutes
# (360 corresponds to US CST per the pytrends README — verify for your version).
pytrends = TrendReq(hl='en-US', tz=360)

### Settings to adjust for each retrieval run

In [33]:
# Keywords sent with every request; here a single empty string
# (presumably so results reflect the category alone — confirm).
kw_list = ['']


def date_range(year):
    """Return the timeframe string spanning January 1 to December 31 of `year`."""
    return f'{year}-01-01 {year}-12-31'


# Category ids passed as `cat` to each Google Trends request.
cats = [47, 16]

# Metro-area tables 1 through 3, loaded from the census matching project.
df_list = [
    pd.read_csv(f'../census-google_trends-matching/data/metro_areas_table_{table_no}.csv')
    for table_no in range(1, 4)
]

# For each table: the year suffixes of its `median_income_<year>` columns,
# listed in reverse column order.
year_list = [
    [name.split('_')[-1] for name in table.columns if 'median_income_' in name][::-1]
    for table in df_list
]

# Placeholder; the retrieval cell rebinds feat_list to a list before use.
feat_list = {}

In [34]:
# Sanity check: show the first rows of the first table, then the year lists.
for preview in (df_list[0].head(), year_list):
    print(preview)

                        name state_id   id  median_income_2011  \
0               Baltimore MD       MD  512            68391.04   
1  Flint-Saginaw-Bay City MI       MI  513            43836.32   
2                 Buffalo NY       NY  514            47179.47   
3              Cincinnati OH       IN  515            54131.45   
4                    Erie PA       PA  516            43991.54   

   median_income_2012  median_income_2013  median_income_2014  \
0            68928.28            69424.24            70117.38   
1            43635.57            43021.42            43496.56   
2            48039.91            48387.03            48968.55   
3            54211.58            54087.13            54643.44   
4            44754.73            44871.36            45493.37   

   median_income_2015  median_income_2016  median_income_2017  \
0            70638.12            72217.83            74800.76   
1            43838.59            44553.31            46042.25   
2            4923

In [44]:
# Enumerate every unique (geo, year) pair in a fixed order:
# tables in df_list order, then years in year_list order, then table rows.
#   lookup_table  maps "<geo>-<year>" -> 0 (not retrieved yet); the retrieval
#                 loop flips entries to 1 once they are handled.
#   geo_indexing / years_indexing hold the geo and year of each pair, in the
#                 same order the pairs were first seen.
years_indexing, geo_indexing = [], []
lookup_table = dict()
for table, years in zip(df_list, year_list):
    # The geo code depends only on the row, so build the codes once per table
    # instead of re-running iterrows() for every year.
    geos = [f"US-{row['state_id']}-{row['id']}" for _, row in table.iterrows()]
    for year in years:
        for geo in geos:
            key = f"{geo}-{year}"
            # Skip pairs that were already recorded (e.g. by an earlier table).
            if key not in lookup_table:
                lookup_table[key] = 0
                years_indexing.append(year)
                geo_indexing.append(geo)

### Save the geo/year index file if it is not already present

In [46]:
# Table of the (geo, year) pairs in the order they were enumerated.
idx_path = 'data/geo_year_idx.csv'
df = pd.DataFrame({'geo': geo_indexing, 'year': years_indexing})
# Write the index only when the file is missing, so that an existing index
# is never silently overwritten by a re-run of this cell.
if not os.path.exists(idx_path):
    df.to_csv(idx_path)

### Utility functions for data retrieval

In [47]:
def feature_valid(feature):
    """Return True when `feature` holds only values within [0, 100] and no NaN.

    Args:
        feature: A pandas DataFrame, pandas Series, NumPy array or sequence
            of numbers. For a DataFrame only the numeric columns are
            checked, so a boolean/object flag column (such as `isPartial`
            in a pytrends response) does not take part in the range check.

    Returns:
        bool: False if there are no numeric values at all, if any value is
        NaN, or if any value lies outside the closed interval [0, 100];
        True otherwise.
    """
    if isinstance(feature, pd.DataFrame):
        feature = feature.select_dtypes(include='number')
    # Flatten everything to one float array so each check reduces to a single
    # scalar; reducing a DataFrame with .any() yields a per-column Series,
    # whose truth value is ambiguous and raises ValueError inside `if`.
    values = np.asarray(feature, dtype=float)
    if values.size == 0:
        # Nothing to validate (e.g. an empty response) counts as invalid.
        return False
    if np.isnan(values).any():
        return False
    return bool(((values >= 0) & (values <= 100)).all())

### Continue retrieval from checkpoint

In [None]:
# Path of an existing .npy checkpoint to resume from ('' means start fresh)
# and path where the combined result will be written.
ckpt_dir = ''
save_dir = 'data/trends_data_0.npy'

# Number of leading entries already stored in the checkpoint; the retrieval
# loop skips that many (geo, year) pairs before issuing new requests.
cur_data_size = 0
if ckpt_dir != '' and os.path.exists(ckpt_dir):
    ckpt_data = np.load(ckpt_dir)
    cur_data_size = ckpt_data.shape[0]
else:
    print("Checkpoint not found!")

In [36]:
# Query Google Trends for every (geo, year) pair not yet marked as done in
# lookup_table and append one array per pair (one row per category) to
# feat_list.
feat_list = []
# The response column holding the interest values is named after the keyword.
kw = kw_list[0]
# Number of data points kept per query.
n_points = 52

for table, years in zip(df_list, year_list):
    for year in years:
        for _, row in table.iterrows():
            geo = f"US-{row['state_id']}-{row['id']}"
            key = f"{geo}-{year}"
            if lookup_table[key] != 0:
                continue
            if cur_data_size != 0:
                # The first cur_data_size pairs are treated as already stored
                # in the checkpoint: mark them done without querying.
                lookup_table[key] = 1
                cur_data_size -= 1
                continue
            feats = []
            for cat in cats:
                pytrends.build_payload(kw_list, cat=cat, timeframe=date_range(year), geo=geo, gprop='')
                feat = pytrends.interest_over_time()
                # A response without the keyword column (e.g. an empty
                # DataFrame) yields no values instead of raising KeyError.
                if kw in feat.columns:
                    values = feat[kw].to_numpy()[:n_points]
                else:
                    values = np.array([])
                if len(values) < n_points:
                    # Pad short responses with NaN so every entry has the same
                    # length and still stacks into a rectangular array; the NaN
                    # makes the entry fail validation below.
                    padding = np.full(n_points - len(values), np.nan)
                    values = np.concatenate((values.astype(float), padding))
                # Validate only the value array: passing the whole DataFrame
                # makes the check raise on an ambiguous Series truth value.
                if not feature_valid(values):
                    print("Invalid feature!")
                feats.append(values)
                # Random pause between requests (presumably to avoid rate limiting).
                time.sleep((np.random.rand() + 0.2) * 5)
            feat_list.append(np.array(feats))
            lookup_table[key] = 1

(2, 52)


### Manually remove invalid entries from feat_list here

In [None]:
# Manual clean-up hook: before the conversion below, rebind feat_list to a
# version without the entries that were reported as "Invalid feature!".
# feat_list = ...
# Stack the per-query arrays into a single ndarray
# (presumably shaped (n_queries, len(cats), 52) — confirm after clean-up).
feat_list = np.array(feat_list)

### Save the data

In [None]:
# Combine the newly retrieved features with the checkpoint (when one exists)
# and write the result to save_dir.
new_feat = np.asarray(feat_list)
if ckpt_dir and os.path.exists(ckpt_dir):
    prev_data = np.load(ckpt_dir)
    if new_feat.size == 0:
        # Nothing new was retrieved; concatenating an empty 1-D array with the
        # checkpoint would fail on mismatched dimensions.
        full_feat = prev_data
    else:
        full_feat = np.concatenate((prev_data, new_feat), axis=0)
else:
    # No checkpoint to resume from (ckpt_dir is '' or missing): save only the
    # new data instead of failing in np.load.
    full_feat = new_feat
np.save(save_dir, full_feat)