### Imports and Google Trends client setup

In [1]:
import os
import time
import json

import numpy as np
import pandas as pd
from pytrends.request import TrendReq

# Google Trends client shared by the retrieval code in this notebook.
# hl is the interface language; tz is a timezone offset in minutes
# (360 is presumably US Central time; confirm against the pytrends docs).
pytrends = TrendReq(hl='en-US', tz=360)

### Retrieval settings (adjust these for each retrieval run)

In [2]:
# Keyword list passed to Google Trends; the only keyword is the empty string.
kw_list = ['']


def date_range(year):
    """Return the timeframe string spanning Jan 1 to Dec 31 of `year`."""
    return f'{year}-01-01 {year}-12-31'


# Category ids queried for every (geo, year) pair; further candidate ids are
# kept in the trailing comment.
cats = [1220, 38, 60]#, 310, 899, 229, 277, 75, 366, 59, 54, 29, 78, 67, 1140, 1147]

# The three metro-area tables (files are numbered 1..3).
df_list = [
    pd.read_csv(f'../census-google_trends-matching/data/metro_areas_table_{table_no}.csv')
    for table_no in range(1, 4)
]

# For each table, the year suffixes of its `median_income_<year>` columns,
# in reverse column order.
year_list = []
for table in df_list:
    income_years = [name.split('_')[-1] for name in table.columns if 'median_income_' in name]
    year_list.append(income_years[::-1])

In [3]:
# Sanity check: first rows of the first table, then the years found per table.
print(df_list[0].head(), year_list, sep='\n')

                        name state_id  ...  median_income_2019  median_income_2020
0               Baltimore MD       MD  ...            81272.49            83348.86
1  Flint-Saginaw-Bay City MI       MI  ...            49142.99            50455.48
2                 Buffalo NY       NY  ...            56389.05            57887.94
3              Cincinnati OH       IN  ...            63838.19            66076.05
4                    Erie PA       PA  ...            51341.98            52756.56

[5 rows x 13 columns]
[['2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011'], ['2019', '2018', '2017', '2016', '2015', '2014'], ['2021', '2019', '2018', '2017', '2016', '2015', '2014']]


In [4]:
# Enumerate every unique (geo, year) pair, in table order, into two parallel lists.
years_indexing, geo_indexing = [], []
# Keys already emitted, formatted "US-<state_id>-<id>-<year>"; the stored value
# is a placeholder, only membership is used.
lookup_table = dict()
# Pair each table with its own list of years instead of indexing by position.
for table, years in zip(df_list, year_list):
    for year in years:
        for _, row in table.iterrows():
            geo = f"US-{row['state_id']}-{row['id']}"
            key = f"{geo}-{year}"
            # A geo appearing in more than one table is recorded once per year.
            if key not in lookup_table:
                lookup_table[key] = 0
                years_indexing.append(year)
                geo_indexing.append(geo)

### Save the geo/year index file (overwrites any existing copy)

In [5]:
# Collect the (geo, year) pairs into a table and write it to data/geo_year_idx.csv.
# The DataFrame's integer index is written as the first CSV column.
df = pd.DataFrame({'geo': geo_indexing, 'year': years_indexing})
# to_csv does not create missing directories, so make sure data/ exists first.
os.makedirs('data', exist_ok=True)
df.to_csv('data/geo_year_idx.csv')
# Kept as the last expression so the preview is actually rendered by the cell.
df.head()

### Utility functions for data retrieval

In [6]:
def feature_valid(feature):
    """Return True when `feature` has no NaN and every value lies in [0, 100].

    `feature` is expected to be a numpy array (element-wise comparisons and
    `.any()`/`.all()` are applied to it). An empty array is considered valid.
    """
    # NaN is the only value that compares unequal to itself.
    contains_nan = (feature != feature).any()
    if contains_nan:
        return False
    within_bounds = ((feature >= 0) & (feature <= 100)).all()
    return bool(within_bounds)

### Continue retrieval from checkpoint

In [17]:
# ckpt_dir: JSON file with previously retrieved features to resume from.
# save_dir: JSON file that receives the results of this run.
# (Despite the names, both are file paths, not directories.)
ckpt_dir = 'data/trends_data_1.json'
save_dir = 'data/trends_data_2.json'

# Start from an empty result set and fill it from the checkpoint when one exists.
feat_json = dict()
if ckpt_dir != '' and os.path.exists(ckpt_dir):
    with open(ckpt_dir, 'r') as ckpt_file:
        feat_json = json.load(ckpt_file)
else:
    print("Checkpoint not found!")

In [20]:
# Retrieve one Google Trends series per category for every (geo, year) pair
# that is not yet in feat_json, saving feat_json to save_dir after each pair.

# Attempts per request, pause between failed attempts, and number of leading
# samples kept from each returned series.
MAX_TRIALS = 4
RETRY_WAIT_SECONDS = 60.0
SAMPLES_PER_SERIES = 52

for table, years in zip(df_list, year_list):
    for year in years:
        for _, row in table.iterrows():
            geo = f"US-{row['state_id']}-{row['id']}"
            key = f"{geo}-{year}"
            # Already loaded from the checkpoint or fetched earlier in this run.
            if key in feat_json:
                continue
            feats = []
            for cat in cats:
                last_error = None
                for _attempt in range(MAX_TRIALS):
                    try:
                        pytrends.build_payload(kw_list, cat=cat, timeframe=date_range(year), geo=geo, gprop='')
                        feat = pytrends.interest_over_time()
                        break
                    except Exception as err:
                        # `Exception` rather than a bare except, so Ctrl-C
                        # (KeyboardInterrupt) still stops the run.
                        last_error = err
                        time.sleep(RETRY_WAIT_SECONDS)
                else:
                    # Reached only when no attempt hit `break`, i.e. all of them
                    # failed. A success on the final attempt no longer raises.
                    raise Exception("Failed to retrieve data!") from last_error

                # The result column is named after the keyword that was queried.
                f_numpy = feat[kw_list[0]].to_numpy()[:SAMPLES_PER_SERIES]
                # Invalid series are reported but still stored.
                if not feature_valid(f_numpy):
                    print("Invalid feature!")
                feats.append(f_numpy)
            feat_final = np.array(feats)
            feat_json[key] = feat_final.tolist()
            # Rewrite the whole result file after every pair so an interrupted
            # run can be resumed from it.
            with open(save_dir, 'w') as f:
                json.dump(feat_json, f, indent=4)
            print(f"Total of {len(feat_json)} points!", end="\r")

Total of 738 points!

Exception: Failed to retrieve data!

In [18]:
# Number of (geo, year) entries currently held in feat_json.
len(feat_json)

729

### Manually remove invalid data from `feat_list` here

In [None]:
# Remove invalid data
# feat_list = ...
# NOTE(review): feat_list is not assigned in any cell visible in this notebook
# (the retrieval loop stores its results in feat_json), so the line below raises
# NameError unless the placeholder above is filled in first.
feat_list = np.array(feat_list)

### Save the data

In [None]:
# ckpt_dir and save_dir point at JSON files, and feat_json already holds the
# checkpoint entries plus everything retrieved since, so the combined result is
# written as JSON. np.load cannot parse the JSON checkpoint, and np.save would
# append ".npy" to save_dir.
# NOTE(review): the earlier numpy-based merge used `feat_list`; if manual
# cleaning is still wanted, apply it to feat_json before running this cell.
with open(save_dir, 'w') as f:
    json.dump(feat_json, f, indent=4)