Let's load all the data we plan to use for this model and set the times as the index

In [1]:
import pandas as pd


In [10]:
cdc = pd.read_csv("cdc_average_bystate_2013-2017.csv")
gtrends = pd.read_csv("gtrends_flu_shot.csv")
twitter = pd.read_csv("tweets_with_location.csv") #thanks to Wik for generating this one

In [12]:
cdc.set_index('time', inplace=True)
gtrends.set_index('date', inplace=True)
twitter.set_index('date', inplace=True)

In [19]:
cdc.index = pd.to_datetime(cdc.index)
gtrends.index = pd.to_datetime(gtrends.index)
twitter.index = pd.to_datetime(twitter.index)

cdc.sort_index(inplace=True)
gtrends.sort_index(inplace=True)
twitter.sort_index(inplace=True)

Since our twitter data only goes back 3 years, we will start with the 2015 data

In [37]:
cdc = cdc[cdc.index > pd.Timestamp('2015-05-01')]



In [38]:
#will calculate the increment of cdc data which starts in july and then adds up every month
def cdc_increment(timeCol, pctCol):
    diffCol = []
    for i in range(len(timeCol)):
        if timeCol[i].month != 7:
            diffCol.append(pctCol[i]-pctCol[i-1])
        else:
            diffCol.append(pctCol[i])
            
    return diffCol
            
    

In [18]:
#function that grabs last 6 gtrends points for every CDC date
def combine_cdc_gtrends(gtrends_df, cdc_df):
    cdc_index = cdc_df.index
    cdc_values = cdc_df.mean_pct.values
    gtrends_index = gtrends_df.index
    row_dict = {}
    for cdc_date, cdc_value in zip(cdc_index, cdc_values):
        ind = gtrends_index.searchsorted(cdc_date)
        if (gtrends_index[ind] > cdc_date):
            ind = ind - 1
        last_six_gtrends = gtrends_df.iloc[ind-5:ind+1,0].values
        row_dict[cdc_date] = [cdc_value] + last_six_gtrends.tolist()
    
    combined_df = pd.DataFrame.from_dict(row_dict, orient='index')
    combined_df.columns = ['cdc', 'gtrends-5', 'gtrends-4', 'gtrends-3', 'gtrends-2', 'gtrends-1', 'gtrends-0']
    return combined_df

This function will grab a single state and combine gtrends and twitter:

In [39]:
def get_state_target_df(cdc_df, gtrends_df, twitter_df, state_abbr, state_name):
    cdc_state = cdc_df[cdc_df.state == state_name]
    gtrends_state = gtrends_df[[state_abbr]]
    twitter_state = twitter_df[[state_name]]

    state_df = combine_cdc_gtrends(gtrends_state, cdc_state)
    
    twitter_state['twitter-1'] = twitter_state[state_name].shift(periods=1)
    twitter_state['twitter-2'] = twitter_state[state_name].shift(periods=2)
    twitter_state.columns = ['twitter-0', 'twitter-1', 'twitter-2']
    
    state_df = state_df.join(twitter_state)
    
    return state_df
    

Now let's check that things make sense for WA state

In [40]:
wa_df = get_state_target_df(cdc, gtrends, twitter, 'WA', 'Washington')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [45]:
wa_df.head(20)

Unnamed: 0,cdc,gtrends-5,gtrends-4,gtrends-3,gtrends-2,gtrends-1,gtrends-0,twitter-0,twitter-1,twitter-2,cdc_monthly
2015-05-15,49.5,3,2,3,3,2,1,16.0,54.0,114.0,1.0
2015-07-15,0.5,2,1,1,3,1,1,7.0,11.0,16.0,0.5
2015-08-15,1.8,1,1,1,2,1,2,5.0,7.0,11.0,1.3
2015-09-15,9.2,2,6,7,9,12,21,21.0,5.0,7.0,7.4
2015-10-15,28.0,12,21,33,47,45,47,47.0,21.0,5.0,18.8
2015-11-15,38.2,47,52,44,35,28,27,14.0,47.0,21.0,10.2
2015-12-15,41.8,28,27,14,12,14,12,13.0,14.0,47.0,3.6
2016-01-15,44.4,14,12,6,4,7,8,19.0,13.0,14.0,2.6
2016-02-15,46.4,8,12,8,7,6,3,10.0,19.0,13.0,2.0
2016-03-15,47.2,6,3,5,6,6,3,12.0,10.0,19.0,0.8


In [42]:
wa_df["cdc_monthly"] = cdc_increment(wa_df.index, wa_df.cdc)

The correlation matrix is reasonable:

In [44]:
wa_df.corr()

Unnamed: 0,cdc,gtrends-5,gtrends-4,gtrends-3,gtrends-2,gtrends-1,gtrends-0,twitter-0,twitter-1,twitter-2,cdc_monthly
cdc,1.0,0.36816,0.283882,0.209132,0.155065,0.087561,0.006609,0.210138,0.462641,0.409438,-0.138216
gtrends-5,0.36816,1.0,0.931377,0.741304,0.627223,0.502763,0.386736,0.237146,0.555063,0.051581,0.31298
gtrends-4,0.283882,0.931377,1.0,0.90659,0.789947,0.670324,0.556458,0.281665,0.515551,-0.047466,0.511738
gtrends-3,0.209132,0.741304,0.90659,1.0,0.946974,0.879536,0.800807,0.483687,0.426014,-0.162951,0.710998
gtrends-2,0.155065,0.627223,0.789947,0.946974,1.0,0.963758,0.885405,0.66507,0.309754,-0.19929,0.804977
gtrends-1,0.087561,0.502763,0.670324,0.879536,0.963758,1.0,0.963062,0.719967,0.190964,-0.248082,0.852798
gtrends-0,0.006609,0.386736,0.556458,0.800807,0.885405,0.963062,1.0,0.703706,0.090048,-0.286205,0.861506
twitter-0,0.210138,0.237146,0.281665,0.483687,0.66507,0.719967,0.703706,1.0,0.211172,-0.028658,0.575103
twitter-1,0.462641,0.555063,0.515551,0.426014,0.309754,0.190964,0.090048,0.211172,1.0,0.522582,0.111079
twitter-2,0.409438,0.051581,-0.047466,-0.162951,-0.19929,-0.248082,-0.286205,-0.028658,0.522582,1.0,-0.279517
