In [1]:
import pandas as pd
import numpy as np
import folium
from folium import plugins
import os

## Load data

In [2]:
def load_tweets(tweets_attributes_file, tweets_text_file):
    df = pd.read_csv(tweets_attributes_file, index_col=0)
    text = []
    with open(tweets_text_file, 'r') as f:
        for line in f:
            text.append(line)
    df['text'] = text
    return df

In [3]:
tweets_attributes_file = 'sandy_tweets_attributes_rev_geocoded_formatted_timestamps.csv'
tweets_text_file = 'sandy_tweets_text_tokenized.txt'

df = load_tweets(tweets_attributes_file, tweets_text_file)
#df = pd.read_csv(open('sandy_tweets_1.csv'), encoding='utf-8', engine='c')
#df.head()

In [5]:
df['latitude'].mean(), df['longitude'].mean()

(39.454459901212878, -77.311617231304638)

In [4]:
df['county'] = df['county'].astype(str) #.replace(np.nan,'', regex=True)
df['county_tweet_count'] = df.groupby(['state', 'county'])['tweet_id'].transform('count')
df.head(2)

Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,county_tweet_count,sentiment,text
0,260244125050363904,295902181,0,2012-10-22 05:00:09,-74.078101,40.735218,New Jersey,Hudson County,52264,1,all i wish is to be better than yesterday and ...
1,260244177412042752,85314436,0,2012-10-22 05:00:21,-81.50579,33.460462,South Carolina,Barnwell County,1375,0,@imSunnyAF yesssss lawd\n


## Filter Sandy related tweets

In [5]:
sandy_keywords = ['sandy', 'hurricane', 'hurricanesandy', 'storm', 'frankenstorm']
sandy_keywords = sandy_keywords + ['#'+kw for kw in sandy_keywords]

pattern = ' ' + ' | '.join(sandy_keywords) + ' '
pattern
df_filt = df[df['text'].str.contains(pattern)]

## Normalize tweets counts by total number tweets from that each county

In [6]:
weights = 1.0/df_filt['county_tweet_count'].values
weights = weights/max(weights)
weights

array([ 0.00520156,  0.0012047 ,  0.00015472, ...,  0.00027589,
        0.00083799,  0.00904977])

## Geospatial map

In [7]:
lats = df_filt['latitude'].values
lons = df_filt['longitude'].values
#np.mean(lats), np.mean(lons)

m = folium.Map(location=[np.mean(lats), np.mean(lons)], zoom_start=5)
m.add_children(plugins.HeatMap(zip(lats, lons, 100000*weights), radius = 6))
m.save(os.path.join('results', 'sandy_tweet_map_folium.html'))
m

## County averaged sentiment

In [8]:
df.head()

Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,county_tweet_count,sentiment,text
0,260244125050363904,295902181,0,2012-10-22 05:00:09,-74.078101,40.735218,New Jersey,Hudson County,52264,1,all i wish is to be better than yesterday and ...
1,260244177412042752,85314436,0,2012-10-22 05:00:21,-81.50579,33.460462,South Carolina,Barnwell County,1375,0,@imSunnyAF yesssss lawd\n
2,260244177105850368,239968255,0,2012-10-22 05:00:21,-77.099999,39.344184,Maryland,Carroll County,7707,1,"""Waiting for something better , a better you ...."
3,260244156729942016,703352862,0,2012-10-22 05:00:17,-80.90747,39.618102,West Virginia,Tyler County,175,1,Cool right ? #plainoldshirt http://t.co/B55dMS...
4,260244145694728192,581488152,0,2012-10-22 05:00:14,-76.579826,39.81645,Pennsylvania,York County,20191,0,#10PeopleYouTrulyLove My son\n


In [9]:
df['avg_sentiment'] = df[['sentiment', 'state', 'county']].groupby(['state', 'county'])['sentiment'].transform('sum')
df['avg_sentiment'] = df['avg_sentiment']/df['county_tweet_count']
df.head()

Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,county_tweet_count,sentiment,text,avg_sentiment
0,260244125050363904,295902181,0,2012-10-22 05:00:09,-74.078101,40.735218,New Jersey,Hudson County,52264,1,all i wish is to be better than yesterday and ...,0.201668
1,260244177412042752,85314436,0,2012-10-22 05:00:21,-81.50579,33.460462,South Carolina,Barnwell County,1375,0,@imSunnyAF yesssss lawd\n,0.285818
2,260244177105850368,239968255,0,2012-10-22 05:00:21,-77.099999,39.344184,Maryland,Carroll County,7707,1,"""Waiting for something better , a better you ....",0.136759
3,260244156729942016,703352862,0,2012-10-22 05:00:17,-80.90747,39.618102,West Virginia,Tyler County,175,1,Cool right ? #plainoldshirt http://t.co/B55dMS...,0.154286
4,260244145694728192,581488152,0,2012-10-22 05:00:14,-76.579826,39.81645,Pennsylvania,York County,20191,0,#10PeopleYouTrulyLove My son\n,0.17483


In [10]:
df_county_avg = df.drop_duplicates(subset=('state', 'county', 'county_tweet_count'))
df_county_avg = df_county_avg.drop(['tweet_id', 'user_id', 'retweet_count', 'time_stamp', 'longitude', 'latitude', 'text', 'sentiment'], 1)
df_county_avg.head(2)

Unnamed: 0,state,county,county_tweet_count,avg_sentiment
0,New Jersey,Hudson County,52264,0.201668
1,South Carolina,Barnwell County,1375,0.285818


In [11]:
smin = min(df_county_avg['avg_sentiment'])
smax = max(df_county_avg['avg_sentiment'])

df_county_avg[df_county_avg['avg_sentiment']==smax]

Unnamed: 0,state,county,county_tweet_count,avg_sentiment
4279288,Georgia,Jefferson County,1,1.0


In [12]:
df_county_avg = df_county_avg[df_county_avg['county_tweet_count']>1000]
len(df_county_avg)

446

## Normalize avg sentiment to [-1, 1]

In [13]:
smin = min(df_county_avg['avg_sentiment'])
smax = max(df_county_avg['avg_sentiment'])
smin, smax

(-0.0016072863648540047, 0.37666405638214567)

In [14]:
df_county_avg['avg_sentiment'] = df_county_avg['avg_sentiment'] - smin
df_county_avg['avg_sentiment'] = 2.0*df_county_avg['avg_sentiment']/max(df_county_avg['avg_sentiment']) - 1.0

In [15]:
smin = min(df_county_avg['avg_sentiment'])
smax = max(df_county_avg['avg_sentiment'])
smin, smax

(-1.0, 1.0)

## Add ZCTA information

In [16]:
df_us_counties = pd.read_csv('national_county.txt', names=['STATE', 'STATEFP', 'COUNTYFP', 'COUNTYNAME', 'CLASSFP'], 
                             dtype={'STATEFP': object, 'COUNTYFP': object})
df_us_counties['FIPS'] = df_us_counties[['STATEFP', 'COUNTYFP']].apply(lambda x: x[0]+x[1], axis=1)
df_us_counties.head(2)

Unnamed: 0,STATE,STATEFP,COUNTYFP,COUNTYNAME,CLASSFP,FIPS
0,AL,1,1,Autauga County,H1,1001
1,AL,1,3,Baldwin County,H1,1003


In [17]:
states_abbr = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}
#states_abbr = pd.DataFrame(states_abbr.items(), columns=['Abbreviation', 'State'])
#states_abbr.head(2)
states_abbr = dict((v,k) for k,v in states_abbr.iteritems())

In [18]:
def get_state_county_fips(state, county):
    #print (state, county)
    if state=='Washington, D.C.':
        fips = '11001'
        return fips    
    if state == 'New York' and county=='nan':
        county = 'New York County'        
    if state == 'New York' and county=='Bronx':
        county = 'Bronx County'        
    if 'City of ' in county:
        county = county.replace('City of ', '')
        county += ' city'
    if 'Saint ' in county:
        county = county.replace('Saint ', 'St. ')
    fips = df_us_counties[(df_us_counties['STATE']==states_abbr[state]) & (df_us_counties['COUNTYNAME']==county)]['FIPS'].values[0]
    return fips

df_county_avg = df_county_avg[df_county_avg['state']!='Ontario'] # Drop Ontario
df_county_avg['fips'] = df_county_avg[['state', 'county']].apply(lambda x: get_state_county_fips(x[0], x[1]), axis=1)    

#df_county_avg[(df_county_avg['state']=='New York') & (df_county_avg['county']=='Rensselaer County')]

In [19]:
df_zcta_county_rel_10 = pd.read_csv('zcta_county_rel_10.txt', usecols=['ZCTA5', 'STATE', 'COUNTY', 'GEOID'], 
                                    dtype={'ZCTA5':str, 'STATE':str, 'COUNTY':str, 'GEOID':str})

df_zcta_county_rel_10.head(2)

Unnamed: 0,ZCTA5,STATE,COUNTY,GEOID
0,601,72,1,72001
1,601,72,141,72141


In [20]:
df_us_counties = pd.read_csv('national_county.txt', names=['STATE', 'STATEFP', 'COUNTYFP', 'COUNTYNAME', 'CLASSFP'], 
                             dtype={'STATEFP': object, 'COUNTYFP': object})
df_us_counties['FIPS'] = df_us_counties[['STATEFP', 'COUNTYFP']].apply(lambda x: x[0]+x[1], axis=1)
df_us_counties.head(2)

Unnamed: 0,STATE,STATEFP,COUNTYFP,COUNTYNAME,CLASSFP,FIPS
0,AL,1,1,Autauga County,H1,1001
1,AL,1,3,Baldwin County,H1,1003


In [21]:
df_test = pd.merge(df_zcta_county_rel_10, df_us_counties, left_on='GEOID', right_on='FIPS', how='inner')
df_test = df_test.drop(['STATE_x', 'COUNTY', 'GEOID', 'STATE_y', 'STATEFP', 'COUNTYFP', 'COUNTYNAME', 'CLASSFP'], 1)
df_zcta_county_rel_10 = df_test
#df_zcta_county_rel_10[df_zcta_county_rel_10['FIPS']=='36083']

In [22]:
df_sandy_zcta_choropleth = pd.merge(df_county_avg, df_zcta_county_rel_10, left_on='fips', right_on='FIPS')
df_sandy_zcta_choropleth = df_sandy_zcta_choropleth.drop('fips', 1)
#df_sandy_zcta_choropleth[(df_sandy_zcta_choropleth['state']=='New York') & (df_sandy_zcta_choropleth['county']=='Rensselaer County')]
df_sandy_zcta_choropleth = df_sandy_zcta_choropleth.drop(['state', 'county'], 1)
df_sandy_zcta_choropleth.head(2)

Unnamed: 0,county_tweet_count,avg_sentiment,ZCTA5,FIPS
0,52264,0.074762,7002,34017
1,52264,0.074762,7029,34017


In [23]:
df_sandy_zcta_choropleth.to_csv('df_sandy_zcta_choropleth.csv', index=False)

## CartoDB geocode
Import ZCTA as string. First row 06076 --> "06076". Delete quotes manually.

UPDATE df_sandy_zcta_choropleth SET the_geom = cdb_geocode_postalcode_polygon(ZCTA5, 'USA')