In [1]:
import pandas as pd
import numpy as np
import folium
from folium import plugins
import os

## Load data

In [2]:
# Load the power-outage tweet table; the first CSV column becomes the index.
df = pd.read_csv('sandy_pow_out_tweets_naive_bayes_1.csv', index_col=0)
# Cast county to str so missing values become the literal string 'nan',
# which later cells compare against.
df = df.astype({'county': str})
print(df.shape[0])
df.iloc[:2]

7067


Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,county_tweet_count,sentiment,pow_outage_label
24303,260455999276728320,186545667,0,2012-10-22 19:02:03,-72.255078,41.809409,Connecticut,Tolland County,6983,1,1
123333,260923579355508736,293642193,0,2012-10-24 02:00:03,-80.556654,35.347399,North Carolina,Cabarrus County,15521,1,1


## Calculate total number of tweets from each county

In [3]:
# Load every geocoded tweet and count how many were sent from each (state, county).
df_all = pd.read_csv('sandy_tweets_attributes_rev_geocoded_formatted_timestamps.csv', index_col=0)
# Cast county to str so a missing county becomes the key 'nan' instead of NaN.
df_all['county'] = df_all['county'].astype(str)
df_all['county_tweet_count'] = df_all.groupby(['state', 'county'])['tweet_id'].transform('count')
# Look the counts up by index *label*: df.index holds labels, so the positional
# .iloc is only correct when df_all's index happens to be 0..n-1.
# NOTE(review): assumes df's index labels are a subset of df_all's index
# (a missing label raises KeyError) -- confirm against how df was produced.
df['county_tweet_count'] = df_all['county_tweet_count'].loc[df.index]
df.head(2)

Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,county_tweet_count,sentiment,pow_outage_label
24303,260455999276728320,186545667,0,2012-10-22 19:02:03,-72.255078,41.809409,Connecticut,Tolland County,6983,1,1
123333,260923579355508736,293642193,0,2012-10-24 02:00:03,-80.556654,35.347399,North Carolina,Cabarrus County,15521,1,1


## Power outage tweets dataframe 

In [4]:
# Number of power-outage tweets per (state, county), stored on every row of df.
df['county_pow_out_count'] = df.groupby(['state', 'county'])['tweet_id'].transform('count')

# One row per county: share of that county's tweets that are power-outage tweets.
# The ratio is computed on a temporary copy (assign), so df itself is never
# given -- and then stripped of -- a 'county_pow_out_ratio' column.
df_pow_out_choropleth = (
    df.assign(county_pow_out_ratio=df['county_pow_out_count'] / df['county_tweet_count'])
      .loc[:, ['state', 'county', 'county_pow_out_ratio', 'county_tweet_count']]
      .drop_duplicates()
)

# Series.max()/min() are vectorised and skip NaN, unlike the builtins.
print(df_pow_out_choropleth['county_pow_out_ratio'].max())
print(df_pow_out_choropleth['county_pow_out_ratio'].min())
df_pow_out_choropleth.head(2)

0.0454545454545
3.42102562348e-05


Unnamed: 0,state,county,county_pow_out_ratio,county_tweet_count
24303,Connecticut,Tolland County,0.002291,6983
123333,North Carolina,Cabarrus County,0.000193,15521


In [5]:
# Spot-check a single county by name.
df_pow_out_choropleth.loc[df_pow_out_choropleth['county'].eq('Rensselaer County')]

Unnamed: 0,state,county,county_pow_out_ratio,county_tweet_count
3737866,New York,Rensselaer County,0.000177,5640


In [6]:
# All counties of New York state that appear in the ratio table.
df_pow_out_choropleth.loc[df_pow_out_choropleth['state'].eq('New York')]

Unnamed: 0,state,county,county_pow_out_ratio,county_tweet_count
222975,New York,Nassau County,0.005619,45917
356713,New York,Dutchess County,0.001593,13185
407706,New York,Queens County,0.003485,58831
414202,New York,Schenectady County,0.000619,4844
416422,New York,Suffolk County,0.00437,54009
481751,New York,Kings County,0.001843,46132
504551,New York,New York County,0.002144,68107
511060,New York,Onondaga County,0.00023,21755
512883,New York,Monroe County,0.000361,35964
519427,New York,Westchester County,0.004891,31486


## Geospatial map

### Normalize tweet counts by the total number of tweets from each county

In [7]:
# Per-tweet weight: inverse of the tweet volume of the tweet's county,
# rescaled so that the largest weight equals 1.
weights = 1.0 / df['county_tweet_count'].values
# ndarray.max() is vectorised; the builtin max() iterates element by element.
weights = weights / weights.max()
weights

array([ 0.00315051,  0.00141743,  0.00047913, ...,  0.00315051,
        0.00055319,  0.00042609])

In [8]:
lats = df['latitude'].values
lons = df['longitude'].values

# Centre the map on the mean tweet location.
m = folium.Map(location=[np.mean(lats), np.mean(lons)], zoom_start=5)

# Materialise the (lat, lon, weight) triples: on Python 3 zip() is a one-shot
# iterator, whereas HeatMap needs a concrete sequence.
heat_points = list(zip(lats, lons, 100000*weights))
# add_child replaces the deprecated add_children.
m.add_child(plugins.HeatMap(heat_points, radius=6))

# Create the output directory on first run so that save() cannot fail on it.
results_dir = 'results'
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)
m.save(os.path.join(results_dir, 'power_outage_map_folium.html'))
m

In [9]:
# County reference table; the FIPS parts are read as strings (dtype=object)
# rather than parsed as numbers.
df_us_counties = pd.read_csv('national_county.txt', names=['STATE', 'STATEFP', 'COUNTYFP', 'COUNTYNAME', 'CLASSFP'],
                             dtype={'STATEFP': object, 'COUNTYFP': object})
# Full county FIPS = state part + county part. Vectorised string concatenation
# replaces the row-wise apply, whose x[0]/x[1] relied on deprecated positional
# integer keys on a label-indexed Series.
df_us_counties['FIPS'] = df_us_counties['STATEFP'] + df_us_counties['COUNTYFP']
df_us_counties.head()

Unnamed: 0,STATE,STATEFP,COUNTYFP,COUNTYNAME,CLASSFP,FIPS
0,AL,1,1,Autauga County,H1,1001
1,AL,1,3,Baldwin County,H1,1003
2,AL,1,5,Barbour County,H1,1005
3,AL,1,7,Bibb County,H1,1007
4,AL,1,9,Blount County,H1,1009


In [10]:
# Postal abbreviation -> full name, as originally listed.
states_abbr = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}
# Invert to full name -> postal abbreviation, the direction the lookups need.
# .items() works on both Python 2 and 3; .iteritems() exists only on Python 2.
states_abbr = {name: abbr for abbr, name in states_abbr.items()}

In [11]:
# First two rows of the county reference table.
df_us_counties.iloc[:2]

Unnamed: 0,STATE,STATEFP,COUNTYFP,COUNTYNAME,CLASSFP,FIPS
0,AL,1,1,Autauga County,H1,1001
1,AL,1,3,Baldwin County,H1,1003


In [12]:
# FIPS code of Rensselaer County, NY, selected with a single .loc call
# (row mask and column together).
df_us_counties.loc[
    df_us_counties['STATE'].eq('NY') & df_us_counties['COUNTYNAME'].eq('Rensselaer County'),
    'FIPS',
]

1869    36083
Name: FIPS, dtype: object

In [13]:
# The tweet data names this county plain 'Bronx' (no ' County' suffix).
df_pow_out_choropleth.loc[
    df_pow_out_choropleth['state'].eq('New York')
    & df_pow_out_choropleth['county'].eq('Bronx')
]

Unnamed: 0,state,county,county_pow_out_ratio,county_tweet_count
657475,New York,Bronx,0.001827,18612


In [14]:
# The reference table spells the same county 'Bronx County'.
df_us_counties.loc[
    df_us_counties['STATE'].eq('NY')
    & df_us_counties['COUNTYNAME'].eq('Bronx County')
]


Unnamed: 0,STATE,STATEFP,COUNTYFP,COUNTYNAME,CLASSFP,FIPS
1830,NY,36,5,Bronx County,H6,36005


In [15]:
def get_state_county_fips(state, county, counties=None, state_abbr=None):
    """Return the FIPS code of a county, given state and county names.

    The county name is normalised to the spelling used by the reference
    table before the lookup:

    * state 'Washington, D.C.' always yields '11001' (no table lookup);
    * in New York, county 'nan' becomes 'New York County' and 'Bronx'
      becomes 'Bronx County';
    * 'City of X' becomes 'X city';
    * 'Saint ' becomes 'St. '.

    Parameters
    ----------
    state : str
        Full state name, used as a key of ``state_abbr``.
    county : str
        County name as it appears in the tweet data.
    counties : pandas.DataFrame, optional
        Reference table with 'STATE', 'COUNTYNAME' and 'FIPS' columns.
        Defaults to the notebook-level ``df_us_counties``.
    state_abbr : dict, optional
        Mapping of full state name to postal abbreviation. Defaults to the
        notebook-level ``states_abbr``.

    Returns
    -------
    The 'FIPS' value of the first matching row of ``counties``.

    Raises
    ------
    KeyError
        If ``state`` is not a key of ``state_abbr``.
    IndexError
        If no row matches the state and normalised county name.
    """
    if state == 'Washington, D.C.':
        return '11001'

    # Fall back to the notebook globals only when no override is supplied.
    if counties is None:
        counties = df_us_counties
    if state_abbr is None:
        state_abbr = states_abbr

    if state == 'New York':
        if county == 'nan':
            county = 'New York County'
        elif county == 'Bronx':
            county = 'Bronx County'
    if 'City of ' in county:
        county = county.replace('City of ', '') + ' city'
    if 'Saint ' in county:
        county = county.replace('Saint ', 'St. ')

    is_match = (counties['STATE'] == state_abbr[state]) & (counties['COUNTYNAME'] == county)
    matches = counties.loc[is_match, 'FIPS'].values
    if len(matches) == 0:
        # Same exception type as the bare .values[0], but naming the culprit.
        raise IndexError(
            'no FIPS code found for county {!r} in state {!r}'.format(county, state)
        )
    return matches[0]

# Resolve the FIPS code of every county row. Iterating the two columns in
# lockstep avoids the row-wise apply, whose x[0]/x[1] relied on deprecated
# positional integer keys on a label-indexed Series.
df_pow_out_choropleth['fips'] = [
    get_state_county_fips(state, county)
    for state, county in zip(df_pow_out_choropleth['state'], df_pow_out_choropleth['county'])
]

#df_pow_out_choropleth[(df_pow_out_choropleth['state']=='New York') & (df_pow_out_choropleth['county']=='Rensselaer County')]

In [1]:
#df_pow_out_choropleth.to_csv('df_pow_out_choropleth.csv', index=False)
#df_pow_out_choropleth.head()

In [17]:
# Confirm that the fips column was filled in for a known county.
df_pow_out_choropleth.loc[
    df_pow_out_choropleth['state'].eq('New York')
    & df_pow_out_choropleth['county'].eq('Rensselaer County')
]

Unnamed: 0,state,county,county_pow_out_ratio,county_tweet_count,fips
3737866,New York,Rensselaer County,0.000177,5640,36083


In [18]:
# ZCTA-to-county relationship file: keep only the four identifier columns and
# read each of them as a string.
df_zcta_county_rel_10 = pd.read_csv(
    'zcta_county_rel_10.txt',
    usecols=['ZCTA5', 'STATE', 'COUNTY', 'GEOID'],
    dtype=dict.fromkeys(['ZCTA5', 'STATE', 'COUNTY', 'GEOID'], str),
)

df_zcta_county_rel_10.head(5)

Unnamed: 0,ZCTA5,STATE,COUNTY,GEOID
0,601,72,1,72001
1,601,72,141,72141
2,602,72,3,72003
3,603,72,5,72005
4,606,72,93,72093


In [19]:
# Re-read the county reference table from the same file with the same
# arguments, with the FIPS parts kept as strings (dtype=object).
df_us_counties = pd.read_csv('national_county.txt', names=['STATE', 'STATEFP', 'COUNTYFP', 'COUNTYNAME', 'CLASSFP'],
                             dtype={'STATEFP': object, 'COUNTYFP': object})
# Full county FIPS = state part + county part. Vectorised string concatenation
# replaces the row-wise apply, whose x[0]/x[1] relied on deprecated positional
# integer keys on a label-indexed Series.
df_us_counties['FIPS'] = df_us_counties['STATEFP'] + df_us_counties['COUNTYFP']
df_us_counties.head()

Unnamed: 0,STATE,STATEFP,COUNTYFP,COUNTYNAME,CLASSFP,FIPS
0,AL,1,1,Autauga County,H1,1001
1,AL,1,3,Baldwin County,H1,1003
2,AL,1,5,Barbour County,H1,1005
3,AL,1,7,Bibb County,H1,1007
4,AL,1,9,Blount County,H1,1009


In [20]:
# Keep only ZCTA rows whose county GEOID exists in the county reference table.
df_test = pd.merge(df_zcta_county_rel_10, df_us_counties, left_on='GEOID', right_on='FIPS', how='inner')
# Drop the listed columns (STATE_x / STATE_y are the merge-suffixed STATE columns).
# axis must be passed by keyword: the positional form was removed in pandas 2.0.
df_test = df_test.drop(
    ['STATE_x', 'COUNTY', 'GEOID', 'STATE_y', 'STATEFP', 'COUNTYFP', 'COUNTYNAME', 'CLASSFP'],
    axis=1,
)
# NOTE: this rebinds the input name, so the cell cannot be re-run without
# first re-running the cell that loads zcta_county_rel_10.txt.
df_zcta_county_rel_10 = df_test
df_zcta_county_rel_10[df_zcta_county_rel_10['FIPS']=='36083']

Unnamed: 0,ZCTA5,FIPS
3611,12018,36083
3612,12022,36083
3613,12028,36083
3614,12033,36083
3615,12040,36083
3616,12052,36083
3617,12057,36083
3618,12061,36083
3619,12062,36083
3620,12063,36083


In [21]:
# County-level row that the ZCTA merge below will fan out.
df_pow_out_choropleth.loc[df_pow_out_choropleth['county'].eq('Rensselaer County')]

Unnamed: 0,state,county,county_pow_out_ratio,county_tweet_count,fips
3737866,New York,Rensselaer County,0.000177,5640,36083


In [64]:
# Fan each county row out to one row per ZCTA in that county (default inner
# join on the FIPS code).
df_pow_out_zcta_choropleth = pd.merge(df_pow_out_choropleth, df_zcta_county_rel_10, left_on='fips', right_on='FIPS')
# 'fips' duplicates 'FIPS' after the merge. axis must be passed by keyword:
# the positional form was removed in pandas 2.0.
df_pow_out_zcta_choropleth = df_pow_out_zcta_choropleth.drop('fips', axis=1)

df_pow_out_zcta_choropleth[(df_pow_out_zcta_choropleth['state']=='New York') & (df_pow_out_zcta_choropleth['county']=='Rensselaer County')]

Unnamed: 0,state,county,county_pow_out_ratio,county_tweet_count,ZCTA5,FIPS
7831,New York,Rensselaer County,0.000177,5640,12018,36083
7832,New York,Rensselaer County,0.000177,5640,12022,36083
7833,New York,Rensselaer County,0.000177,5640,12028,36083
7834,New York,Rensselaer County,0.000177,5640,12033,36083
7835,New York,Rensselaer County,0.000177,5640,12040,36083
7836,New York,Rensselaer County,0.000177,5640,12052,36083
7837,New York,Rensselaer County,0.000177,5640,12057,36083
7838,New York,Rensselaer County,0.000177,5640,12061,36083
7839,New York,Rensselaer County,0.000177,5640,12062,36083
7840,New York,Rensselaer County,0.000177,5640,12063,36083


In [65]:
# Smallest and largest county ratio. Series.min()/max() are vectorised and
# skip NaN, unlike the builtins, which iterate in Python.
a = df_pow_out_zcta_choropleth['county_pow_out_ratio'].min()
b = df_pow_out_zcta_choropleth['county_pow_out_ratio'].max()

In [67]:
# Rows whose ratio equals b, the maximum computed in the previous cell.
df_pow_out_zcta_choropleth.loc[df_pow_out_zcta_choropleth['county_pow_out_ratio'].eq(b)]

Unnamed: 0,state,county,county_pow_out_ratio,county_tweet_count,ZCTA5,FIPS
3475,West Virginia,Doddridge County,0.045455,22,26339,54017
3476,West Virginia,Doddridge County,0.045455,22,26411,54017
3477,West Virginia,Doddridge County,0.045455,22,26415,54017
3478,West Virginia,Doddridge County,0.045455,22,26426,54017
3479,West Virginia,Doddridge County,0.045455,22,26436,54017
3480,West Virginia,Doddridge County,0.045455,22,26443,54017
3481,West Virginia,Doddridge County,0.045455,22,26448,54017
3482,West Virginia,Doddridge County,0.045455,22,26456,54017


In [68]:
# Largest per-county tweet volume.
df_pow_out_zcta_choropleth['county_tweet_count'].max()

87991

In [69]:
# Number of rows whose county has fewer than 1000 tweets.
int(df_pow_out_zcta_choropleth['county_tweet_count'].lt(1000).sum())

660

In [70]:
# Keep only rows whose county has more than 1000 tweets in total.
df_pow_out_zcta_choropleth = df_pow_out_zcta_choropleth.loc[
    df_pow_out_zcta_choropleth['county_tweet_count'].gt(1000)
]

In [79]:
# Keep only rows whose power-outage ratio is above 0.0005.
df_pow_out_zcta_choropleth = df_pow_out_zcta_choropleth.loc[
    df_pow_out_zcta_choropleth['county_pow_out_ratio'].gt(0.0005)
]

In [80]:
# Rows remaining after both filters.
df_pow_out_zcta_choropleth.shape[0]

4960

In [81]:
# Write the filtered ZCTA-level table to CSV, leaving out the row index.
df_pow_out_zcta_choropleth.to_csv(
    'df_pow_out_zcta_choropleth.csv',
    index=False,
)
df_pow_out_zcta_choropleth.head(5)

Unnamed: 0,state,county,county_pow_out_ratio,county_tweet_count,ZCTA5,FIPS
0,Connecticut,Tolland County,0.002291,6983,6029,9013
1,Connecticut,Tolland County,0.002291,6983,6043,9013
2,Connecticut,Tolland County,0.002291,6983,6066,9013
3,Connecticut,Tolland County,0.002291,6983,6071,9013
4,Connecticut,Tolland County,0.002291,6983,6076,9013


In [82]:
# Lower edges of five equal-width bins spanning the ratios that survived the
# filters.
a = df_pow_out_zcta_choropleth['county_pow_out_ratio'].min()
b = df_pow_out_zcta_choropleth['county_pow_out_ratio'].max()
# range() works on both Python 2 and 3; xrange() exists only on Python 2.
for i in range(5):
    print(a + (b - a) * i / 5)

0.000516440006886
0.00247197553492
0.00442751106296
0.00638304659099
0.00833858211902


### CartoDB geocode
UPDATE df_pow_out_zipcode_choropleth set description = '0'||zip where char_length(zip) = 4

Import ZCTA as string. First row 06076 --> "06076". Delete quotes manually.

UPDATE df_pow_out_zcta_choropleth SET the_geom = cdb_geocode_postalcode_polygon(ZCTA5, 'USA')

In [None]:
import folium
import json

# GeoJSON file used as the choropleth geometry.
county_geo = r'us-counties.json'

# Start centred on roughly the middle of the contiguous US.
map1 = folium.Map(location=[39.8282, -98.5795], zoom_start=4)

# Map.geo_json has been removed from folium; folium.Choropleth is its
# replacement. The key column is 'fips' -- the name this notebook gives the
# county code -- not 'FIPS_Code', which is never created here.
# NOTE(review): key_on='feature.id' assumes the feature ids in
# us-counties.json are the same string FIPS codes as the 'fips' column --
# confirm against the file.
folium.Choropleth(
    geo_data=county_geo,
    data=df_pow_out_choropleth,
    columns=['fips', 'county_pow_out_ratio'],
    key_on='feature.id',
    fill_color='YlGnBu',
    line_opacity=0.3,
).add_to(map1)
map1