In [None]:
import pandas as pd
import numpy as np
from scipy import stats

from weighted_kde import *
import tweet_tokenizer

In [None]:
# Read the tweet CSV into `df`; the first CSV column becomes the DataFrame index.
power_outge_tweets_file = 'sandy_pow_out_tweets_naive_bayes.csv'
df = pd.read_csv(
    power_outge_tweets_file,
    index_col=0,
)
# Alternative input, kept for reference:
#df = pd.read_csv(open('sandy_tweets_1.csv'), encoding='utf-8', engine='c')
df.head(5)

In [None]:
# Number of rows in the loaded tweet table
df.shape[0]

In [None]:
# Fill missing 'county' values with an empty string (e.g. Washington D.C.).
# `fillna` is the direct way to do this; the previous `replace(np.nan, '', regex=True)`
# requested regex matching on a non-string pattern. Having no NaN in 'county' also
# matters for the groupby on ['state', 'county'], which drops rows with NaN keys.
df['county'] = df['county'].fillna('')

In [None]:
# Group by (state, county) pairs and store each group's tweet count on every row.
# See http://stackoverflow.com/questions/17432944/python-pandas-error-when-doing-groupby-counts
county_groups = df.groupby(['state', 'county'])
df['county_tweet_count'] = county_groups['tweet_id'].transform('count')

# Sanity check: how many rows have state + county equal to 'New JerseyHudson County'
hudson_mask = (df['state'] + df['county']) == 'New JerseyHudson County'
print (len(df[hudson_mask]))
df.head(6)

In [None]:
# Keep only the rows whose 'pow_out_label' equals 1
df_filt = df.loc[df['pow_out_label'] == 1]

In [None]:
# `df_filt` contains only rows with pow_out_label == 1, so label the count accordingly
# (the previous message described it as the number of Sandy related tweets).
print ('Total number of power outage related tweets: %d' % len(df_filt))

In [None]:
# Preview the first five filtered rows
df_filt.head(5)

## Google Maps plots

In [None]:
import os

import gmaps
import gmaps.datasets

# Read the Google API key from the environment rather than hardcoding it, so the
# secret is not stored in (or shared with) the notebook file.
# NOTE(review): the key that was previously embedded here should be treated as
# exposed and rotated in the Google Cloud console.
google_api_key = os.environ.get('GOOGLE_API_KEY')
if not google_api_key:
    raise RuntimeError('Set the GOOGLE_API_KEY environment variable to your Google API key')

gmaps.configure(api_key=google_api_key)

In [None]:
# Build one (latitude, longitude) tuple per row of the filtered tweets
latlon_frame = df_filt.loc[:, ['latitude', 'longitude']]
latlon_pairs = latlon_frame.apply(tuple, axis=1)
data = list(latlon_pairs)
data[:3]

In [None]:
from matplotlib import cm

# Build a list of (r, g, b, alpha) tuples from matplotlib's `Reds` colormap:
# the colour channels are scaled to integers in 0-255 and the alpha channel is
# replaced by 0.99*(1 - i/256), which decreases as the colormap index i grows.
# `range` is used instead of `xrange` so the cell runs on both Python 2 and Python 3.
rgba = []
for i in range(255):
    red, green, blue, _ = cm.Reds(i)
    alpha = 0.99*(1-i/256.0)
    rgba.append((int(red*255), int(green*255), int(blue*255), alpha))

rgba

In [None]:
# Bounding box (min/max longitude and latitude) over the whole dataset
lon_min = df['longitude'].values.min()
lon_max = df['longitude'].values.max()
lat_min = df['latitude'].values.min()
lat_max = df['latitude'].values.max()

# Longitude / latitude arrays restricted to the filtered (pow_out_label == 1) tweets
filtered_lon_vals = df_filt['longitude'].values
filtered_lat_vals = df_filt['latitude'].values

# One (latitude, longitude) tuple per filtered tweet, as input for the heatmap layer
latlon_frame = df_filt[['latitude', 'longitude']]
data = list(latlon_frame.apply(tuple, axis=1))

# Create the map and an unweighted heatmap layer
m = gmaps.Map()
heatmap_layer = gmaps.Heatmap(data=data)
heatmap_layer.max_intensity = 80
heatmap_layer.point_radius = 10
m.add_layer(heatmap_layer)

# Colour stops, presumably (red, green, blue, alpha) from low to high intensity
heatmap_layer.gradient = [
    (255, 255, 255, 0.01),
    (255, 165, 0, 0.4),
    (255, 69, 0, 0.6),
    (255, 0, 0, 0.8),
]
#heatmap_layer.gradient = rgba
m

## Normalized distribution of Power Outage related Tweets

In [None]:
# Bounding box (min/max longitude and latitude) over the whole dataset
lon_min, lon_max = df['longitude'].values.min(), df['longitude'].values.max()
lat_min, lat_max = df['latitude'].values.min(), df['latitude'].values.max()

# Latitude / longitude arrays restricted to the filtered (pow_out_label == 1) tweets
filtered_lats, filtered_lons = df_filt['latitude'].values, df_filt['longitude'].values

# Weight every tweet by the inverse of its county's tweet count, then rescale
# so that the largest weight equals 10.
weights = 1.0/df_filt['county_tweet_count'].values
weights *= 10.0/weights.max()

# Column order is pinned explicitly because each row becomes a (lat, lon, wt) tuple.
df_temp = pd.DataFrame(
    {'lat': filtered_lats, 'lon': filtered_lons, 'wt': weights},
    columns=['lat', 'lon', 'wt'],
)
data = list(df_temp.apply(tuple, axis=1))

m_weighted = gmaps.Map()
heatmap_layer = gmaps.WeightedHeatmap(data=data)
heatmap_layer.max_intensity = 1
heatmap_layer.point_radius = 10

# Colour stops, presumably (red, green, blue, alpha) from low to high intensity
heatmap_layer.gradient = [
    (255, 255, 255, 0.01),
    (255, 165, 0, 0.4),
    (255, 69, 0, 0.6),
    (255, 0, 0, 0.8)
]

# Attach the weighted layer to its own map only. It was previously also added
# to `m`, which overlaid the weighted layer on the unweighted map.
m_weighted.add_layer(heatmap_layer)
m_weighted

# Map of Power Outage Tweets 
## Filtered using a Naive Bayes Classifier (Unigrams + Bigrams)
## Normalized at the county level using reverse geocoding