In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from weighted_kde import *
import tweet_tokenizer

In [2]:
def load_tweets(tweets_attributes_file, tweets_text_file):
    df = pd.read_csv(tweets_attributes_file, index_col=0)
    text = []
    with open(tweets_text_file, 'r') as f:
        for line in f:
            text.append(line)
    df['text'] = text
    return df

In [3]:
# Load tweet data
tweets_attributes_file = 'sandy_tweets_attributes_rev_geocoded_formatted_timestamps.csv'
tweets_text_file = 'sandy_tweets_text_tokenized.txt'

df = load_tweets(tweets_attributes_file, tweets_text_file)
#df = pd.read_csv(open('sandy_tweets_1.csv'), encoding='utf-8', engine='c')
df.head()

Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,text
0,260244125050363904,295902181,0,2012-10-22 05:00:09,-74.078101,40.735218,New Jersey,Hudson County,all i wish is to be better than yesterday and ...
1,260244177412042752,85314436,0,2012-10-22 05:00:21,-81.50579,33.460462,South Carolina,Barnwell County,@imSunnyAF yesssss lawd\n
2,260244177105850368,239968255,0,2012-10-22 05:00:21,-77.099999,39.344184,Maryland,Carroll County,"""Waiting for something better , a better you ...."
3,260244156729942016,703352862,0,2012-10-22 05:00:17,-80.90747,39.618102,West Virginia,Tyler County,Cool right ? #plainoldshirt http://t.co/B55dMS...
4,260244145694728192,581488152,0,2012-10-22 05:00:14,-76.579826,39.81645,Pennsylvania,York County,#10PeopleYouTrulyLove My son\n


In [4]:
len(df)

4779087

In [5]:
# Replace missing value in 'county' column by empty string. e.g. Washington D.C.
df['county'] = df['county'].replace(np.nan,'', regex=True)

In [6]:
# Groupby state-conunty pairs and save counts in a new column
# See http://stackoverflow.com/questions/17432944/python-pandas-error-when-doing-groupby-counts

#df['county_tweet_count'] = df.groupby(['state', 'county']).transform('count')
df['county_tweet_count'] = df.groupby(['state', 'county'])['tweet_id'].transform('count')

# Test print
print (len(df[df['state']+df['county'] == 'New JerseyHudson County']))
df.head(6)

52264


Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,text,county_tweet_count
0,260244125050363904,295902181,0,2012-10-22 05:00:09,-74.078101,40.735218,New Jersey,Hudson County,all i wish is to be better than yesterday and ...,52264
1,260244177412042752,85314436,0,2012-10-22 05:00:21,-81.50579,33.460462,South Carolina,Barnwell County,@imSunnyAF yesssss lawd\n,1375
2,260244177105850368,239968255,0,2012-10-22 05:00:21,-77.099999,39.344184,Maryland,Carroll County,"""Waiting for something better , a better you ....",7707
3,260244156729942016,703352862,0,2012-10-22 05:00:17,-80.90747,39.618102,West Virginia,Tyler County,Cool right ? #plainoldshirt http://t.co/B55dMS...,175
4,260244145694728192,581488152,0,2012-10-22 05:00:14,-76.579826,39.81645,Pennsylvania,York County,#10PeopleYouTrulyLove My son\n,20191
5,260244141139701760,80608282,1,2012-10-22 05:00:13,-84.472785,39.147755,Ohio,Hamilton County,Mortal kombat ! ! ! ! ! ! @JoeMoDavis #happybi...,51632


In [7]:
# Filter out Sandy related tweets'
#sandy_keywords = ['sandy', 'hurricane', 'storm', 'frankenstorm', 
#                  'power', 'no power', 'blackout',
#                  'gas', 'flooding', 'recovery', 
#                  'weather', 'climate', 'climate change', 'stay safe', 'FEMA']
#sandy_keywords = ['sandy', 'hurricane', 'hurricanesandy', 'frankenstorm', 
#                  'power outage', 'no power', 'blackout', 'no electricity', 'no light',
#                  'no gas', 'flooding',
#                  'climate change', 'fema', 'red cross']
sandy_keywords = ['sandy', 'hurricane', 'hurricanesandy', 'storm', 'frankenstorm']
sandy_keywords = sandy_keywords + ['#'+kw for kw in sandy_keywords]

pattern = ' ' + ' | '.join(sandy_keywords) + ' '
pattern
df_filt = df[df['text'].str.contains(pattern)]

In [8]:
print ('Total number of Sandy related tweets: %d' % len(df_filt))

Total number of Sandy related tweets: 54105


In [9]:
df_filt.head()

Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,text,county_tweet_count
2233,260259374579204096,555599291,0,2012-10-22 06:00:44,-77.383514,37.129962,Virginia,City of Petersburg,Tribe storm :)\n,2307
3719,260276715958464512,741091694,0,2012-10-22 07:09:39,-79.147932,37.402037,Virginia,City of Lynchburg,"""Talkin to these kids , making laugh up a stor...",9961
3943,260290515403145216,360388255,0,2012-10-22 08:04:29,-80.015212,40.430596,Pennsylvania,Allegheny County,All I see is a storm that you'll get lost in ....,77561
7303,260350662867562497,529698640,0,2012-10-22 12:03:29,-75.238252,39.953538,Pennsylvania,Delaware County,http://t.co/BhhL8usJ praise GOD n the storm . ...,53618
10147,260366743153823745,39651540,0,2012-10-22 13:07:23,-75.58601,39.94597,Pennsylvania,Chester County,@BigJoeBastardi what are water temps off e coa...,20863


## Google map plots

In [10]:
import gmaps
import gmaps.datasets

gmaps.configure(api_key="AIzaSyAWVuwTc9I0KC6jskl6_yZwOzhhG8J3MsA") # Your Google API key

In [11]:
data = list(df_filt[['latitude', 'longitude']].apply(tuple, axis=1))
data[:3]

[(37.129962140000004, -77.383514340000005),
 (37.402037, -79.147931900000003),
 (40.430596350000002, -80.015212329999997)]

In [24]:
from matplotlib import cm

rgba = []
for i in xrange(255):
    rgba_val = list(cm.jet(i))
    rgba_val = [int(v*255) for v in rgba_val]
    rgba_val[3] = 0.99*(1-i/256.0)
    rgba.append(tuple(rgba_val))
    
rgba

[(0, 0, 127, 0.99),
 (0, 0, 132, 0.9861328125),
 (0, 0, 136, 0.982265625),
 (0, 0, 141, 0.9783984375),
 (0, 0, 145, 0.97453125),
 (0, 0, 150, 0.9706640625),
 (0, 0, 154, 0.966796875),
 (0, 0, 159, 0.9629296875),
 (0, 0, 163, 0.9590625),
 (0, 0, 168, 0.9551953125),
 (0, 0, 172, 0.951328125),
 (0, 0, 177, 0.9474609375),
 (0, 0, 182, 0.94359375),
 (0, 0, 186, 0.9397265625),
 (0, 0, 191, 0.9358593749999999),
 (0, 0, 195, 0.9319921875),
 (0, 0, 200, 0.928125),
 (0, 0, 204, 0.9242578124999999),
 (0, 0, 209, 0.920390625),
 (0, 0, 213, 0.9165234375),
 (0, 0, 218, 0.91265625),
 (0, 0, 222, 0.9087890625),
 (0, 0, 227, 0.904921875),
 (0, 0, 232, 0.9010546875),
 (0, 0, 236, 0.8971875),
 (0, 0, 241, 0.8933203125),
 (0, 0, 245, 0.889453125),
 (0, 0, 250, 0.8855859375),
 (0, 0, 254, 0.88171875),
 (0, 0, 255, 0.8778515625),
 (0, 0, 255, 0.873984375),
 (0, 0, 255, 0.8701171875),
 (0, 0, 255, 0.86625),
 (0, 4, 255, 0.8623828125),
 (0, 8, 255, 0.858515625),
 (0, 12, 255, 0.8546484375),
 (0, 16, 255, 0.85

In [25]:
# Min and max lat-lon for whole dataset
lon_min, lon_max = df['longitude'].values.min(), df['longitude'].values.max()
lat_min, lat_max = df['latitude'].values.min(), df['latitude'].values.max()

# Map Sandy related Twitter activity on Google Maps
# Define our longitude and latitude points
# Here we use only power outage related data
filtered_lon_vals, filtered_lat_vals = df_filt['longitude'].values, df_filt['latitude'].values

weights = 1.0/df_filt['county_tweet_count'].values

data = list(df_filt[['latitude', 'longitude']].apply(tuple, axis=1))
m = gmaps.Map()
heatmap_layer = gmaps.Heatmap(data=data)

#heatmap_layer.max_intensity = 10
heatmap_layer.point_radius = 20

m.add_layer(heatmap_layer)
#heatmap_layer.gradient = [
#    (255, 255, 255, 0.01),
#    (255, 165, 0, 0.4),
#    (255, 69, 0, 0.6),
#    (255, 0, 0, 0.8)
#]
heatmap_layer.gradient = rgba
m

In [33]:
print (heatmap_layer.max_intensity)

None


In [31]:
# Min and max lat-lon for whole dataset
lon_min, lon_max = df['longitude'].values.min(), df['longitude'].values.max()
lat_min, lat_max = df['latitude'].values.min(), df['latitude'].values.max()

# Map Sandy related Twitter activity on Google Maps
# Define our longitude and latitude points
# Here we use only power outage related data
filtered_lats, filtered_lons = df_filt['latitude'].values, df_filt['longitude'].values

weights = 1.0/df_filt['county_tweet_count'].values
weights *= 100/max(weights)

df_temp = pd.DataFrame({'lat': filtered_lats, 'lon': filtered_lons, 'wt': weights})
df_temp.head()

data = list(df_temp.apply(tuple, axis=1))
m_weighted = gmaps.Map()
heatmap_layer = gmaps.WeightedHeatmap(data=data)
#heatmap_layer.max_intensity = 10
heatmap_layer.point_radius = 20
m_weighted.add_layer(heatmap_layer)
m_weighted