In [1]:
import pandas as pd
import numpy as np
from scipy import stats

## Load Data

In [2]:
def load_tweets(tweets_attributes_file, tweets_text_file):
    """Load tweet attributes and attach the matching tweet text.

    Parameters
    ----------
    tweets_attributes_file : str
        Path to a CSV file of tweet attributes; its first column is used as
        the DataFrame index.
    tweets_text_file : str
        Path to a UTF-8 text file holding one tweet per line, in the same
        order as the rows of the attributes file.

    Returns
    -------
    pandas.DataFrame
        The attributes with an added ``text`` column.

    Raises
    ------
    ValueError
        If the number of text lines differs from the number of attribute rows.
    """
    df = pd.read_csv(tweets_attributes_file, index_col=0)

    # Read with an explicit encoding so the result does not depend on the
    # platform default, and drop the line terminator so it does not end up
    # inside the tweet text.
    with open(tweets_text_file, 'r', encoding='utf-8') as f:
        text = [line.rstrip('\n') for line in f]

    # Fail early with a message that names both inputs; a silent misalignment
    # would pair tweets with the wrong attributes.
    if len(text) != len(df):
        raise ValueError(
            'Row count mismatch: {} attribute rows in {!r} but {} text lines in {!r}'
            .format(len(df), tweets_attributes_file, len(text), tweets_text_file))

    df['text'] = text
    return df

In [3]:
# Input files: per-tweet attributes (CSV) and the tokenized tweet text
# (one tweet per line).
tweets_attributes_file = 'sandy_tweets_attributes_rev_geocoded_formatted_timestamps.csv'
tweets_text_file = 'sandy_tweets_text_tokenized.txt'

# Build the working frame: attributes plus a `text` column.
df = load_tweets(tweets_attributes_file=tweets_attributes_file,
                 tweets_text_file=tweets_text_file)
df.head(2)

Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,county_tweet_count,sentiment,text
0,260244125050363904,295902181,0,2012-10-22 05:00:09,-74.078101,40.735218,New Jersey,Hudson County,52264,1,all i wish is to be better than yesterday and ...
1,260244177412042752,85314436,0,2012-10-22 05:00:21,-81.50579,33.460462,South Carolina,Barnwell County,1375,0,@imSunnyAF yesssss lawd\n


In [4]:
# Replace missing county names with an empty string so the column holds
# only strings.
df['county'] = df['county'].fillna('')

In [5]:
# Split every tweet on whitespace and keep the tokens that start with '#'
# and have at least one more character (a lone '#' is skipped). The
# resulting list is stored per tweet in a new `hashtags` column.
# Approach from:
# https://marcobonzanini.com/2015/03/17/mining-twitter-data-with-python-part-3-term-frequencies/
df['hashtags'] = df['text'].map(
    lambda tweet: [
        token
        for token in tweet.split()
        if len(token) > 1 and token.startswith('#')
    ]
)
df.head(2)

Unnamed: 0,tweet_id,user_id,retweet_count,time_stamp,longitude,latitude,state,county,county_tweet_count,sentiment,text,hashtags
0,260244125050363904,295902181,0,2012-10-22 05:00:09,-74.078101,40.735218,New Jersey,Hudson County,52264,1,all i wish is to be better than yesterday and ...,[]
1,260244177412042752,85314436,0,2012-10-22 05:00:21,-81.50579,33.460462,South Carolina,Barnwell County,1375,0,@imSunnyAF yesssss lawd\n,[]


In [6]:
from collections import Counter

# Flatten the per-tweet hashtag lists into one long list and count how
# often each hashtag occurs (counting is case-sensitive).
hashtags = []
for tweet_hashtags in df['hashtags']:
    hashtags.extend(tweet_hashtags)
hashtag_counts = Counter(hashtags)

# Tabulate the 20 most frequent hashtags, most frequent first.
top_hashtag_pairs = hashtag_counts.most_common(20)
df_hashtag_counts = pd.DataFrame(top_hashtag_pairs, columns=['hashtags', 'count'])
df_hashtag_counts

Unnamed: 0,hashtags,count
0,#Sandy,15257
1,#sandy,13094
2,#oomf,8243
3,#HurricaneSandy,3973
4,#hurricanesandy,3710
5,#halloween,3670
6,#nyc,3368
7,#100ThingsAboutMe,3002
8,#Halloween,2842
9,#jobs,2565


In [7]:
from bokeh.io import output_notebook, show
output_notebook()

from bokeh.charts import Bar, output_file#, show
from bokeh.charts.attributes import cat, color
from bokeh.charts.operations import blend
from bokeh.charts.attributes import ColorAttr, CatAttr

# Bar chart of the most common hashtags: one bar per row of
# df_hashtag_counts, with bar height taken from the `count` column.
# NOTE(review): this uses the legacy `bokeh.charts` API, which later bokeh
# releases moved out into the separate `bkcharts` package -- confirm the
# installed bokeh version still provides it.
#bar = Bar(df_monogram_tokens, 'monograms', values='count', title="test chart")
# sort=False turns off Bar's sorting by category axis labels, see
# https://github.com/bokeh/bokeh/issues/2924
bar = Bar(df_hashtag_counts, values='count', label=CatAttr(columns=['hashtags'], sort=False),)
#output_file("most_common_hashtags_bokeh.html")

show(bar)

In [8]:
# Create a new dataframe containing only the tweets whose hashtags ALL
# belong to the most common hashtags (tweets without hashtags are excluded).
common_hashtags = df_hashtag_counts['hashtags'].tolist()
common_hashtag_set = set(common_hashtags)  # built once, not once per tweet

has_only_common_hashtags = df['hashtags'].apply(
    lambda tags: bool(tags) and set(tags).issubset(common_hashtag_set)
).astype(bool)

# Select rows and columns in a single .loc call and take an explicit copy,
# so the assignment below writes to an independent frame instead of a
# possible view of `df` (avoids SettingWithCopyWarning). `df` itself is
# left untouched, so no temporary flag column has to be added and dropped.
df_filt = df.loc[has_only_common_hashtags, ['time_stamp', 'hashtags']].copy()

# Convert the hashtag lists to tuples (immutable and hashable).
df_filt['hashtags'] = df_filt['hashtags'].apply(tuple)
print(len(df_filt))
df_filt.head()

41471


Unnamed: 0,time_stamp,hashtags
39,2012-10-22 05:00:03,"(#oomf,)"
696,2012-10-22 05:02:55,"(#sorrynotsorry,)"
1334,2012-10-22 05:06:05,"(#oomf,)"
1536,2012-10-22 05:07:11,"(#sorrynotsorry,)"
1909,2012-10-22 05:09:00,"(#oomf,)"


In [None]:
#https://github.com/pandas-dev/pandas/issues/10511
    
def listify(df, column):
    """Collapse rows that differ only in ``column`` into one row holding a list.

    Rows are grouped by the index level(s) and by every column other than
    ``column``; within each group the values of ``column`` are collected
    into a list. This is the inverse of ``unlistify``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. The index may be unnamed or have several levels.
    column : hashable
        Label of the column to collapse; must occur exactly once in
        ``df.columns``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the same column order and index names as
        ``df``. Rows are ordered by the group keys.

    Raises
    ------
    ValueError
        If ``column`` is missing from, or duplicated in, ``df.columns``.
    """
    matches = [i for i, n in enumerate(df.columns) if n == column]
    if len(matches) == 0:
        raise ValueError('Failed to find column named ' + str(column) + '!')
    if len(matches) > 1:
        raise ValueError('More than one column named ' + str(column) + '!')

    # Use the number of index levels, not len(index.shape): the shape of an
    # index is always 1-dimensional, even for a MultiIndex.
    old_index_names = list(df.index.names)
    n_levels = df.index.nlevels

    # Take the labels from the flattened frame itself so that unnamed index
    # levels get the placeholder names reset_index() assigned to them.
    flat = df.reset_index()
    flat_columns = list(flat.columns)
    index_cols = flat_columns[:n_levels]
    gb_cols = [c for c in flat_columns if c != column]

    squashed = (flat
                .groupby(gb_cols)[column]
                .apply(list)
                .reset_index())

    # Restore the original column order and index, including unnamed levels.
    result = squashed[flat_columns].set_index(index_cols)
    result.index.names = old_index_names
    return result

def unlistify(df, column):
    """Expand a column of iterables so that every element gets its own row.

    Each row is repeated once per element of its ``column`` cell, with that
    cell replaced by the single element. Rows whose cell is empty produce
    no output rows. This is the inverse of ``listify``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. The index may be unnamed or have several levels and is
        carried over unchanged (repeated along with the rows).
    column : hashable
        Label of the column to expand; must occur exactly once in
        ``df.columns``. Every cell must be iterable (note that a string
        cell is expanded character by character).

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, column order and index names as
        ``df``.

    Raises
    ------
    ValueError
        If ``column`` is missing from, or duplicated in, ``df.columns``.
    """
    matches = [i for i, n in enumerate(df.columns) if n == column]
    if len(matches) == 0:
        raise ValueError('Failed to find column named ' + str(column) + '!')
    if len(matches) > 1:
        raise ValueError('More than one column named ' + str(column) + '!')
    col_idx = matches[0]

    # Materialise every cell once so that any iterable is supported and its
    # length is known.
    cell_values = [list(cell) for cell in df.iloc[:, col_idx]]

    # Explicit integer dtype: an empty list would otherwise become a float
    # array, which np.repeat refuses to use as repeat counts.
    repeats = np.asarray([len(values) for values in cell_values], dtype=int)

    # Repeat rows by position rather than by label so that duplicate index
    # labels are handled correctly.
    row_positions = np.repeat(np.arange(len(df)), repeats)
    result = df.iloc[row_positions].copy()

    # A plain list is assigned positionally, in the same row order as above.
    result[column] = [item for values in cell_values for item in values]
    return result


# Examples of how to listify and unlistify a column.
#df_test = pd.DataFrame([[11,range(5),10],
#                   [22,range(3),20]],
#                   columns = ['A','B','C']).set_index('C')
#print ('org')
#print (df_test)
#print ('--')
#df_test = unlistify(df_test,'B')
#print ('unlistify(df_test,B)')
#print (df_test)
#print ('--')
#df_test = listify(df_test,'B')
#print ('listify(df_test,B)')
#print (df_test)

In [None]:
# Expand the hashtag tuples so that every (tweet, hashtag) pair is one row.
# NOTE: this cell overwrites its own input and is therefore not idempotent:
# running it a second time would iterate over the already-expanded hashtag
# strings character by character. Re-run the filtering cell first.
df_filt = unlistify(df_filt, 'hashtags')

In [None]:
# Extract the date (the part of the timestamp string before the first
# whitespace) and save it as a new column. The vectorised .str accessor
# replaces a per-row Python lambda and yields NaN instead of raising for
# missing timestamps.
df_filt['date'] = df_filt['time_stamp'].str.split().str[0]

In [None]:
# Preview the number of rows for every (date, hashtag) combination.
df_filt.groupby(['date', 'hashtags']).size()

In [None]:
# Count tweets per (date, hashtag) pair in long format: one row per pair,
# with the count in `Group_Count`. reset_index(name=...) already returns a
# DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
grouped_df = df_filt.groupby(['date', 'hashtags'])
df_heatmap = grouped_df.size().reset_index(name="Group_Count")
df_heatmap.head()

In [None]:
# http://bokeh.pydata.org/en/latest/docs/gallery/heatmap_chart.html

from bokeh.io import output_notebook, show
output_notebook()
from bokeh.charts import HeatMap, bins, output_file, show
#from bokeh.layouts import column, gridplot
from bokeh.palettes import RdYlGn6, RdYlGn9

In [None]:
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, LogColorMapper
from bokeh.plotting import figure
from math import pi

In [None]:
##http://stackoverflow.com/questions/33596491/extract-matplotlib-colormap-in-hex-format
#
#from matplotlib import cm, colors
#cmap = cm.get_cmap('Oranges', 10)    # PiYG
#
#for i in range(cmap.N):
#    rgb = cmap(i)[:3] # will return rgba, we take only first 3 so we get rgb
#    print '"%s",' % colors.rgb2hex(rgb), 

In [None]:
# Heat map of hashtag usage: dates on the x axis, hashtags on the y axis,
# cell colour driven by the number of tweets for that (date, hashtag) pair.
output_file("sandy_hashtag_heatmap_1.html", title="Hashtags in the Sandy corpus")

# this is the colormap from the original NYTimes plot
#colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
# Ten-step light-to-dark orange palette.
colors = ["#fff5eb", "#fee8d1", "#fdd5ac", "#fdb97d", "#fd9c51", "#f87d2a", "#e95e0d", "#ce4401", "#a23403", "#7f2704"]
# NOTE(review): the counts below contain zeros (from fillna(0)); confirm how
# the installed bokeh version's LogColorMapper colours zero values.
mapper = LogColorMapper(palette=colors)
#mapper = LinearColorMapper(palette=colors)

# Wide matrix: one row per date, one column per hashtag; combinations that
# never occur get a count of 0.
df_heatmap_mat = df_heatmap.pivot(index='date', columns='hashtags', values='Group_Count')
df_heatmap_mat = df_heatmap_mat.fillna(0)

heatmap_dates = list(df_heatmap_mat.index)
heatmap_hashtags = list(df_heatmap_mat.columns)

# Flatten the matrix row by row (date-major) into three parallel lists.
# The three expressions below all use the same order: every hashtag for
# the first date, then every hashtag for the second date, and so on.
date = [d for d in heatmap_dates for _ in heatmap_hashtags]
hashtag = heatmap_hashtags * len(heatmap_dates)
count = df_heatmap_mat.values.ravel().tolist()

source = ColumnDataSource(data=dict(date=date, hashtag=hashtag, count=count))

TOOLS = "hover,save,pan,box_zoom,wheel_zoom"
p = figure(#title="Hashtags",
           y_range=heatmap_hashtags, x_range=heatmap_dates,
           x_axis_location="below", plot_width=900, plot_height=400,
           tools=TOOLS)

# Strip grid lines, axis lines and ticks so only the coloured cells remain.
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "10pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3  # rotate the date labels


# One unit-sized rectangle per (date, hashtag) cell, coloured by its count.
p.rect(y="hashtag", x="date", width=1, height=1,
       source=source,
       fill_color={'field': 'count', 'transform': mapper},
       line_color=None)

p.select_one(HoverTool).tooltips = [
    ('date', '@date'),
    ('hashtag', '@hashtag'),
    ('count', '@count'),
]

show(p)