### Create Visualization Data Time Series

In [59]:
import pandas as pd

# Load the hourly results that were grouped in Neo4j and exported to CSV
# (see the workbook for ResultsGroupedByCandidateTopic).

df = pd.read_csv("./hourly_results.csv")
# Preview the first rows to check that the file loaded as expected.
df.head()

Unnamed: 0,day,hour,Trump,Clinton,Sanders,Cruz,Rubio,topic,frequency,users,mints,maxts
0,2015_10_11,19,neu,,,,,,1,1,1352229456000,1352229456000
1,2015_10_11,1,,neu,,,,,1,1,1360374148000,1360374148000
2,2015_10_11,1,,neg,,,,,1,1,1362445434000,1362445434000
3,2015_10_11,18,,neu,,,,,1,1,1363112126000,1363112126000
4,2015_10_11,5,,pos,,,,,1,1,1372224919000,1372224919000


In [60]:
from datetime import datetime

# Append the day and hour adjusted to EST (UTC-5)
# convert ms to s for python
def converttime(ts, utc_offset_hours=-5):
    """Convert a millisecond epoch timestamp to a naive, offset-shifted datetime.

    Parameters
    ----------
    ts : int or float
        Timestamp in milliseconds since the Unix epoch (UTC).
    utc_offset_hours : int or float, optional
        Fixed offset from UTC, in hours, applied to the result. The default
        of -5 keeps the original behaviour (EST, UTC-5).

    Returns
    -------
    datetime.datetime
        Naive datetime (no tzinfo) representing the shifted wall-clock time.

    NOTE(review): a fixed offset does not follow daylight saving time.
    Confirm that UTC-5 is what is wanted for the date range being analysed,
    or pass a different ``utc_offset_hours``.
    """
    # Python's datetime works in seconds, the data is in milliseconds.
    seconds_utc = ts / 1000
    # Shift the instant by the requested offset before decoding it as UTC so
    # that the resulting fields (day, hour, ...) read as local wall-clock time.
    ts_adjust = seconds_utc + (3600 * utc_offset_hours)
    return datetime.utcfromtimestamp(ts_adjust)

def convertday(ts):
    """Return the day of the month for millisecond timestamp ``ts``.

    The value is read from the datetime produced by ``converttime``, so it
    reflects that helper's offset adjustment.
    """
    return converttime(ts).day

def convertmonth(ts):
    """Return the month number (1-12) for millisecond timestamp ``ts``.

    The value is read from the datetime produced by ``converttime``, so it
    reflects that helper's offset adjustment.
    """
    return converttime(ts).month

def convertyear(ts):
    """Return the four-digit year for millisecond timestamp ``ts``.

    The value is read from the datetime produced by ``converttime``, so it
    reflects that helper's offset adjustment.
    """
    return converttime(ts).year

def converthour(ts):
    """Return the hour of day (0-23) for millisecond timestamp ``ts``.

    The value is read from the datetime produced by ``converttime``, so it
    reflects that helper's offset adjustment.
    """
    return converttime(ts).hour

def keepRetweetedItem(partition, ts):
    """Tell whether timestamp ``ts`` falls on the day named by ``partition``.

    When an item is retweeted, a copy of the original tweet is kept (based on
    retweet_status). That copy may predate the day being processed, so rows
    for which this returns False should be removed from the final result.

    Parameters
    ----------
    partition : str
        Day label of the form ``'YYYY_MM_DD'`` (three '_'-separated integers).
    ts : int or float
        Timestamp in milliseconds since the Unix epoch.

    Returns
    -------
    bool
        True when the UTC calendar date of ``ts`` equals the partition date.
        No timezone offset is applied here, unlike in ``converttime``.
    """
    stamp = datetime.utcfromtimestamp(ts / 1000)
    year, month, day = partition.split('_')
    # Compare the coarsest field first and bail out on the first mismatch.
    if int(year) != stamp.year:
        return False
    if int(month) != stamp.month:
        return False
    return int(day) == stamp.day

# Spot-check the helpers against a known timestamp
# (1444607999000 ms is 2015-10-11 23:59:59 UTC).
# The parenthesised single-argument form prints the same text under the
# Python 2 print statement and the Python 3 print function.
print(converttime(1444607999000))
print(convertday(1444607999000))
print(convertmonth(1444607999000))
print(convertyear(1444607999000))
print(converthour(1444607999000))
# The first timestamp lies on the partition day, the second does not.
print(keepRetweetedItem('2015_10_11', 1444607999000))
print(keepRetweetedItem('2015_10_11', 1428188930000))


2015-10-11 18:59:59
11
10
2015
18
True
False


In [61]:
# Derive calendar fields from each row's 'mints' timestamp
# (presumably the earliest timestamp in the group - confirm against the export).
# Note that 'hour' overwrites the hour column that was read from the CSV.
derived_fields = (
    ('x_day', convertday),
    ('month', convertmonth),
    ('year', convertyear),
    ('hour', converthour),
)
for field_name, extract in derived_fields:
    df[field_name] = df['mints'].apply(extract)
df.head()

Unnamed: 0,day,hour,Trump,Clinton,Sanders,Cruz,Rubio,topic,frequency,users,mints,maxts,x_day,month,year
0,2015_10_11,14,neu,,,,,,1,1,1352229456000,1352229456000,6,11,2012
1,2015_10_11,20,,neu,,,,,1,1,1360374148000,1360374148000,8,2,2013
2,2015_10_11,20,,neg,,,,,1,1,1362445434000,1362445434000,4,3,2013
3,2015_10_11,13,,neu,,,,,1,1,1363112126000,1363112126000,12,3,2013
4,2015_10_11,0,,pos,,,,,1,1,1372224919000,1372224919000,26,6,2013


In [62]:
# Drop retweeted originals whose timestamp is not on the row's partition day.
# Build the row-wise boolean mask first, then keep only the matching rows.
on_partition_day = df.apply(
    lambda row: keepRetweetedItem(row['day'], row['mints']),
    axis=1
)
df = df[on_partition_day]

In [63]:
# Preview the rows that remain after the prior-day retweet filter.
df.head()

Unnamed: 0,day,hour,Trump,Clinton,Sanders,Cruz,Rubio,topic,frequency,users,mints,maxts,x_day,month,year
934,2015_10_11,19,,,,,,obama,13,1,1444522379000,1444522379000,10,10,2015
935,2015_10_11,19,,neu,,,,black lives matter,11,2,1444522724000,1444522804000,10,10,2015
936,2015_10_11,19,,neu,,,,,10,3,1444522402000,1444524909000,10,10,2015
937,2015_10_11,19,neu,,,,,,9,5,1444521727000,1444524664000,10,10,2015
938,2015_10_11,19,pos,,,,,,4,3,1444521809000,1444523107000,10,10,2015


In [64]:
# Unpivot the per-candidate columns: every input row becomes one row per
# candidate, with the candidate name in 'candidate' and that candidate's
# value in 'sentiment'. Columns not listed in id_vars are left out.
df2 = pd.melt(
    df,
    id_vars=['x_day', 'month', 'year', 'hour', 'topic', 'frequency', 'mints'],
    value_vars=['Trump', 'Clinton', 'Sanders', 'Cruz', 'Rubio'],
    var_name='candidate',
    value_name='sentiment'
)



In [65]:
# Preview the unpivoted frame; 'sentiment' is empty where a row had no value for that candidate.
df2.head()

Unnamed: 0,x_day,month,year,hour,topic,frequency,mints,candidate,sentiment
0,10,10,2015,19,obama,13,1444522379000,Trump,
1,10,10,2015,19,black lives matter,11,1444522724000,Trump,
2,10,10,2015,19,,10,1444522402000,Trump,
3,10,10,2015,19,,9,1444521727000,Trump,neu
4,10,10,2015,19,,4,1444521809000,Trump,pos


In [66]:
# Drop rows where sentiment is not available, i.e. the (row, candidate)
# pairs produced by the melt for which the candidate column was empty.
# This modifies df2 in place.
df2.dropna(subset=['sentiment'], inplace=True)

In [67]:
# Preview the frame after rows without a sentiment value were removed.
df2.head()

Unnamed: 0,x_day,month,year,hour,topic,frequency,mints,candidate,sentiment
3,10,10,2015,19,,9,1444521727000,Trump,neu
4,10,10,2015,19,,4,1444521809000,Trump,pos
18,10,10,2015,19,,1,1444524483000,Trump,neu
24,10,10,2015,20,,7,1444525694000,Trump,neu
34,10,10,2015,20,,1,1444528680000,Trump,pos


In [68]:
# Order the rows chronologically, then by candidate, topic and sentiment.
sort_keys = ['year', 'month', 'x_day', 'hour', 'candidate', 'topic', 'sentiment']
# DataFrame.sort is deprecated and absent from later pandas releases in
# favour of sort_values; use sort_values when the installed pandas has it
# and fall back to the legacy method otherwise.
if hasattr(df2, 'sort_values'):
    df2.sort_values(sort_keys, inplace=True)
else:
    df2.sort(sort_keys, inplace=True)

In [69]:
# Write the cleaned, sorted long-format data to CSV; index=False leaves the row index out of the file.
df2.to_csv('scrubbed_results2.csv', index=False)

In [70]:
# Sanity check: this total can be higher than the pre-melt total because a
# row that references several candidates contributes its frequency once per
# candidate after the unpivot.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df2['frequency'].sum())



100763325


In [71]:
# Total frequency of the filtered, pre-melt frame, for comparison with the
# melted total. print(...) with a single argument behaves the same on
# Python 2 and 3.
print(df['frequency'].sum())

86051876


In [None]:
# FIXME: unfinished cell (it has never been executed). groupby() is called
# without any grouping keys and no aggregation follows.
# NOTE(review): pandas is expected to reject a groupby with neither `by` nor
# `level` - confirm, then supply the intended keys and aggregation.
df3 = df2.groupby()