In [93]:
hourly_query="""match (t:Tweet_{0}) \
where exists(t.mentions_Trump) \
or exists(t.mentions_Clinton) \
or exists(t.mentions_Sanders) \
or exists(t.mentions_Cruz) \
or exists(t.mentions_Rubio) \
or exists(t.mentions_Kasich) \
or exists(t.mentions_Omally) \
or exists(t.mentions_Carson) \
or exists(t.mentions_Fiorina) \
or exists(t.mentions_Huckabee) \
or exists(t.mentions_Paul) \
or exists(t.mentions_Christie) \
or exists(t.mentions_Bush) \
or exists(t.topic) \
WITH t.mentions_Trump as Trump, \
t.mentions_Clinton as Clinton, \
t.mentions_Sanders as Sanders, \
t.mentions_Cruz as Cruz, \
t.mentions_Rubio as Rubio, \
t.mentions_Kasich as Kasich, \
t.mentions_Omally as Omally, \
t.mentions_Christie as Christie, \
t.mentions_Paul as Paul, \
t.mentions_Carson as Carson, \
t.mentions_Fiorina as Fiorina, \
t.mentions_Huckabee as Huckabee, \
t.mentions_Bush as Bush, \
t.topic as topic, \
(toInt(t.timestamp_int) / 3600000) as hour_group, count(t) as frequency, \
MIN(t.timestamp_int) as mints \
RETURN mints, Trump, Clinton, Sanders, Cruz, Rubio, \
Kasich, Omally, Christie, Paul, Carson, Fiorina, Huckabee, Bush, topic, frequency \
order by frequency desc"""

hourly_header=['year', 'month', 'day', 'hour', 'mints',\
               'Trump','Clinton','Sanders','Cruz','Rubio',\
               'Kasich', 'Omally', 'Christie', 'Paul', 'Carson', 'Fiorina', 'Huckabee', 'Bush',\
               'topic','frequency']




In [94]:
partitions = ['2015_10_11', '2015_10_12', '2015_10_13', '2015_10_14', '2015_10_15',\
              '2016_1_26', '2016_1_27', '2016_1_28', '2016_1_29', '2016_1_30', '2016_2_2', '2016_2_3', \
              '2016_2_4', '2016_2_5', '2016_2_6', '2016_2_7', '2016_2_8', '2016_2_9', '2016_2_10',\
              '2016_2_11', '2016_2_12', '2016_2_13', '2016_2_14', '2016_2_15', \
              '2016_2_23', '2016_2_24', '2016_2_25', '2016_2_26', '2016_2_27', \
              '2016_3_1', '2016_3_2','2016_3_3', '2016_3_4', '2016_3_5', '2016_3_6', '2016_3_7', \
              '2016_3_8', '2016_3_9', '2016_3_10', '2016_3_11']


In [95]:
from datetime import datetime

# Append the day and hour adjusted to EST (UTC-5)
# convert ms to s for python
def converttime(ts):
    ts_adjust = (ts/1000) - (3600*5)
    dt = datetime.utcfromtimestamp(ts_adjust)
    return dt.year, dt.month, dt.day, dt.hour

# When an item is retweeted, we keep a copy of the original tweet (based on retweet_status).
# This copy might be from prior to our day.  we need to remove this data from the final result.
def keepRetweetedItem(partition, ts):
    ts_adjust = ts / 1000
    dt = datetime.utcfromtimestamp(ts_adjust)
    year, month, day = partition.split('_')
    return ((int(year)==dt.year) and (int(month)==dt.month) and (int(day)==dt.day))

print converttime(1444607999000)
print keepRetweetedItem('2015_10_11',1444607999000)
print keepRetweetedItem('2015_10_11',1428188930000)



(2015, 10, 11, 18)
True
False


In [96]:
import time
import logging
import csv
from neo4jreader import neo4j_reader

#set up logging
logger = logging.getLogger('ResultsGroupedByCandidateTopic2')
hdlr = logging.FileHandler('./results2_tagging.log')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.INFO)

reader = neo4j_reader('local',logger)

with open('hourly_results.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(hourly_header)
    
    for partition in partitions:

        start = time.time()

        formatted_query = hourly_query.format(partition)
        #print formatted_query

        for record in reader.runquery(formatted_query):
            outrecord = []
            for i in range(len(record)):
                outrecord.append(record[i])

            mints = int(outrecord[0])
                
            #some of the neutral items were labeled true instead of neu, correct these.
            #TODO: correct these in the database.
            for n in range(1,14):
                if ("true" in str(outrecord[n]).lower()):
                    outrecord[n]="neu"

            #convert time from UTC to EST and add day/hour information to output
            year, month, day, hour = converttime(mints)
            outrecord = [year, month, day, hour] + outrecord

            csvwriter.writerow(outrecord)  

        print partition, time.time() - start


        

2015_10_11 1.88331580162
2015_10_12 10.3455529213
2015_10_13 13.5996718407
2015_10_14 24.5881140232
2015_10_15 3.78546714783
2016_1_26 21.4532818794
2016_1_27 23.9675559998
2016_1_28 22.2140920162
2016_1_29 25.5507919788
2016_1_30 19.1482348442
2016_2_2 34.2794377804
2016_2_3 24.9862120152
2016_2_4 24.7666580677
2016_2_5 25.0177669525
2016_2_6 19.1715829372
2016_2_7 27.1885039806
2016_2_8 20.0513401031
2016_2_9 24.7409489155
2016_2_10 32.201346159
2016_2_11 23.5436930656
2016_2_12 26.7075150013
2016_2_13 20.6301300526
2016_2_14 27.6933429241
2016_2_15 20.5767540932
2016_2_23 25.4139909744
2016_2_24 32.607874155
2016_2_25 30.0303099155
2016_2_26 36.9787230492
2016_2_27 31.3748011589
2016_3_1 36.8130772114
2016_3_2 44.0164160728
2016_3_3 38.6327681541
2016_3_4 39.6288151741
2016_3_5 31.7046039104
2016_3_6 32.2526547909
2016_3_7 22.782335043
2016_3_8 30.5166270733
2016_3_9 33.8757359982
2016_3_10 28.2182650566
2016_3_11 4.74613690376


In [103]:
import pandas as pd

#verify counts by hour
df = pd.read_csv('hourly_results.csv')
df

Unnamed: 0,year,month,day,hour,mints,Trump,Clinton,Sanders,Cruz,Rubio,Kasich,Omally,Christie,Paul,Carson,Fiorina,Huckabee,Bush,topic,frequency
0,2015,10,11,18,1444604401832,neg,,,,,,,,,,,,,,6566
1,2015,10,11,18,1444604400152,,neu,,,,,,,,,,,,,3791
2,2015,10,11,18,1444604401312,neu,,,,,,,,,,,,,,3353
3,2015,10,11,17,1444600800000,neu,,,,,,,,,,,,,,2292
4,2015,10,11,17,1444600811000,,neu,,,,,,,,,,,,,1842
5,2015,10,11,18,1444604400679,,,neu,,,,,,,,,,,,1532
6,2015,10,11,18,1444604403653,pos,,,,,,,,,,,,,,1248
7,2015,10,11,17,1444600816000,,,neu,,,,,,,,,,,,1067
8,2015,10,11,18,1444604402717,,neg,,,,,,,,,,,,,1048
9,2015,10,11,18,1444604400625,,,,,,,,,,neu,,,,,1015


In [98]:
df_filtered = df[['year','month','day','topic','frequency']].dropna(subset=['topic'])
print df_filtered.head()

df2 = df_filtered['frequency'].groupby([df_filtered['year'],df_filtered['month'],df_filtered['day']]).sum().reset_index()
df2.head()

     year  month  day               topic  frequency
50   2015     10   11            abortion         31
72   2015     10   11            benghazi         21
75   2015     10   11            abortion         20
85   2015     10   11  black lives matter         18
111  2015     10   11            benghazi         13


Unnamed: 0,year,month,day,frequency
0,2006,3,21,1
1,2006,3,30,1
2,2006,9,8,1
3,2008,2,9,1
4,2008,2,21,1


In [99]:

total = df2['frequency'].sum()
print total

# with throwaway
# print total + 479103 #9172444

8930876


In [100]:
# replace NaN with 'none' otherwise the unpivot will exclude rows where anything is NaN


In [115]:
#add extra columns to support total counts for topics and sentiment
df['count_topic']=df['topic']

df['count_sentiment']=-((df['Trump'].isnull()) & (df['Clinton'].isnull()) & (df['Sanders'].isnull()) &\
                       (df['Cruz'].isnull()) & (df['Rubio'].isnull()) & (df['Kasich'].isnull()) & (df['Omally'].isnull()) &\
                        (df['Christie'].isnull()) & (df['Paul'].isnull()) & (df['Carson'].isnull()) &  \
                        (df['Fiorina'].isnull()) &  (df['Huckabee'].isnull()) & (df['Bush'].isnull()))
df.reset_index(inplace=True)

In [116]:
#unpivot the data

df5 = pd.melt(df, id_vars=['year','month','day','hour','topic','frequency'], \
    value_vars=['count_topic','count_sentiment'], \
    var_name='candidate_or_topic', value_name='sentiment_or_topic')
df5

#value_vars=['Trump','Clinton','Sanders','Cruz','Rubio','count_topic','count_sentiment'], \

Unnamed: 0,year,month,day,hour,topic,frequency,candidate_or_topic,sentiment_or_topic
0,2015,10,11,18,,6566,count_topic,
1,2015,10,11,18,,3791,count_topic,
2,2015,10,11,18,,3353,count_topic,
3,2015,10,11,17,,2292,count_topic,
4,2015,10,11,17,,1842,count_topic,
5,2015,10,11,18,,1532,count_topic,
6,2015,10,11,18,,1248,count_topic,
7,2015,10,11,17,,1067,count_topic,
8,2015,10,11,18,,1048,count_topic,
9,2015,10,11,18,,1015,count_topic,


In [117]:
df5a = df5.dropna(subset=['sentiment_or_topic'])
print df5a.head()

df5a[df5a['candidate_or_topic']=='count_topic']

     year  month  day  hour               topic  frequency candidate_or_topic  \
50   2015     10   11    18            abortion         31        count_topic   
72   2015     10   11    18            benghazi         21        count_topic   
75   2015     10   11    17            abortion         20        count_topic   
85   2015     10   11    18  black lives matter         18        count_topic   
111  2015     10   11    17            benghazi         13        count_topic   

     sentiment_or_topic  
50             abortion  
72             benghazi  
75             abortion  
85   black lives matter  
111            benghazi  


Unnamed: 0,year,month,day,hour,topic,frequency,candidate_or_topic,sentiment_or_topic
50,2015,10,11,18,abortion,31,count_topic,abortion
72,2015,10,11,18,benghazi,21,count_topic,benghazi
75,2015,10,11,17,abortion,20,count_topic,abortion
85,2015,10,11,18,black lives matter,18,count_topic,black lives matter
111,2015,10,11,17,benghazi,13,count_topic,benghazi
112,2015,10,11,18,women's issues (not abortion though),13,count_topic,women's issues (not abortion though)
114,2015,10,11,17,billclinton,12,count_topic,billclinton
121,2015,10,11,17,benghazi,11,count_topic,benghazi
125,2015,10,11,18,obama,11,count_topic,obama
127,2015,10,11,17,black lives matter,11,count_topic,black lives matter


In [120]:
df5a.to_csv('hourly_results_unpivoted.csv')

In [119]:
df5a[df5a['candidate_or_topic']=='count_sentiment']['frequency'].sum()

64812597