In [5]:
hourly_query="""
match (t:Tweet_{0})<-[:POSTS]-(u:UserSupports{1}_{0}) \
where exists(t.mentions_Trump) \
or exists(t.mentions_Clinton) \
or exists(t.mentions_Sanders) \
or exists(t.mentions_Cruz) \
or exists(t.mentions_Rubio) \
or exists(t.topic)
WITH t.mentions_Trump as Trump, \
t.mentions_Clinton as Clinton, \
t.mentions_Sanders as Sanders, \
t.mentions_Cruz as Cruz, \
t.mentions_Rubio as Rubio, \
t.topic as topic, \
u.screen_name as user, "{1}" as supports, (toInt(t.timestamp_int) / 3600000) as hour_group, count(t) as frequency, \
MIN(t.timestamp_int) as mints \
RETURN user, supports, mints, Trump, Clinton, Sanders, Cruz, Rubio, \
topic, frequency \
order by frequency desc"""

hourly_header=['year', 'month', 'day', 'hour', 'user', 'supports', 'mints',\
               'Trump','Clinton','Sanders','Cruz','Rubio',\
               'topic','frequency']

candidates = ['Trump','Clinton','Sanders','Cruz','Rubio']


In [6]:
partitions = ['2015_10_11', '2015_10_12', '2015_10_13', '2015_10_14', '2015_10_15',\
              '2016_1_26', '2016_1_27', '2016_1_28', '2016_1_29', '2016_1_30', '2016_2_2', '2016_2_3', \
              '2016_2_4', '2016_2_5', '2016_2_6', '2016_2_7', '2016_2_8', '2016_2_9', '2016_2_10',\
              '2016_2_11', '2016_2_12', '2016_2_13', '2016_2_14', '2016_2_15', \
              '2016_2_23', '2016_2_24', '2016_2_25', '2016_2_26', '2016_2_27', \
              '2016_3_1', '2016_3_2','2016_3_3', '2016_3_4', '2016_3_5', '2016_3_6', '2016_3_7', \
              '2016_3_8', '2016_3_9', '2016_3_10', '2016_3_11']


In [7]:
from datetime import datetime

# Append the day and hour adjusted to EST (UTC-5)
# convert ms to s for python
def converttime(ts):
    ts_adjust = (ts/1000) - (3600*5)
    dt = datetime.utcfromtimestamp(ts_adjust)
    return dt.year, dt.month, dt.day, dt.hour

# When an item is retweeted, we keep a copy of the original tweet (based on retweet_status).
# This copy might be from prior to our day.  we need to remove this data from the final result.
def keepRetweetedItem(partition, ts):
    ts_adjust = ts / 1000
    dt = datetime.utcfromtimestamp(ts_adjust)
    year, month, day = partition.split('_')
    return ((int(year)==dt.year) and (int(month)==dt.month) and (int(day)==dt.day))

print converttime(1444607999000)
print keepRetweetedItem('2015_10_11',1444607999000)
print keepRetweetedItem('2015_10_11',1428188930000)



(2015, 10, 11, 18)
True
False


In [8]:
import time
import logging
import csv
from neo4jreader import neo4j_reader

#set up logging
logger = logging.getLogger('ResultsGroupedByUserSupporter')
hdlr = logging.FileHandler('./supporter_results_tagging.log')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.INFO)

reader = neo4j_reader('remote',logger)

with open('hourly_supporter_results.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(hourly_header)
    
    #throwaway=0
    
    for partition in partitions:
        
        for candidate in candidates:

            start = time.time()

            formatted_query = hourly_query.format(partition,candidate)
            #print formatted_query

            for record in reader.runquery(formatted_query):
                outrecord = []
                for i in range(len(record)):
                    outrecord.append(record[i])

                mints = int(outrecord[2])

                #if (keepRetweetedItem(partition, mints)):

                #some of the neutral items were labeled true instead of neu, correct these.
                #TODO: correct these in the database.
                for n in range(3,8):
                    if ("true" in str(outrecord[n]).lower()):
                        outrecord[n]="neu"

                #convert time from UTC to EST and add day/hour information to output
                year, month, day, hour = converttime(mints)
                outrecord = [year, month, day, hour] + outrecord

                csvwriter.writerow(outrecord)  

    #             else:
    #                 throwaway = throwaway + 1


        print partition, time.time() - start
        
    #print throwaway


        

2015_10_11 0.272280931473
2015_10_12 0.369301795959
2015_10_13 0.372049808502
2015_10_14 0.417307853699
2015_10_15 0.0804789066315
2016_1_26 0.610925912857
2016_1_27 0.522617816925
2016_1_28 0.487298965454
2016_1_29 0.691138029099
2016_1_30 0.485883951187
2016_2_2 0.0600309371948
2016_2_3 0.494955062866
2016_2_4 0.516025066376
2016_2_5 0.501459121704
2016_2_6 0.427948951721
2016_2_7 0.789217948914
2016_2_8 0.595231056213
2016_2_9 0.694847822189
2016_2_10 0.867651939392
2016_2_11 0.689497947693
2016_2_12 0.704519987106
2016_2_13 0.685979127884
2016_2_14 1.13120889664
2016_2_15 0.706348896027
2016_2_23 0.92063999176
2016_2_24 1.34331297874
2016_2_25 1.05647110939
2016_2_26 2.08269810677
2016_2_27 0.0589578151703
2016_3_1 1.28401184082
2016_3_2 1.14329504967
2016_3_3 1.30351781845
2016_3_4 1.29270505905
2016_3_5 1.22899484634
2016_3_6 0.827839136124
2016_3_7 0.559630870819
2016_3_8 0.934417009354
2016_3_9 0.940730810165
2016_3_10 0.660388946533
2016_3_11 0.150551080704


In [9]:
import pandas as pd

#verify counts by day
df = pd.read_csv('hourly_supporter_results.csv')
df.head()

Unnamed: 0,year,month,day,hour,user,supports,mints,Trump,Clinton,Sanders,Cruz,Rubio,topic,frequency
0,2015,10,11,18,azblonde2015,Trump,1444604521663,pos,,,,,,21
1,2015,10,11,18,distrumption,Trump,1444604724646,pos,,,,,,18
2,2015,10,11,17,azblonde2015,Trump,1444600829000,pos,,,,,,17
3,2015,10,11,18,skyjones55,Trump,1444604477443,pos,,,,,,16
4,2015,10,11,18,chriskyleband,Trump,1444604497385,pos,,,,,,13


In [10]:
df3a = df[['year', 'month', 'day', 'hour', 'user', 'supports', \
               'Trump','Clinton','Sanders','Cruz','Rubio',\
               'topic','frequency']]



In [11]:
df3 = df3a.reset_index()
df3.fillna('none',inplace=True)
df3

Unnamed: 0,index,year,month,day,hour,user,supports,Trump,Clinton,Sanders,Cruz,Rubio,topic,frequency
0,0,2015,10,11,18,azblonde2015,Trump,pos,none,none,none,none,none,21
1,1,2015,10,11,18,distrumption,Trump,pos,none,none,none,none,none,18
2,2,2015,10,11,17,azblonde2015,Trump,pos,none,none,none,none,none,17
3,3,2015,10,11,18,skyjones55,Trump,pos,none,none,none,none,none,16
4,4,2015,10,11,18,chriskyleband,Trump,pos,none,none,none,none,none,13
5,5,2015,10,11,18,woodseysmith,Trump,pos,none,none,none,none,none,12
6,6,2015,10,11,17,machadokirk,Trump,pos,none,none,none,none,none,11
7,7,2015,10,11,17,ocamericans,Trump,pos,none,none,none,none,none,9
8,8,2015,10,11,18,thetrumpetts,Trump,pos,none,none,none,none,none,9
9,9,2015,10,11,18,saskamare,Trump,pos,none,none,none,none,none,9


In [13]:
df4 = df3.groupby([df3['year'],df3['month'],df3['day'],df3['user'],df3['supports'],df3['Trump'],\
                   df3['Clinton'],df3['Sanders'],\
            df3['Cruz'],df3['Rubio'],df3['topic']]).sum().reset_index()
df4.head()

Unnamed: 0,year,month,day,user,supports,Trump,Clinton,Sanders,Cruz,Rubio,topic,index,hour,frequency
0,2006,12,22,so_n_so,Trump,neu,none,none,none,none,none,3931923,19,1
1,2008,5,6,antoniofrench,Clinton,none,neg,none,none,none,none,4115162,21,1
2,2009,5,1,sensanders,Sanders,none,none,neu,none,none,none,1809549,45,3
3,2009,5,4,realdonaldtrump,Trump,neg,none,none,none,none,none,22608378,130,10
4,2009,5,12,realdonaldtrump,Trump,pos,none,none,none,none,none,7842651,18,2


In [14]:
df4.to_csv('daily_supporter_results.csv')

In [None]:

# total = df2['frequency'].sum()
# print total

# with throwaway
# print total + 479103 #9172444