In [1]:
# Cypher template for one daily partition.  ``{0}`` is replaced with the
# partition suffix (e.g. '2015_10_11') through str.format() before the query
# is run, selecting the Tweet_<partition> / User_<partition> labels.
#
# The WITH clause mixes plain expressions (user, supports, hour_group) with
# aggregates (count, MIN); in Cypher the plain expressions act as the grouping
# keys, so each result row is one user within one hour bucket.  hour_group
# (timestamp_int / 3600000, i.e. per hour assuming epoch milliseconds) is only
# used for grouping and is not returned.
#
# NOTE(review): labels(u)[1] picks the second label on the user node, which in
# the results looks like 'UserSupportsTrump_2015_10_11'.  This relies on label
# ordering -- confirm that ordering is stable for every partition.
#
# The trailing backslashes sit inside the string literal: Python drops each
# backslash-newline pair, so the statement is sent as one long line.
hourly_query="""
match (t:Tweet_{0})<-[:POSTS]-(u:User_{0}) \
WITH u.screen_name as user, labels(u)[1] as supports, \
(toInt(t.timestamp_int) / 3600000) as hour_group, count(t) as frequency, \
MIN(t.timestamp_int) as mints \
RETURN user, supports, frequency, mints \
order by mints, frequency desc"""

# CSV header: the first four columns are derived from ``mints`` by the export
# loop; the last four are the query's RETURN columns, in the same order.
hourly_header=['year', 'month', 'day', 'hour', 'user', 'supports', 'frequency', 'mints']



In [2]:
# Daily partition suffixes (year_month_day, no zero padding) to export,
# listed in chronological order and grouped by contiguous date run.
partitions = [
    # October 2015
    '2015_10_11', '2015_10_12', '2015_10_13', '2015_10_14', '2015_10_15',
    # late January 2016
    '2016_1_26', '2016_1_27', '2016_1_28', '2016_1_29', '2016_1_30',
    # 2 - 15 February 2016
    '2016_2_2', '2016_2_3', '2016_2_4', '2016_2_5', '2016_2_6', '2016_2_7', '2016_2_8',
    '2016_2_9', '2016_2_10', '2016_2_11', '2016_2_12', '2016_2_13', '2016_2_14', '2016_2_15',
    # 23 - 27 February 2016
    '2016_2_23', '2016_2_24', '2016_2_25', '2016_2_26', '2016_2_27',
    # 1 - 11 March 2016
    '2016_3_1', '2016_3_2', '2016_3_3', '2016_3_4', '2016_3_5', '2016_3_6',
    '2016_3_7', '2016_3_8', '2016_3_9', '2016_3_10', '2016_3_11',
]


In [3]:
from datetime import datetime

# Derive the calendar day and hour of a tweet in local (EST by default) time.
def converttime(ts, utc_offset_hours=-5):
    """Split an epoch-millisecond timestamp into (year, month, day, hour).

    Parameters:
        ts: timestamp in milliseconds since the Unix epoch (UTC).
        utc_offset_hours: fixed offset from UTC, in hours, applied before the
            split.  Defaults to -5 (EST), which is what callers passing only
            ``ts`` have always received.

    Returns:
        Tuple ``(year, month, day, hour)`` of the shifted time.

    NOTE(review): the offset is fixed; no daylight-saving adjustment is made.
    Pass ``utc_offset_hours=-4`` for dates that fall in US Eastern daylight time.
    """
    # Python's datetime works in seconds.  Floor division keeps whole seconds
    # under both Python 2 and Python 3; dropping the sub-second part cannot
    # move the result across an hour boundary.
    shifted_seconds = (ts // 1000) + (3600 * utc_offset_hours)
    dt = datetime.utcfromtimestamp(shifted_seconds)
    return dt.year, dt.month, dt.day, dt.hour

# Retweets carry a copy of the original tweet (from retweet_status), and that
# original can pre-date the partition's day.  This predicate identifies the
# rows that really belong to the partition so the rest can be dropped.
# NOTE(review): the export loop in the visible cells does not call this.
def keepRetweetedItem(partition, ts):
    """Return True when ``ts`` falls on the UTC calendar day named by ``partition``.

    Parameters:
        partition: day label of the form 'year_month_day' (e.g. '2015_10_11').
        ts: timestamp in milliseconds since the Unix epoch.
    """
    tweet_day = datetime.utcfromtimestamp(ts / 1000)  # ms -> s
    year, month, day = partition.split('_')
    label_vs_actual = (
        (year, tweet_day.year),
        (month, tweet_day.month),
        (day, tweet_day.day),
    )
    # Lazy generator: stops at the first mismatching component.
    return all(int(label) == actual for label, actual in label_vs_actual)

# Smoke checks for the two helpers.
# 1444607999000 is 2015-10-11 23:59:59 UTC, i.e. hour 18 after the -5h shift.
print(converttime(1444607999000))
# Same instant: still inside the 2015_10_11 partition day.
print(keepRetweetedItem('2015_10_11', 1444607999000))
# A timestamp from April 2015 does not belong to the 2015_10_11 partition.
print(keepRetweetedItem('2015_10_11', 1428188930000))



(2015, 10, 11, 18)
True
False


In [5]:
import time
import logging
import csv
from neo4jreader import neo4j_reader

# set up logging
def get_file_logger(name, path, fmt='%(asctime)s %(levelname)s %(message)s'):
    """Return the named logger at INFO level, writing to ``path``.

    logging.getLogger() hands back the same logger object for the same name,
    so re-running this cell would otherwise attach one more FileHandler each
    time and every message would be written repeatedly.  A file handler is
    therefore only attached when the logger does not already have one.

    Parameters:
        name: logger name.
        path: log file path handed to logging.FileHandler.
        fmt: format string for log records.

    Returns:
        The configured logging.Logger.
    """
    log = logging.getLogger(name)
    log.setLevel(logging.INFO)
    if not any(isinstance(h, logging.FileHandler) for h in log.handlers):
        handler = logging.FileHandler(path)
        handler.setFormatter(logging.Formatter(fmt))
        log.addHandler(handler)
    return log


logger = get_file_logger('TotalTweetsPerUser', './user_total.log')

# Project-local query runner; presumably 'local' selects a local Neo4j
# instance -- confirm against neo4jreader.
reader = neo4j_reader('local', logger)

# Export one CSV row per query result row, across all partitions.
# The file is opened in binary mode, as the Python 2 csv module expects.
# NOTE(review): rows are written unfiltered -- keepRetweetedItem is not
# applied here, so retweeted originals dated before the partition day are
# part of this file.
with open('total_user_results.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(hourly_header)

    for partition in partitions:
        started = time.time()

        # Fill the partition suffix into the Tweet_/User_ label placeholders.
        partition_query = hourly_query.format(partition)

        for row in reader.runquery(partition_query):
            # Copy the result row positionally: user, supports, frequency, mints.
            fields = [row[pos] for pos in range(len(row))]

            # mints (4th RETURN column) is the earliest tweet timestamp of the
            # group; shift it from UTC to EST and split it into date parts.
            est_year, est_month, est_day, est_hour = converttime(int(fields[3]))

            csvwriter.writerow([est_year, est_month, est_day, est_hour] + fields)

        # Progress line: partition label and elapsed seconds.
        print('%s %s' % (partition, time.time() - started))
        

2015_10_11 3.5645468235
2015_10_12 40.2831630707
2015_10_13 48.1056950092
2015_10_14 84.088643074
2015_10_15 12.0594758987
2016_1_26 68.8410768509
2016_1_27 63.7389481068
2016_1_28 57.2371370792
2016_1_29 70.6742260456
2016_1_30 47.133742094
2016_2_2 102.699661016
2016_2_3 65.3797810078
2016_2_4 63.3711929321
2016_2_5 77.9141759872
2016_2_6 47.3914499283
2016_2_7 76.3006231785
2016_2_8 51.7149670124
2016_2_9 61.8810970783
2016_2_10 91.382434845
2016_2_11 58.56508708
2016_2_12 64.6844749451
2016_2_13 73.3836369514
2016_2_14 65.5510220528
2016_2_15 53.3509352207
2016_2_23 64.1533608437
2016_2_24 94.1156578064
2016_2_25 79.1427969933
2016_2_26 97.7849898338
2016_2_27 74.3128311634
2016_3_1 117.03241396
2016_3_2 140.763387918
2016_3_3 118.850497007
2016_3_4 115.089944124
2016_3_5 86.532407999
2016_3_6 88.6292390823
2016_3_7 63.697357893
2016_3_8 84.5285990238
2016_3_9 91.736084938
2016_3_10 76.6792151928
2016_3_11 21.3404188156


In [6]:
import pandas as pd

# verify counts by day
# Screen names are free text, and pandas' default NA parsing would turn a
# user called e.g. 'nan', 'null' or 'NA' into a missing value (which the
# later fillna step would then relabel as 'none').  Restrict NA parsing to
# the one column that is genuinely optional: an empty 'supports' field.
df = pd.read_csv(
    'total_user_results.csv',
    keep_default_na=False,
    na_values={'supports': ['']},
)
# Preview only; the frame holds every (user, hour) row of every partition.
df.head(10)

Unnamed: 0,year,month,day,hour,user,supports,frequency,mints
0,2011,9,2,13,nbcuniverso,,1,1314988208000
1,2011,11,16,5,nintendocapr,,1,1321438234000
2,2012,4,7,8,researchinpsych,,1,1333805974000
3,2012,11,6,14,realdonaldtrump,UserSupportsTrump_2015_10_11,1,1352229352000
4,2012,11,6,14,holytaco,,1,1352229456000
5,2013,2,6,22,lisagualtieri,,1,1360207592000
6,2013,2,8,20,benlowy,,1,1360374148000
7,2013,2,17,14,andrea_delgado7,,1,1361127785000
8,2013,2,27,14,andrea_delgado7,,1,1361991792000
9,2013,3,4,20,jordanems,,1,1362445434000


In [19]:
# Carry the original row number along as an 'index' column and replace every
# missing value with the string 'none' (users without a candidate label have
# an empty 'supports' field in the CSV).
df2 = df.reset_index().fillna('none')


In [21]:
# Reduce each 'supports' label to the bare candidate name: keep the text in
# front of the first underscore and strip the 'UserSupports' prefix, e.g.
# 'UserSupportsTrump_2015_10_11' -> 'Trump'.  'none' passes through unchanged.
df2['supports'] = df2['supports'].map(
    lambda label: label.partition('_')[0].replace('UserSupports', '')
)
df2

Unnamed: 0,index,year,month,day,hour,user,supports,frequency,mints
0,0,2011,9,2,13,nbcuniverso,none,1,1314988208000
1,1,2011,11,16,5,nintendocapr,none,1,1321438234000
2,2,2012,4,7,8,researchinpsych,none,1,1333805974000
3,3,2012,11,6,14,realdonaldtrump,Trump,1,1352229352000
4,4,2012,11,6,14,holytaco,none,1,1352229456000
5,5,2013,2,6,22,lisagualtieri,none,1,1360207592000
6,6,2013,2,8,20,benlowy,none,1,1360374148000
7,7,2013,2,17,14,andrea_delgado7,none,1,1361127785000
8,8,2013,2,27,14,andrea_delgado7,none,1,1361991792000
9,9,2013,3,4,20,jordanems,none,1,1362445434000


In [29]:
# Roll the hourly rows up to one row per (day, user, candidate) by summing
# the tweet counts; the hour and mints columns are dropped first.
daily_keys = ['year', 'month', 'day', 'user', 'supports']
df3a = df2[daily_keys + ['frequency']]
df3 = df3a.groupby(daily_keys).sum().reset_index()
df3

Unnamed: 0,year,month,day,user,supports,frequency
0,2006,3,21,rayreadyray,none,3
1,2006,3,30,adam,none,1
2,2006,3,30,dom,none,1
3,2006,3,30,noah,none,1
4,2006,3,30,rayreadyray,none,2
5,2006,3,30,timroberts,none,2
6,2006,6,17,nic,none,1
7,2006,6,17,sharon,none,1
8,2006,6,17,telene,none,1
9,2006,8,31,kmikeym,none,1


In [32]:
# Spot check: total tweets from Trump-labelled users on 2015-10-15.
df3.loc[
    (df3['supports'] == 'Trump')
    & (df3['year'] == 2015)
    & (df3['month'] == 10)
    & (df3['day'] == 15),
    'frequency',
].sum()

16

In [34]:
# Sanity check: the daily roll-up (df3) should carry exactly the same tweet
# total as the hourly export it was built from (df).
print('%s %s' % (df3['frequency'].sum(), df['frequency'].sum()))

78092254 78092254


In [35]:
# Persist the per-day, per-user, per-candidate tweet counts.
# to_csv writes the row index by default, so the file starts with an unnamed
# index column ahead of year/month/day/user/supports/frequency.
df3.to_csv('daily_users_supporters.csv')

In [None]:

# total = df2['frequency'].sum()
# print total

# with throwaway
# print total + 479103 #9172444