### This script groups in Neo4j by time period, what candidate a user supports in their tweets (tagged from a sentiment model), and topic.  Used in visualizations.

In [8]:
hourly_query="""
match (t:Tweet_{0})<-[:POSTS]-(u:User_{0}) \
where exists(t.mentions_Trump) \
or exists(t.mentions_Clinton) \
or exists(t.mentions_Sanders) \
or exists(t.mentions_Cruz) \
or exists(t.mentions_Rubio) \
or exists(t.topic)
WITH t.mentions_Trump as Trump, \
t.mentions_Clinton as Clinton, \
t.mentions_Sanders as Sanders, \
t.mentions_Cruz as Cruz, \
t.mentions_Rubio as Rubio, \
t.topic as topic, \
u.supports as supports, (toInt(t.timestamp_int) / 3600000) as hour_group, count(t) as frequency, \
MIN(t.timestamp_int) as mints \
RETURN supports, mints, Trump, Clinton, Sanders, Cruz, Rubio, \
topic, frequency"""

hourly_header=['year', 'month', 'day', 'hour', 'supports', 'mints',\
               'Trump','Clinton','Sanders','Cruz','Rubio',\
               'topic','frequency']
#hourly_header=['supports','mints','Trump','Clinton','Sanders','Cruz','Rubio','topic','frequency']

candidates = ['Trump','Clinton','Sanders','Cruz','Rubio']


In [17]:
partitions = ['2015_10_11', '2015_10_12', '2015_10_13', '2015_10_14', '2015_10_15',\
              '2016_1_26', '2016_1_27', '2016_1_28', '2016_1_29', '2016_1_30', '2016_2_2', '2016_2_3', \
              '2016_2_4', '2016_2_5', '2016_2_6', '2016_2_7', '2016_2_8', '2016_2_9', '2016_2_10',\
              '2016_2_11', '2016_2_12', '2016_2_13', '2016_2_14', '2016_2_15', \
              '2016_2_23', '2016_2_24', '2016_2_25', '2016_2_26', '2016_2_27', \
              '2016_3_1', '2016_3_2','2016_3_3', '2016_3_4', '2016_3_5', '2016_3_6', '2016_3_7', \
              '2016_3_8', '2016_3_9', '2016_3_10', '2016_3_11']
#partitions = ['2016_1_26']

In [18]:
from datetime import datetime

# Append the day and hour adjusted to EST (UTC-5)
# convert ms to s for python
def converttime(ts):
    ts_adjust = (ts/1000) - (3600*5)
    dt = datetime.utcfromtimestamp(ts_adjust)
    return dt.year, dt.month, dt.day, dt.hour

# When an item is retweeted, we keep a copy of the original tweet (based on retweet_status).
# This copy might be from prior to our day.  we need to remove this data from the final result.
def keepRetweetedItem(partition, ts):
    ts_adjust = ts / 1000
    dt = datetime.utcfromtimestamp(ts_adjust)
    year, month, day = partition.split('_')
    return ((int(year)==dt.year) and (int(month)==dt.month) and (int(day)==dt.day))

print converttime(1444607999000)
print keepRetweetedItem('2015_10_11',1444607999000)
print keepRetweetedItem('2015_10_11',1428188930000)



(2015, 10, 11, 18)
True
False


In [19]:
import time
import logging
import csv
from neo4jreader import neo4j_reader

#set up logging
logger = logging.getLogger('ResultsGroupedBySupports')
hdlr = logging.FileHandler('./supports_results_tagging.log')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.INFO)

reader = neo4j_reader('local',logger)

with open('hourly_supports_results.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(hourly_header)
    
    #throwaway=0
    
    for partition in partitions:

        start = time.time()

        formatted_query = hourly_query.format(partition)
        #print formatted_query

        for record in reader.runquery(formatted_query):
            outrecord = []
            for i in range(len(record)):
                outrecord.append(record[i])

            mints = int(outrecord[1])

            #if (keepRetweetedItem(partition, mints)):

            #some of the neutral items were labeled true instead of neu, correct these.
            #TODO: correct these in the database.
            for n in range(2,7):
                if ("true" in str(outrecord[n]).lower()):
                    outrecord[n]="neu"

            #convert time from UTC to EST and add day/hour information to output
            year, month, day, hour = converttime(mints)
            outrecord = [year, month, day, hour] + outrecord

            csvwriter.writerow(outrecord)  

#             else:
#                 throwaway = throwaway + 1


    print partition, time.time() - start
        
    #print throwaway


        

2016_3_11 3.66286706924


In [20]:
import pandas as pd

#verify counts by day
df = pd.read_csv('hourly_supports_results.csv')
df.head()

Unnamed: 0,year,month,day,hour,supports,mints,Trump,Clinton,Sanders,Cruz,Rubio,topic,frequency
0,2015,10,11,18,Cruz,1444607097494,neu,,,,neg,,1
1,2015,10,11,12,,1444583547000,,,neg,,,,2
2,2015,9,16,14,,1442433130000,neu,,,,,,1
3,2015,10,10,20,,1444528053000,neg,,,,,,1
4,2015,9,4,20,,1441416798000,neu,,,,,,1


In [24]:
df3a = df[['year', 'month', 'day', 'hour','supports', \
               'Trump','Clinton','Sanders','Cruz','Rubio',\
               'topic','frequency']]



In [21]:
df['frequency'].sum()

63019564

In [None]:
(df[(df['year']==2016) & (df['month']==1) & (df['day']==26)])

In [25]:
#Cleanup any values of "true"

df3a.replace('True','neu', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [26]:
df3 = df3a.reset_index()
df3.fillna('none',inplace=True)
df3

Unnamed: 0,index,year,month,day,hour,supports,Trump,Clinton,Sanders,Cruz,Rubio,topic,frequency
0,0,2015,10,11,18,Cruz,neu,none,none,none,neg,none,1
1,1,2015,10,11,12,none,none,none,neg,none,none,none,2
2,2,2015,9,16,14,none,neu,none,none,none,none,none,1
3,3,2015,10,10,20,none,neg,none,none,none,none,none,1
4,4,2015,9,4,20,none,neu,none,none,none,none,none,1
5,5,2015,10,9,14,Trump,neu,none,none,none,none,none,3
6,6,2015,10,11,17,Clinton,none,pos,neu,none,none,none,8
7,7,2015,10,11,18,none,none,none,none,none,none,jobs and economy,1
8,8,2015,10,10,10,none,none,none,neu,none,none,none,1
9,9,2015,10,11,17,Sanders,none,none,neg,none,none,none,30


In [27]:
df4 = df3.groupby([df3['year'],df3['month'],df3['day'],df3['supports'],df3['Trump'],\
                   df3['Clinton'],df3['Sanders'],\
            df3['Cruz'],df3['Rubio'],df3['topic']]).sum().reset_index()

df4.head()

Unnamed: 0,year,month,day,supports,Trump,Clinton,Sanders,Cruz,Rubio,topic,index,hour,frequency
0,2006,3,21,none,none,none,none,none,none,racial issues,1987198,18,1
1,2006,3,30,none,none,none,none,none,none,racial issues,1319888,17,1
2,2006,8,31,none,neu,none,none,none,none,none,1901866,19,1
3,2006,9,8,none,none,none,none,none,none,women's issues (not abortion though),1354736,21,1
4,2006,12,22,none,neu,none,none,none,none,none,12567845,133,7


In [28]:
df4 = df4.drop('index', 1)
df4 = df4.drop('hour', 1)
df4

Unnamed: 0,year,month,day,supports,Trump,Clinton,Sanders,Cruz,Rubio,topic,frequency
0,2006,3,21,none,none,none,none,none,none,racial issues,1
1,2006,3,30,none,none,none,none,none,none,racial issues,1
2,2006,8,31,none,neu,none,none,none,none,none,1
3,2006,9,8,none,none,none,none,none,none,women's issues (not abortion though),1
4,2006,12,22,none,neu,none,none,none,none,none,7
5,2007,5,12,none,neg,none,none,none,none,none,1
6,2007,8,24,none,neu,none,none,none,none,none,1
7,2008,1,9,none,none,neu,none,none,none,none,1
8,2008,1,10,none,none,neu,none,none,none,none,1
9,2008,2,9,Trump,none,none,none,none,none,racial issues,1


In [29]:
df4.to_csv('daily_supports_results_scrubbed.csv')

In [None]:

# total = df2['frequency'].sum()
# print total

# with throwaway
# print total + 479103 #9172444

In [30]:
df4['frequency'].sum()

63019564