In [1]:
# Purpose:  Pull a random sample of tweets to use as final validation set for candidate-sentiment classifiers.
#           This will produce 10 per partition per candidate, for 400 per candidate, and 2000 total tweets. 
# Author:  Carol Sniegoski, adapted from Debbie Hofman
# Date:  May 18, 2016
# Course:  MAS DSE Capstone, Spring 2016

In [8]:
candidates = ['Trump','Clinton','Sanders','Cruz','Rubio']
print candidates

['Trump', 'Clinton', 'Sanders', 'Cruz', 'Rubio']


In [9]:
# Define queries for obtaining tweets that mention each candidate.
# The queries defined here get random samples. They do not exclude mentions of other candidates.

# This approach is very time-consuming. It may be doing a giant sort on everything.
# example:
# MATCH (t:Tweet_2016_3_1)
# WHERE EXISTS(t.mentions_Sanders) 
# WITH t, rand() as random
# RETURN t.text ORDER BY random LIMIT 10

# This approach is not entirely random, but it is quick, and sort of random.
# MATCH (t:Tweet_2016_3_1)
# WHERE EXISTS(t.mentions_Sanders) 
# WITH t.text as tweetText
# WHERE rand()<.0001
# RETURN tweetText LIMIT 10

queries = {}

queries['Sanders'] = """MATCH (t:Tweet_{0})
WHERE EXISTS(t.mentions_Sanders) 
WITH t.text as tweetText
WHERE rand()<.001
RETURN tweetText LIMIT 10
"""

queries['Clinton'] = """MATCH (t:Tweet_{0})
WHERE EXISTS(t.mentions_Clinton) 
WITH t.text as tweetText
WHERE rand()<.001
RETURN tweetText LIMIT 10
"""

queries['Trump'] = """MATCH (t:Tweet_{0})
WHERE EXISTS(t.mentions_Trump) 
WITH t.text as tweetText
WHERE rand()<.0001
RETURN tweetText LIMIT 10
"""

queries['Rubio'] = """MATCH (t:Tweet_{0})
WHERE EXISTS(t.mentions_Rubio) 
WITH t.text as tweetText
WHERE rand()<.001
RETURN tweetText LIMIT 10
"""

queries['Cruz'] = """MATCH (t:Tweet_{0})
WHERE EXISTS(t.mentions_Cruz) 
WITH t.text as tweetText
WHERE rand()<.001
RETURN tweetText LIMIT 10
"""

print 'done'

done


In [10]:
partitions = ['2015_10_11', '2015_10_12', '2015_10_13', '2015_10_14', '2015_10_15',\
              '2016_1_26', '2016_1_27', '2016_1_28', '2016_1_29', '2016_1_30', '2016_2_3', \
              '2016_2_4', '2016_2_5', '2016_2_6', '2016_2_7', '2016_2_8', '2016_2_9', '2016_2_10',\
              '2016_2_11', '2016_2_12', '2016_2_13', '2016_2_14', '2016_2_15', \
              '2016_2_23', '2016_2_24', '2016_2_25', '2016_2_26', '2016_3_1', '2016_3_2',\
             '2016_3_3', '2016_3_4', '2016_3_5', '2016_3_6', '2016_3_7', '2016_3_8', '2016_3_9',\
             '2016_3_10', '2016_3_11']

# partitions = ['2016_2_24', '2016_2_25', '2016_2_26', '2016_3_1', '2016_3_2',\
#              '2016_3_3', '2016_3_4', '2016_3_5', '2016_3_6', '2016_3_7', '2016_3_8', '2016_3_9',\
#              '2016_3_10', '2016_3_11']

print 'done'

done


In [11]:
from neo4j.v1 import GraphDatabase, basic_auth
import time
import csv

#driver = GraphDatabase.driver("bolt://sauce5.sdsc.edu", auth=basic_auth("neo4j", "lajolla"))
print 'opening driver...'
driver = GraphDatabase.driver("bolt://sauce5.sdsc.edu", auth=basic_auth("neo4j", "lajolla"))
#driver = GraphDatabase.driver("bolt://127.0.0.1")
print 'after opening driver...'

opening driver...
after opening driver...


In [12]:
# Locate data output directory
%ls ../data/candidatesentiment/

candidatesentiment_fromDebbie_05-14-16.csv  [34moutput[m[m/
candidatesentiment_fromDebbie_05-17-16.csv  sentimentsamples_random_05-18-16.csv


In [18]:
# Create output filename.
prefix = "../data/candidatesentiment/"
filename = "candidatesentiment_validationSet_random_05-19-16.csv"
outfile = prefix + filename
print outfile

../data/candidatesentiment/candidatesentiment_validationSet_random_05-19-16.csv


In [19]:
candidates = ['Trump', 'Clinton', 'Sanders', 'Rubio', 'Cruz']
print candidates

['Trump', 'Clinton', 'Sanders', 'Rubio', 'Cruz']


In [20]:
# Get data from neo4j and write to file.

#with open('sentimentsamples.csv', 'wb') as csvfile:
with open(outfile, 'wb') as csvfile:
    tweetwriter = csv.writer(csvfile)
    total_start = time.time()
    
    for partition in partitions:
    #for partition in ['2016_2_12']:

        print "PARTITION:", partition
        partition_start = time.time()
        
        for candidate in candidates:
            print
            print "CANDIDATE:", candidate

            start = time.time()

            query = queries[candidate].format(partition)
            print query

            session = driver.session()
            result = session.run(query)

            for record in result:
                #print partition, candidate, record[0]
                #tweetwriter.writerow([partition,candidate,'pos','supporter',record[0].encode("utf-8")])
                tweetwriter.writerow([partition, candidate, record[0].encode("utf-8")])

            session.close()

            end = time.time()
        print
        print "Time for partition " + partition + ": " + str(time.time() - partition_start)
    print "Total time:" + str(time.time() - total_start)
    
print 'done'    

PARTITION: 2015_10_11

CANDIDATE: Trump
MATCH (t:Tweet_2015_10_11)
WHERE EXISTS(t.mentions_Trump) 
WITH t.text as tweetText
WHERE rand()<.0001
RETURN tweetText LIMIT 10


CANDIDATE: Clinton
MATCH (t:Tweet_2015_10_11)
WHERE EXISTS(t.mentions_Clinton) 
WITH t.text as tweetText
WHERE rand()<.001
RETURN tweetText LIMIT 10


CANDIDATE: Sanders
MATCH (t:Tweet_2015_10_11)
WHERE EXISTS(t.mentions_Sanders) 
WITH t.text as tweetText
WHERE rand()<.001
RETURN tweetText LIMIT 10


CANDIDATE: Rubio
MATCH (t:Tweet_2015_10_11)
WHERE EXISTS(t.mentions_Rubio) 
WITH t.text as tweetText
WHERE rand()<.001
RETURN tweetText LIMIT 10


CANDIDATE: Cruz
MATCH (t:Tweet_2015_10_11)
WHERE EXISTS(t.mentions_Cruz) 
WITH t.text as tweetText
WHERE rand()<.001
RETURN tweetText LIMIT 10


Time for partition 2015_10_11: 2.93518304825
PARTITION: 2015_10_12

CANDIDATE: Trump
MATCH (t:Tweet_2015_10_12)
WHERE EXISTS(t.mentions_Trump) 
WITH t.text as tweetText
WHERE rand()<.0001
RETURN tweetText LIMIT 10


CANDIDATE: Clinton
