In [None]:
import requests
import oauth2
import json
import pickle 
import os

# Load the Twitter API credentials from a JSON file next to the notebook.
# oauth_req() reads the "key" and "secret" entries of this mapping.
with open("twitter_auth.json", 'r') as f:
    auth_codes = json.load(f)
    
def load_pickle(fname):
    """Return the object stored in the pickle file ``fname``.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    that this project wrote itself.
    """
    with open(fname, 'rb') as pickle_handle:
        return pickle.load(pickle_handle)
    
def add_to_pickle_file(fname, new_data):
    """Append ``new_data`` to the list stored in the pickle file ``fname``.

    The file is expected to hold a single pickled list (in our case, a list of
    dicts). A missing or zero-byte file is treated as an empty list, so the
    file does not have to exist before the first call.

    Args:
        fname: Path of the pickle file to update.
        new_data: List of items to add to the end of the stored list.

    Prints the length of the stored list before and after the update.
    """
    # A file that does not exist yet (or is empty) has nothing to unpickle.
    if os.path.exists(fname) and os.path.getsize(fname) > 0:
        # NOTE: unpickling can execute arbitrary code; only use trusted files.
        with open(fname, 'rb') as f:
            data = pickle.load(f)
    else:
        data = []

    print("Old length of file: {:d}".format(len(data)))
    data = data + new_data
    print("New length of file: {:d}".format(len(data)))

    # Rewrite the whole file with the combined list.
    with open(fname, 'wb') as f:
        pickle.dump(data, f)

def read_text(filename, comment=";", encoding=None):
    """Given a filename, read in every line that is not commented.

    Args:
        filename: Path of the text file to read.
        comment: Prefix that marks a line as a comment (default ";"). An empty
            prefix disables comment filtering.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        List of the remaining lines in file order, with newline characters
        stripped. Blank lines are kept as empty strings.
    """
    out_list = []
    # The context manager closes the file even if reading raises part-way.
    with open(filename, "r", encoding=encoding) as f:
        for line in f:
            # startswith() also works for prefixes longer than one character.
            if comment and line.startswith(comment):
                continue
            out_list.append(line.strip('\n'))

    return out_list

pos_vocab = read_text("positive-words.txt")
# NOTE(review): neg_vocab is bound to the very same list as pos_vocab because
# the read of "negative-words.txt" is commented out, so num_neg() defaults to
# the positive vocabulary. Presumably a temporary workaround -- confirm and
# restore the real negative word list.
neg_vocab = pos_vocab#read_text("negative-words.txt")

def read_words(filename):
    """Return every line of ``filename`` as a list, newline characters stripped."""
    with open(filename, 'r') as word_file:
        return [entry.strip('\n') for entry in word_file]

# Word list read from "words_alpha.txt", one entry per line of the file.
english_words = read_words("words_alpha.txt")


# Pickle file that the labelling cell appends its results to. Swap in the
# commented-out assignment below to target the other label file instead.
pickle_file = "ben_labels.pckl"
# pickle_file = "patrick_labels.pckl"

In [None]:
## Fetch Tweets ##
def oauth_req(url, http_method="GET", post_body=b"", http_headers=None):
    """Send an OAuth-signed HTTP request and return the raw response content.

    The request is signed with the consumer key/secret taken from
    ``auth_codes`` and a token whose key and secret are both empty strings.
    """
    oauth_client = oauth2.Client(
        oauth2.Consumer(key=auth_codes["key"], secret=auth_codes["secret"]),
        oauth2.Token(key="", secret=""),
    )
    # request() hands back a (response, content) pair; only content is used.
    _response, body = oauth_client.request(
        url, method=http_method, body=post_body, headers=http_headers
    )
    return body

# Run one search request (query parameters: q=a, lang=en, count=100,
# tweet_mode=extended) and decode the JSON body. The labelling cell reads the
# 'statuses' list of the decoded response.
byteResponse = oauth_req('https://api.twitter.com/1.1/search/tweets.json?q=a&lang=en&count=100&tweet_mode=extended', "GET")
tweets = json.loads(byteResponse)

In [None]:
## Label Tweets ##

# Usage:
# 0) Make an empty file called "patrick_labels.pckl" in the working directory
# 1) Select the correct pickle file
# 2) Label tweets as appropriate
# 3) When finished, type "done" into the emotion box to automatically have tweets uploaded to the pickle file.

labeledTweets = []

print("Total number of tweets to analyze in this session: {:d}".format(len(tweets['statuses'])))

# p - positive, n - negative, i - informational, h - humor, a - advertisement, o - none of the above
for status in tweets['statuses']:
    # If this is a retweet, the full_text field gets cut off at 140 characters,
    # so we have to get the whole text from the retweeted_status field.
    # Retweets are recognised by the presence of that field rather than by an
    # 'RT @' prefix: a hand-typed "RT @..." tweet has the prefix but no
    # retweeted_status, and indexing it raised KeyError.
    retweeted = status.get('retweeted_status')
    if retweeted:
        text = retweeted['full_text']
    else:
        text = status['full_text']
    print(text)
    emotion = input("What emotion is this? (type 'done' to save current progress to file) ")
    print("\n")
    if emotion.lower() == "done":
        break
    tweet = {"status": text, "userId": status['user']['id'], "emotion": emotion}
    labeledTweets.append(tweet)

# Persist whatever was labelled in this session (also reached after 'done').
add_to_pickle_file(pickle_file, labeledTweets)
print("Added {:d} datapoints".format(len(labeledTweets)))

In [None]:
## transfer txt file to pickle file ##
import ast

# Read the whole file up front; the context manager closes it even if the
# read fails.
with open("patrickClassified.txt",'r') as f:
    lines = f.readlines()

# The loop below rebuilds one dict literal at a time from consecutive lines:
# a record starts on the line containing 'emotion', continues through the
# 'status' line (plus any continuation lines of a multi-line status) and ends
# on the 'userId' line, after which the collected text is parsed.
# Presumably the file is a printed list of tweet dicts -- verify against the
# actual file.
#
# NOTE: this rebinds the global `tweets`, which the fetch/label cells use for
# the decoded API response; re-run the fetch cell before labelling again.
dictString = ""
inStatus = False
tweets = []
for line in lines:
    if "\'emotion\'" in line:
        # Drop everything up to and including a "[" on this line.
        if "[" in line:
            ind = line.index("[") + 1
            line = line[ind:]
        dictString = dictString + line
    elif "\'status\'" in line:
        inStatus = True
        dictString = dictString + line
    elif "\'userId\'" in line:
        inStatus = False
        # Cut the record at the "," separator, or at the "]" if there is none.
        if "," in line:
            ind = line.index(",")
            line = line[:ind]
        elif "]" in line:
            ind = line.index("]")
            line = line[:ind]
        dictString = dictString + line
        # literal_eval only evaluates Python literals, never arbitrary code.
        tweet = ast.literal_eval(dictString.lstrip())
        tweets.append(tweet)
        dictString = ""
    elif inStatus:
        # Continuation line of a status that spans several lines.
        dictString = dictString + line

#add_to_pickle_file("patrick_labels.pckl", tweets)
        

In [None]:
## Experimenting to see what features will work best ##

import matplotlib.pyplot as plt
import numpy as np

# Feature ideas: length of status, presence of url, number of @'s, number of #'s, presence/number of emojis, 
# number of numeric/special characters as opposed to alpha, wordnet to assess sentiment, number of alpha words that
# are not real dictionary words, 'not' analysis?

## Definitions ##
def num_pos(status, pos_words=pos_vocab):
    """Count the tokens of ``status`` that appear in ``pos_words``.

    The text is split with nltk's ``word_tokenize``; every token occurrence
    found in ``pos_words`` adds one to the count.
    """
    from nltk import word_tokenize

    return sum(1 for token in word_tokenize(status) if token in pos_words)

def num_neg(status, neg_words=neg_vocab):
    """Count the tokens of ``status`` that appear in ``neg_words``.

    Args:
        status: Text to analyse; it is split with nltk's ``word_tokenize``.
        neg_words: Collection of words to count (defaults to ``neg_vocab``).

    Returns:
        Number of token occurrences found in ``neg_words``.
    """
    from nltk import word_tokenize

    n = 0
    for w in word_tokenize(status):
        # Test against the neg_words argument (not the neg_vocab global) so a
        # caller-supplied word list is actually honoured.
        if w in neg_words:
            n = n + 1

    return n

def count_emojis(status):
    """Count emoji characters and text emoticons in ``status``.

    Three kinds of matches are counted, each non-overlapping match once:
      * emoji code points (U+1F000-U+1FAFF and U+2600-U+27BF) plus stray
        UTF-16 surrogates; multi-code-point emoji such as flags or skin-tone
        sequences therefore count more than once,
      * western emoticons such as ``:-)``, ``;P`` or ``=D``,
      * their mirrored forms such as ``(:`` or ``D-:``.

    NOTE: the emoticon patterns are deliberately loose and also fire on
    look-alikes, e.g. the ``:/`` inside ``https://``.

    Args:
        status: Text to scan.

    Returns:
        Number of matches found.
    """
    # Only plain patterns are used, so the standard library module suffices.
    import re

    # In a Python 3 str an emoji is a single code point, not a surrogate
    # pair, so the real code point ranges have to be listed. The surrogate
    # ranges are kept for strings that carry unpaired surrogates.
    emoji_regex = r'[\U0001F000-\U0001FAFF\u2600-\u27BF\uD83C-\uDBFF\uDC00-\uDFFF]'
    # Optional "nose" between eyes and mouth: space, hyphen, asterisk or
    # apostrophe. The hyphen is escaped; unescaped, "[ -*]" is the range from
    # space to '*', which does not contain '-' and so missed ":-)".
    nose = r"[ \-*']*?"
    text_emoji_regex = r"[:;=]" + nose + r"[\[\]\(\)DPpXx3/\\]"
    text_reverse_emoji_regex = r"[\[\]\(\)DPpxX/\\]" + nose + r"[:;=]"
    total_regex = r'(' + emoji_regex + r'|' + text_emoji_regex + r'|' + text_reverse_emoji_regex + r')'
    rexp = re.compile(total_regex)
    num_emojis = len(rexp.findall(status))

    return num_emojis

def statusLength(status):
    """Return the length of ``status`` as reported by ``len``."""
    length = len(status)
    return length

def hasLink(status):
    """Return 1 if ``status`` contains a web link, otherwise 0.

    A link is recognised by an "https://" or "http://" scheme prefix anywhere
    in the text.
    """
    if "https://" in status or "http://" in status:
        return 1
    else:
        return 0
    
def numAts(status):
    """Return how many "@" characters occur in ``status``."""
    at_count = status.count("@")
    return at_count

def numHashtags(status):
    """Return how many "#" characters occur in ``status``."""
    hash_count = status.count("#")
    return hash_count

# Feature functions to evaluate; each maps a status string to a number.
features = [statusLength, hasLink, numAts, numHashtags]

#Define data
patrickData = load_pickle("patrick_labels.pckl")
# Split the labelled tweets into three groups (first 270, next 270, the rest)
# so per-category averages can be compared between groups.
patrickDatas = [patrickData[:270], patrickData[270:540], patrickData[540:]]
numGroups = len(patrickDatas)
#benData = load_pickle("ben_labels.pckl")

#Define categories
allCategories = ["p","n","h","a","i","o"]


## Extract feature information from data ##
# featureDatas[group index][category][feature function] -> average value
featureDatas = []
for i in range(0, numGroups):
    # Collect the statuses of this group under every category they carry.
    allStatuses = {}
    for category in allCategories:
        allStatuses[category] = []
    for tweet in patrickDatas[i]:
        # 'emotion' is iterated element by element, so a label string made of
        # several letters files the tweet under each of those categories.
        categories = tweet['emotion']
        for category in categories:
            allStatuses[category].append(tweet['status'])

    featureData = {}

    for category in allCategories:
        statuses = allStatuses[category]
        numStatuses = len(statuses)
        totals = {}
        avgs = {}
        #Initialize and calculate totals for each feature
        for feature in features:
            totals[feature] = 0
        for status in statuses:
            for feature in features:
                totals[feature] += feature(status)
        #Compute and store averages for each feature
        for feature in features:
            # A category without any tweet in this group has no average;
            # report 0 instead of dividing by zero.
            avgs[feature] = totals[feature]/numStatuses if numStatuses else 0
        featureData[category] = avgs
    featureDatas.append(featureData)

    
    
## Plots to show results ##
# One figure per feature: for every category a wide yellow bar with the mean
# of the group averages, and one narrow coloured bar per group on top of it.
colors = ['b','r','g', 'c', 'm', 'k']
for feature in features:
    x_pos = np.arange(len(allCategories))
    w = 0.9 / numGroups
    # Centre of each category's cluster of group bars.
    centers = x_pos + w*(numGroups - 1)/2
    ax = plt.subplot(111)

    #Total
    y = []
    for category in allCategories:
        totalAvg = 0
        for i in range(0, numGroups):
            totalAvg += featureDatas[i][category][feature]/numGroups
        y.append(totalAvg)
    ax.bar(centers, y,width=0.9,color='y',align='center')

    #For each group
    for i in range(0, numGroups):
        y = []
        for category in allCategories:
            y.append(featureDatas[i][category][feature])
        ax.bar([pos + w*i for pos in x_pos], y,width=w,color=colors[i],align='center')

    # Put each category label under the middle of its cluster of bars.
    plt.xticks(centers, allCategories)
    plt.title(feature.__name__)
    plt.show()