In [438]:
import os
import re
from tf_rnn_classifier import TfRNNClassifier
import tweepy

from collections import Counter

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from textblob import TextBlob



auth = tweepy.AppAuthHandler('aiA4EZjqeZ13it3Qvsgy9nHcv', 'J9sghg8mbgT7wF7vGSa6ZtvKpDaD9DdaCc1ljjTyF8QMNHK7Y8')
# auth.set_access_token('1003792857709232128-hyBfditXJWmvTLKiVS1laz3ofjP88B', 'j5WX4xTr9nc8iyGgIiPVM78xlCUVNfffPsKgr8TvKwihQ')
api = tweepy.API(auth, wait_on_rate_limit=True,wait_on_rate_limit_notify=True)

In [418]:
def create_training_data():
    num_tweets = 1000
    files = ['data/positive_tweets.txt', 'data/negative_tweets.txt']
    for file in files:
        query = ''
        if file == 'data/positive_tweets.txt': 
            query = ':) OR (: OR :-) OR <3 OR :D -:('
        else:
            query = ':( OR ): OR >:( -:)'
        query +=' -filter:retweets' # ignore retweets
        with open(file,'w') as f:
            for tweet in tweepy.Cursor(api.search,q=query,tweet_mode="extended").items(num_tweets):         
                if tweet.lang == 'en':
                    txt = tweet.full_text.replace('\n', '')
                    f.write(txt)
                    f.write('\n')

In [287]:
create_training_data()

In [511]:
class NBAData(object):
    def __init__(self):
        self.path = 'data/'       
    
    # Call GATE POS tagger to label words accordingly
    def tag_tweets(self, file):
        # files in the tagger expect to be referenced from within tagger folder      
        curr_path = os.getcwd()
        path = '{}{}'.format(curr_path, '/twitie-tagger')
        os.chdir(path)
        # change tweets_pos/new_rigs
        c = 'java -jar ./twitie_tag.jar ./models/gate-EN-twitter.model ../data/{} > ../data/tagged_{}'.format(file, file)
        os.system(c)
        os.chdir(curr_path)
        

    def bigrams_unigrams_phi(self, text):
        words = ['<S>'] + text.split() + ['</S>']
        bigrams = []
        unigrams = []
        for i in range(len(words) - 1):
            if i != 0:
                unigrams.append((words[i],)) 
            bigrams.append((words[i], words[i+1]))
        return Counter(unigrams + bigrams)       

    
    # Featurizes data into POS bigrams after data has been cleaned, stripped, etc    
    def transform_data(self, data, vectorizer=None):
        dicts = {}
        feat_matrix = None
        feat_dicts = []
        
        for i in range(len(data)):
            feat_dicts.append(self.bigrams_unigrams_phi(data[i]))
        
        vectorizer = DictVectorizer()    
        feat_matrix = vectorizer.fit_transform(feat_dicts)        
        
        return {'X': feat_matrix, 'vectorizer': vectorizer}

    # remove links, regularize capitalization, etc on all files passed in 
    def preprocess_data(self, file):
        pos_emoji_pattern = r'(:-?(?:\)+|D+))|((?:\(+)-?:<?)|(<3+)'
        neg_emoji_pattern = r'(>?:-?(?:\(+))|((?:D+|\)+)-?:<?)'
        lines = None 
        with open('{}{}'.format(self.path, file), 'r') as f:
            lines = f.readlines() # old contents with all the extra shit
        with open('{}{}'.format(self.path, file), 'w') as f:        
            for line in lines:
                new_line = re.sub(r'https?:\/\/.*[\r\n]*', '', line, flags=re.MULTILINE).lower()
                new_line = re.sub(pos_emoji_pattern, '', new_line)
                new_line = re.sub(neg_emoji_pattern, '', new_line)
                f.write(new_line)
                    
#     def get_labelled_data(self, file):
#         data = []
#         with open('{}{}'.format(self.path, dataset), 'r') as f:
#             for line in f.readlines():
#                 line = line.strip()
#                 idx = line.rfind(',')
#                 data.append((line[1:idx-1], int(line[idx+1:])))
#         return data
    
    def get_processed_data(self, file):
        data = []
        with open('{}{}'.format(self.path, file), 'r') as f:
            for line in f.readlines():
                line = line.strip()
                data.append(line)
        return data

    def label_data(self, process_data):
        pos_emoji_pattern = r'(:-?(?:\)+|D+))|((?:\(+)-?:<?)|(<3+)'
        neg_emoji_pattern = r'(>?:-?(?:\(+))|((?:D+|\)+)-?:<?)'
        data = []
        for entry in process_data:
            entry = re.sub(r'https?:\/\/.*[\r\n]*', '', entry, flags=re.MULTILINE)
            sentiment = 0
            if re.findall(pos_emoji_pattern, entry):
                sentiment += 1
                entry = re.sub(pos_emoji_pattern, '', entry)
            if re.findall(neg_emoji_pattern, entry):
                sentiment -= 1
                entry = re.sub(neg_emoji_pattern, '', entry)
            data.append((entry, sentiment))
        return data
            
    def predict_baseline(self, data, labels):
        correct = 0
        attempts = 0
        incorrect = [('tweet', 'exp_sentiment', 'sentiment')]
        for i in range(len(data)):
            tweet = data[i]
            sentiment = labels[i]
            exp_sentiment = self.get_tweet_sentiment_baseline(tweet)
            if exp_sentiment == sentiment:
                correct += 1
            else:
                incorrect.append((tweet, exp_sentiment, sentiment))
            attempts += 1
#         print ('{}% successful: {} correct, {} attempts'.format(correct*100//attempts, correct, attempts))
        print(float(correct)/attempts)
            
    def get_tweet_sentiment_baseline(self, tweet):
        analysis = TextBlob(self.clean_tweet(tweet))
        # set sentiment
        if analysis.sentiment.polarity > 0:
            return 1
        elif analysis.sentiment.polarity <= 0:
            return -1

    def predict(self, X, y, clf):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)        
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        print('Accuracy: %0.03f' % accuracy_score(y_test, predictions))
        print(classification_report(y_test, predictions, digits=3))
        misclassified_samples = X_test[y_test != predictions]
        print(misclassified_samples)
        return clf

    def clean_tweet(self, tweet):
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

    
    def get_vocab(self, data):
        wc = Counter([w for d in data for w in d.split()])
        wc = wc.items()
        vocab = {w for w, c in wc}
        vocab.add("$UNK")
        return sorted(vocab)
    
    
    
    

In [310]:
model = NBAData()
files = ['positive_tweets.txt', 'negative_tweets.txt']
for file in files: # probably want to go ahead and do this for all tweets about NBA players too... 
    model.preprocess_data(file)
    model.tag_tweets(file) # new files w/ GATE POS labels for all relevant tweets

In [420]:
# textblob shitty ass fuckin baseline you piece of shit rig 
model = NBAData()
pos_data = model.get_processed_data('positive_tweets.txt') # list of processed tweets
neg_data = model.get_processed_data('negative_tweets.txt')
pos_labels = [1] * len(pos_data)
neg_labels = [-1] * len(neg_data)

model.predict_baseline(pos_data + neg_data, pos_labels + neg_labels)




0.618348623853211


In [513]:
model = NBAData()

In [514]:
import numpy as np

tagged_files = ['tagged_positive_tweets.txt', 'tagged_negative_tweets.txt']
pos_data = model.get_processed_data('tagged_positive_tweets.txt') # list of processed tweets
neg_data = model.get_processed_data('tagged_negative_tweets.txt')

res = model.transform_data(pos_data + neg_data)
X = res['X']
pos_labels = [1] * len(pos_data)
neg_labels = [-1] * len(neg_data)
y = pos_labels + neg_labels
print("Number of training samples: {}".format(len(y)))

nb_clf = model.predict(X, y, MultinomialNB(alpha=0.5))
svc_clf = model.predict(X, y, LinearSVC())



# tf_rnn = TfRNNClassifier(
#     model.get_vocab(pos_data + neg_data),
#     embed_dim=50,
#     hidden_dim=50,
#     max_length=200, # idk how long a tweet is
#     hidden_activation=tf.nn.tanh,
#     cell_class=tf.nn.rnn_cell.LSTMCell,
#     train_embedding=True,
#     max_iter=100,
#     eta=0.05) 

# rnn_X = [[w for w in d.split()] for d in pos_data + neg_data]
# X_train, X_test, y_train, y_test = train_test_split(rnn_X, y, test_size=0.2, random_state=42) 

# tf_rnn.fit(X_train, y_train)
# tf_rnn_dev_predictions = tf_rnn.predict(X_test)
# print(classification_report(y_test, tf_rnn_dev_predictions))

# print(accuracy_score(y_test, tf_rnn_dev_predictions))

Number of training samples: 545
Accuracy: 0.661
             precision    recall  f1-score   support

         -1      0.565     0.848     0.678        46
          1      0.825     0.524     0.641        63

avg / total      0.715     0.661     0.657       109

  (0, 398)	1.0
  (0, 399)	1.0
  (0, 539)	1.0
  (0, 1795)	1.0
  (0, 1796)	1.0
  (0, 2322)	1.0
  (0, 2323)	1.0
  (0, 2484)	1.0
  (0, 2486)	1.0
  (0, 2590)	1.0
  (0, 2591)	1.0
  (0, 2923)	1.0
  (0, 2934)	1.0
  (0, 3152)	2.0
  (0, 3154)	1.0
  (0, 3155)	1.0
  (0, 3206)	1.0
  (0, 3215)	1.0
  (0, 3395)	1.0
  (0, 3397)	1.0
  (0, 3973)	1.0
  (0, 3974)	1.0
  (0, 4045)	1.0
  (0, 4046)	1.0
  (0, 4870)	1.0
  :	:
  (35, 4514)	1.0
  (35, 9495)	1.0
  (35, 9496)	1.0
  (35, 11848)	1.0
  (35, 11851)	1.0
  (36, 950)	1.0
  (36, 2375)	1.0
  (36, 2378)	1.0
  (36, 2613)	1.0
  (36, 2639)	1.0
  (36, 3504)	1.0
  (36, 3539)	1.0
  (36, 4894)	1.0
  (36, 4895)	1.0
  (36, 6982)	1.0
  (36, 6983)	1.0
  (36, 7864)	1.0
  (36, 7865)	1.0
  (36, 9409)	1.0
  (36, 946