In [1]:
import pandas as pd 
import numpy as np
import string, re
import nltk
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

from time import time
from pos_sentiment_scoring import Splitter
from pos_sentiment_scoring import POSTagger
from pos_sentiment_scoring import DictionaryTagger

%matplotlib inline



In [2]:
data = pd.read_csv('Data/tweets.txt', sep = ';~;', engine='python')
PositiveTweets = pd.read_csv('Data/tweetsPositive.txt', sep = ';~;', engine='python')
NegativeTweets = pd.read_csv('Data/tweetsNegative.txt', sep = ';~;', engine='python')

emoji_list = pd.read_csv('Data/emoji_table.txt', encoding='utf-8', index_col=0).index.values
SentimentEmoji = pd.read_csv('Data/Emoji_classification.csv', encoding='utf-8').dropna()
SentimentHashtags = pd.read_csv('Data/hashtags.csv', encoding='utf-8').dropna()

## The test set for hillary
hillaryTest = pd.read_csv('Hillary.csv')
TrumpTest = pd.read_csv('Trump.csv')

In [3]:
# List of positive and negative tweets
sad = [':‑(', ':(', ':‑c', ':c', ':‑<', ':<', ':‑[' ,':[', ':-||', '>:[', ':{', ':@', '>:(']
Positive = [':‑)',':)', ':-]', ':]',':-3', ':3', ':->', ':>' ,'8-)', '8)',':-}', ':}', ':o)', ':c)', ':^)' ,'=]', '=)'
           ,':‑D', ':D', '8‑D', '8D', 'x‑D', 'xD', 'X‑D', 'XD', '=D', '=3', 'B^D']
SentimentHashtags['HashtagSentiment'] = SentimentHashtags['HashtagSentiment'].map({'Positive':1, 'Negative':-1})
SentimentEmoji['Sentiment'] = SentimentEmoji['Sentiment'].map({'Positive':1, 'Negative':-1, 'Neutral':0}).dropna()
SentimentHashtags['Directed'] = SentimentHashtags['Directed'].map({'T':1, 'H':0})
hillaryTest.Sentiment = hillaryTest.Sentiment.map({'Positive':1, 'Negative':-1, 'Neutral':0})
TrumpTest.Sentiment = TrumpTest.Sentiment.map({'Positive':1, 'Negative':-1, 'Neutral':0})

In [1]:
stop_list = nltk.corpus.stopwords.words('english') + ["rt"] # rt - stands for retweet
lemmatizer = nltk.stem.WordNetLemmatizer()

# regex for capturing tweets
reg = '(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)'
emoticons = "|".join(map(re.escape, sad + Positive))

emoji_pattern = re.compile(u'('
    u'\ud83c[\udf00-\udfff]|'
    u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
    u'[\u2600-\u26FF\u2700-\u27BF])+', 
    re.UNICODE)
classifier =[]
def preprocess(tweet):
    # only processing if the the value is a string
    if type(tweet)!=type(2.0):
        tweet = tweet.decode('latin-1').encode("utf-8").decode('utf-8').strip()
        tweet = tweet.lower()
        # Removing hashtags
        tweet = " ".join(tweet.split('#'))
        # Removing URLs
        tweet = re.sub('((www\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub('((pic\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub("(http\S+)|(https\S+)", '', tweet)
        # Adding this pattern to the last cause it will remove everything after the start of a URL
        tweet = re.sub(u'[a-zA-Z0-9./]+\.[a-zA-Z0-9./ ]+.*$','',tweet)
        
        # Removing User mentions
        tweet = re.sub('@[^\s]+','',tweet)
        tweet = tweet.strip('\'"')
        # Removing stop words - This can be moved to count vectorization
        # tweet  = " ".join([word for word in tweet.split(" ") if word not in stop_list])
        # lemmatizing words 
        tweet = " ".join([lemmatizer.lemmatize(word) for word in tweet.split(" ")])
    else:
        tweet=''
    return tweet

def extractEmoticons(tweet):
    # emoji = emoji_pattern.findall(tweet)
    emoji = []
    for emo in emoji_list:
        if emo in tweet:
            emoji.append(emo)
    
    # these are :) :-) and other stuff
    emoticons = re.findall(reg, tweet)
    return " , ".join(emoji + emoticons)
def removeEmoticons(tweet):
    return re.sub(reg,'',tweet)

#Processing the tweets
data['processed_text'] = data.text.apply(preprocess)
print data['processed_text']
hillaryTest['processed_text'] = hillaryTest.processed_text.apply(preprocess)
TrumpTest['processed_text'] = TrumpTest.processed_text.apply(preprocess)
PositiveTweets['processed_text'] = PositiveTweets.text.apply(preprocess)
NegativeTweets['processed_text'] = NegativeTweets.text.apply(preprocess)


#getting the emoticons from the cleaned data
data['emoticons'] = data['processed_text'].apply(extractEmoticons)

# Removing emoticons from the text data
data['processed_text'] = data['processed_text'].apply(removeEmoticons)

#Applying sentence splitting, POS tagging, and Dictionary Tagging
splitter = Splitter()
postagger = POSTagger()
dicttagger = DictionaryTagger([ 'dicts/positive.yml', 'dicts/negative.yml', 
                                    'dicts/inc.yml', 'dicts/dec.yml', 'dicts/inv.yml'])

splitted_sentences = splitter.split(data['processed_text'])

pos_tagged_sentences = postagger.pos_tag(splitted_sentences)

dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

# Sentiment score as a result of POS & Dictionary tagging
score = sentiment_score(dict_tagged_sentences)

data = data.append(PositiveTweets).append(NegativeTweets)

NameError: name 'nltk' is not defined

In [43]:
HillaryTweets = data[data['processed_text'].str.contains('((hil.?ary)|(clinton))', case = False)]
DonaldTweets = data[data['processed_text'].str.contains('trump', case = False)]

datasets = [HillaryTweets.copy(), DonaldTweets.copy()]
TrainSets = []
for i, dataset in enumerate(datasets):
    hashtags = datasets[i]['hashtags'].copy().str.split(' ').apply(pd.Series, 1).stack()
    hashtags.index = hashtags.index.droplevel(-1)
    datasets[i].drop('hashtags', axis=1, inplace=True)
    hashtags.name = 'hashtags'
    
    datasets[i] = datasets[i].join(hashtags.str.strip())
    
    emoticons = datasets[i]['emoticons'].copy().str.split(' ').apply(pd.Series, 1).stack()
    emoticons.index = emoticons.index.droplevel(-1)
    datasets[i].drop('emoticons', axis=1, inplace=True)
    emoticons.name = 'emoticons'
    datasets[i] = datasets[i].join(emoticons.str.strip())
    
    Directed_hashtags = SentimentHashtags[SentimentHashtags['Directed'] == i].copy() 
    Opp_hashtags = SentimentHashtags[SentimentHashtags['Directed'] != i].copy()
    Opp_hashtags.loc[: ,'HashtagSentiment'] = Opp_hashtags.HashtagSentiment * -1;
    Directed_hashtags = Directed_hashtags.append(Opp_hashtags)
    
    datasets[i] = pd.merge(datasets[i], Directed_hashtags, on = 'hashtags', how='outer')
    datasets[i] = pd.merge(datasets[i], SentimentEmoji, on = 'emoticons', how='outer')
    
    sentiments_num = [-1, 0, 1]
    HashtagsSentiments = []
    EmoticonsSentiments = []
    for senti in sentiments_num:
        temp_hashtag = datasets[i][datasets[i].HashtagSentiment == senti]
        temp_emoticon = datasets[i][datasets[i].Sentiment == senti]
        temp_hashtag = temp_hashtag[['processed_text','HashtagSentiment']].dropna().groupby(['processed_text']).count().reset_index()
        temp_hashtag.columns = ['processed_text','HashtagSentiment_'+str(senti)]
        temp_emoticon = temp_emoticon[['processed_text','Sentiment']].dropna().groupby(['processed_text']).count().reset_index()
        temp_emoticon.columns = ['processed_text','Sentiment_'+str(senti)]
        HashtagsSentiments.append(temp_hashtag)
        EmoticonsSentiments.append(temp_emoticon)
    datasets[i] = pd.DataFrame(datasets[i]['processed_text'].unique())
    datasets[i].columns = ['processed_text']
    
    for count in HashtagsSentiments:
        datasets[i] = pd.merge(datasets[i], count, on='processed_text', how = 'outer')
    for count in EmoticonsSentiments:
        datasets[i] = pd.merge(datasets[i], count, on='processed_text', how = 'outer')
    #TrainSets.append(datasets[i][['username', 'date', 'processed_text', 'Sentiment']].dropna().groupby(['processed_text', 'Sentiment']).max().reset_index())

  if __name__ == '__main__':


In [71]:
datasets[0] = datasets[0].fillna(0)
datasets[1] = datasets[1].fillna(0)
def calculate_sentiment(row):
    neg = row['HashtagSentiment_-1'] + row['Sentiment_-1']
    neu = row.HashtagSentiment_0 + row.Sentiment_0
    pos = row.HashtagSentiment_1 + row.Sentiment_1
    if((neg == neu) and (neu == pos) and (pos == 0.0)):
        return -2
    if(neg > neu and neg > pos):
        return -1
    elif (neu > neg and neu > pos):
        return 0
    elif (pos > neg and pos > neu):
        return 1
    else:
        return 3
datasets[0]['finalSentiment'] = datasets[0].apply(calculate_sentiment, axis = 1)
datasets[1]['finalSentiment'] = datasets[1].apply(calculate_sentiment, axis = 1)

## Things to remember about the test dataset:
1. They do not contain emoticons. We used all the tweets with emoticons for training as they were very less

## Things to try out
1. Use voting to find out the actual sentiment

In [73]:
print 'Prediction accuracy for Hillary'
data_train = datasets[0][~((datasets[0].finalSentiment == 3) | (datasets[0].finalSentiment == -2))][['processed_text','finalSentiment']].copy().dropna()
vectorizers = [TfidfVectorizer(stop_words=stop_list,ngram_range = (1,3)), TfidfVectorizer(stop_words=stop_list)]
vectorizersName = ['TF-IDF', 'TF-IDF with out ngram']
for k, vectorizer in enumerate(vectorizers):
    X = vectorizer.fit_transform(data_train.processed_text.append(hillaryTest.processed_text))
    X_train = X[0:data_train.processed_text.shape[0]]
    Y_train = data_train['finalSentiment']
    X_test = X[data_train.processed_text.shape[0]:]
    models = [RandomForestClassifier(),LinearSVC()]
    modelsName = ['RandomForest','Linear SVC']
    for m, model in enumerate(models):
        model.fit(X_train, Y_train)
        preds = model.predict(X_test.todense())
        score = 0
        tot = 0
        for i, pred in enumerate(preds):
            if(hillaryTest.Sentiment[i] == hillaryTest.Sentiment[i]):
                tot+=1
                if(hillaryTest.Sentiment[i] == pred):
                    score+=1

        print 'The score for Vectorizer:', vectorizersName[k],', Model:', modelsName[m], score, tot

The score for Vectorizer: TF-IDF , Model: RandomForest 44 72
The score for Vectorizer: TF-IDF , Model: Linear SVC 43 72
The score for Vectorizer: TF-IDF with out ngram , Model: RandomForest 43 72
The score for Vectorizer: TF-IDF with out ngram , Model: Linear SVC 44 72


In [75]:
print 'Prediction accuracy for Donald Trump'
data_train = datasets[1][~((datasets[1].finalSentiment == 3) | (datasets[1].finalSentiment == -2))][['processed_text','finalSentiment']].copy().dropna()
vectorizers = [TfidfVectorizer(stop_words=stop_list,ngram_range = (1,3)), TfidfVectorizer(stop_words=stop_list)]
vectorizersName = ['TF-IDF', 'TF-IDF with out ngram']
for k, vectorizer in enumerate(vectorizers):
    X = vectorizer.fit_transform(data_train.processed_text.append(TrumpTest.processed_text))
    X_train = X[0:data_train.processed_text.shape[0]]
    Y_train = data_train['finalSentiment']
    X_test = X[data_train.processed_text.shape[0]:]
    models = [RandomForestClassifier(),LinearSVC()]
    modelsName = ['RandomForest','Linear SVC']
    for m, model in enumerate(models):
        model.fit(X_train, Y_train)
        preds = model.predict(X_test.todense())
        score = 0
        tot = 0
        for i, pred in enumerate(preds):
            if(TrumpTest.Sentiment[i] == TrumpTest.Sentiment[i]):
                tot+=1
                if(TrumpTest.Sentiment[i] == pred):
                    score+=1

        print 'The score for Vectorizer:', vectorizersName[k],', Model:', modelsName[m], score, tot


The score for Vectorizer: TF-IDF , Model: RandomForest 40 68
The score for Vectorizer: TF-IDF , Model: Linear SVC 43 68
The score for Vectorizer: TF-IDF with out ngram , Model: RandomForest 40 68
The score for Vectorizer: TF-IDF with out ngram , Model: Linear SVC 47 68


In [14]:
import logging
from gensim.models import word2vec

def get_words(tweet):
    return tweet.split(' ')
tweets = pd.Series(data['processed_text'].unique()).apply(get_words)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 140    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)

print "Training model..."
model = word2vec.Word2Vec(tweets, workers=num_workers, size=num_features, min_count = min_word_count, window = context,
                          sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "30features_40minwords_10context"
model.save(model_name)

Training model...


In [35]:
lin_clf = LinearSVC()
lin_clf.fit(X_train, Y_train) 
preds = lin_clf.predict(X_test.todense())
score = 0
tot = 0
for i, pred in enumerate(preds):
    if(TrumpTest.Sentiment[i] == TrumpTest.Sentiment[i]):
        tot+=1
    if(TrumpTest.Sentiment[i] == pred):
        score+=1

print 'The score for Vectorizer: TF-IDF', 'Model: Linear SVC', score, tot

The score for Vectorizer: TF-IDF Model: Linear SVC 43 68


In [16]:
TrainSets[1].Sentiment.unique()

array([-1.,  1.,  0.,  2.])

In [30]:
Sentiments_num = [-1]

temp
for senti in Sentiments_num:
    temp = datasets[0][datasets[0].HashtagSentiment == senti]
    temp = temp[['processed_text','HashtagSentiment']].dropna().groupby(['processed_text']).count().reset_index()

In [70]:
datasets[0][~((datasets[0].finalSentiment == 3) | (datasets[0].finalSentiment == -2))]

Unnamed: 0,processed_text,HashtagSentiment_-1,HashtagSentiment_0,HashtagSentiment_1,Sentiment_-1,Sentiment_0,Sentiment_1,finalSentiment
21,donald trump open new line of attack on hillar...,7.0,0.0,0.0,0.0,0.0,0.0,-1
34,"in viral post, entire star trek cast begs amer...",1.0,0.0,0.0,0.0,0.0,0.0,-1
43,gary johnson sound more reasonable in these 5 ...,4.0,0.0,0.0,0.0,0.0,0.0,-1
51,the joker resembles more of donald trump than ...,8.0,0.0,0.0,0.0,0.0,0.0,-1
80,hillary ha nothing to run,1.0,0.0,0.0,0.0,0.0,0.0,-1
85,spread this now! hillary in panic mode video o...,0.0,0.0,6.0,0.0,0.0,0.0,1
87,"hillary clinton is 'nasty, but i can be nastie...",1.0,0.0,0.0,0.0,0.0,0.0,-1
93,i honestly thought it would take more than 90 ...,1.0,0.0,0.0,0.0,0.0,0.0,-1
98,"you're voting for a man who steal from vets, ...",0.0,0.0,6.0,0.0,0.0,0.0,1
109,"hillary is nasty, but trump promise he is nast...",1.0,0.0,0.0,0.0,0.0,0.0,-1
