In [1]:
import pandas as pd 
import numpy as np
import string, re
import nltk
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

from time import time

%matplotlib inline



In [2]:
data = pd.read_csv('Data/tweets.txt', sep = ';~;', engine='python')
PositiveTweets = pd.read_csv('Data/tweetsPositive.txt', sep = ';~;', engine='python')
NegativeTweets = pd.read_csv('Data/tweetsNegative.txt', sep = ';~;', engine='python')

emoji_list = pd.read_csv('Data/emoji_table.txt', encoding='utf-8', index_col=0).index.values
SentimentEmoji = pd.read_csv('Data/Emoji_classification.csv', encoding='utf-8').dropna()
SentimentHashtags = pd.read_csv('Data/hashtags.csv', encoding='utf-8').dropna()

## The test set for hillary
hillaryTest = pd.read_csv('Hillary.csv')
TrumpTest = pd.read_csv('Trump.csv')

In [3]:
# List of positive and negative tweets
sad = [':‑(', ':(', ':‑c', ':c', ':‑<', ':<', ':‑[' ,':[', ':-||', '>:[', ':{', ':@', '>:(']
Positive = [':‑)',':)', ':-]', ':]',':-3', ':3', ':->', ':>' ,'8-)', '8)',':-}', ':}', ':o)', ':c)', ':^)' ,'=]', '=)'
           ,':‑D', ':D', '8‑D', '8D', 'x‑D', 'xD', 'X‑D', 'XD', '=D', '=3', 'B^D']
SentimentHashtags['HashtagSentiment'] = SentimentHashtags['HashtagSentiment'].map({'Positive':1, 'Negative':-1})
SentimentEmoji['Sentiment'] = SentimentEmoji['Sentiment'].map({'Positive':1, 'Negative':-1, 'Neutral':0}).dropna()
SentimentHashtags['Directed'] = SentimentHashtags['Directed'].map({'T':1, 'H':0})
hillaryTest.Sentiment = hillaryTest.Sentiment.map({'Positive':1, 'Negative':-1, 'Neutral':0})
TrumpTest.Sentiment = TrumpTest.Sentiment.map({'Positive':1, 'Negative':-1, 'Neutral':0})

In [4]:
stop_list = nltk.corpus.stopwords.words('english') + ["rt"] # rt - stands for retweet
lemmatizer = nltk.stem.WordNetLemmatizer()

# regex for capturing tweets
reg = '(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)'
emoticons = "|".join(map(re.escape, sad + Positive))

emoji_pattern = re.compile(u'('
    u'\ud83c[\udf00-\udfff]|'
    u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
    u'[\u2600-\u26FF\u2700-\u27BF])+', 
    re.UNICODE)
classifier =[]
def preprocess(tweet):
    # only processing if the the value is a string
    if type(tweet)!=type(2.0):
        tweet = tweet.decode('latin-1').encode("utf-8").decode('utf-8').strip()
        tweet = tweet.lower()
        # Removing hashtags
        tweet = " ".join(tweet.split('#'))
        # Removing URLs
        tweet = re.sub('((www\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub('((pic\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub("(http\S+)|(https\S+)", '', tweet)
        # Adding this pattern to the last cause it will remove everything after the start of a URL
        tweet = re.sub(u'[a-zA-Z0-9./]+\.[a-zA-Z0-9./ ]+.*$','',tweet)
        
        # Removing User mentions
        tweet = re.sub('@[^\s]+','',tweet)
        tweet = tweet.strip('\'"')
        # Removing stop words - This can be moved to count vectorization
        # tweet  = " ".join([word for word in tweet.split(" ") if word not in stop_list])
        # lemmatizing words 
        tweet = " ".join([lemmatizer.lemmatize(word) for word in tweet.split(" ")])
    else:
        tweet=''
    return tweet

def extractEmoticons(tweet):
    # emoji = emoji_pattern.findall(tweet)
    emoji = []
    for emo in emoji_list:
        if emo in tweet:
            emoji.append(emo)
    
    # these are :) :-) and other stuff
    emoticons = re.findall(reg, tweet)
    return " , ".join(emoji + emoticons)
def removeEmoticons(tweet):
    return re.sub(reg,'',tweet)

#Processing the tweets
data['processed_text'] = data.text.apply(preprocess)
hillaryTest['processed_text'] = hillaryTest.processed_text.apply(preprocess)
TrumpTest['processed_text'] = TrumpTest.processed_text.apply(preprocess)
PositiveTweets['processed_text'] = PositiveTweets.text.apply(preprocess)
NegativeTweets['processed_text'] = NegativeTweets.text.apply(preprocess)


#getting the emoticons from the cleaned data
data['emoticons'] = data['processed_text'].apply(extractEmoticons)

# Removing emoticons from the text data
data['processed_text'] = data['processed_text'].apply(removeEmoticons)

data = data.append(PositiveTweets).append(NegativeTweets)
print 'Completed'

Completed


In [5]:
HillaryTweets = data[data['processed_text'].str.contains('((hil.?ary)|(clinton))', case = False)]
DonaldTweets = data[data['processed_text'].str.contains('trump', case = False)]

datasets = [HillaryTweets.copy(), DonaldTweets.copy()]
TrainSets = []
for i, dataset in enumerate(datasets):
    hashtags = datasets[i]['hashtags'].copy().str.split(' ').apply(pd.Series, 1).stack()
    hashtags.index = hashtags.index.droplevel(-1)
    datasets[i].drop('hashtags', axis=1, inplace=True)
    hashtags.name = 'hashtags'
    
    datasets[i] = datasets[i].join(hashtags.str.strip())
    
    emoticons = datasets[i]['emoticons'].copy().str.split(' ').apply(pd.Series, 1).stack()
    emoticons.index = emoticons.index.droplevel(-1)
    datasets[i].drop('emoticons', axis=1, inplace=True)
    emoticons.name = 'emoticons'
    datasets[i] = datasets[i].join(emoticons.str.strip())
    
    Directed_hashtags = SentimentHashtags[SentimentHashtags['Directed'] == i].copy() 
    Opp_hashtags = SentimentHashtags[SentimentHashtags['Directed'] != i].copy()
    Opp_hashtags.loc[: ,'HashtagSentiment'] = Opp_hashtags.HashtagSentiment * -1;
    
    Directed_hashtags = Directed_hashtags.append(Opp_hashtags)
    datasets[i] = pd.merge(datasets[i], Directed_hashtags, on = 'hashtags', how='outer')
    datasets[i] = pd.merge(datasets[i], SentimentEmoji, on = 'emoticons', how='outer')
    datasets[i]['Sentiment'] = datasets[i]['HashtagSentiment'].add(datasets[i]['Sentiment'], fill_value = 0)
    TrainSets.append(datasets[i][['username', 'date', 'processed_text', 'Sentiment']].dropna().groupby(['processed_text', 'Sentiment']).max().reset_index())

  if __name__ == '__main__':


In [10]:
# Checking for HIllary
print pd.merge(TrainSets[1], TrumpTest, on = 'processed_text').shape
print pd.merge(TrainSets[0], hillaryTest, on = 'processed_text').shape

(2, 20)
(0, 20)


In [37]:
data_train = TrainSets[0][['processed_text','Sentiment']].copy().dropna()
vectorizers = [TfidfVectorizer(stop_words=stop_list,ngram_range = (1,3)), TfidfVectorizer(stop_words=stop_list)]
vectorizersName = ['TF-IDF', 'TF-IDF with out ngram']
for k, vectorizer in enumerate(vectorizers):
    X = vectorizer.fit_transform(data_train.processed_text.append(hillaryTest.processed_text))
    X_train = X[0:data_train.processed_text.shape[0]]
    Y_train = data_train['Sentiment']
    X_test = X[data_train.processed_text.shape[0]:]
    models = [RandomForestClassifier(),LinearSVC()]
    modelsName = ['RandomForest','Linear SVC']
    for m, model in enumerate(models):
        model.fit(X_train, Y_train)
        preds = model.predict(X_test.todense())
        score = 0
        tot = 0
        for i, pred in enumerate(preds):
            if(hillaryTest.Sentiment[i] == hillaryTest.Sentiment[i]):
                tot+=1
                if(hillaryTest.Sentiment[i] == pred):
                    score+=1

        print 'The score for Vectorizer:', vectorizersName[k],', Model:', modelsName[m], score, tot

The score for Vectorizer: TF-IDF , Model: RandomForest 38 72
The score for Vectorizer: TF-IDF , Model: Linear SVC 39 72
The score for Vectorizer: TF-IDF with out ngram , Model: RandomForest 38 72
The score for Vectorizer: TF-IDF with out ngram , Model: Linear SVC 41 72


In [47]:
data_train = TrainSets[1][['processed_text','Sentiment']].copy().dropna()
vectorizers = [TfidfVectorizer(stop_words=stop_list,ngram_range = (1,3)), TfidfVectorizer(stop_words=stop_list)]
vectorizersName = ['TF-IDF', 'TF-IDF with out ngram']
for k, vectorizer in enumerate(vectorizers):
    X = vectorizer.fit_transform(data_train.processed_text.append(TrumpTest.processed_text))
    X_train = X[0:data_train.processed_text.shape[0]]
    Y_train = data_train['Sentiment']
    X_test = X[data_train.processed_text.shape[0]:]
    models = [RandomForestClassifier(),LinearSVC()]
    modelsName = ['RandomForest','Linear SVC']
    for m, model in enumerate(models):
        model.fit(X_train, Y_train)
        preds = model.predict(X_test.todense())
        score = 0
        tot = 0
        for i, pred in enumerate(preds):
            if(TrumpTest.Sentiment[i] == TrumpTest.Sentiment[i]):
                tot+=1
                if(TrumpTest.Sentiment[i] == pred):
                    score+=1

        print 'The score for Vectorizer:', vectorizersName[k],', Model:', modelsName[m], score, tot


The score for Vectorizer: TF-IDF , Model: RandomForest 40 68
The score for Vectorizer: TF-IDF , Model: Linear SVC 43 68
The score for Vectorizer: TF-IDF with out ngram , Model: RandomForest 36 68
The score for Vectorizer: TF-IDF with out ngram , Model: Linear SVC 46 68


In [14]:
import logging
from gensim.models import word2vec

def get_words(tweet):
    return tweet.split(' ')
tweets = pd.Series(data['processed_text'].unique()).apply(get_words)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 140    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)

print "Training model..."
model = word2vec.Word2Vec(tweets, workers=num_workers, size=num_features, min_count = min_word_count, window = context,
                          sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "30features_40minwords_10context"
model.save(model_name)

Training model...


In [35]:
lin_clf = LinearSVC()
lin_clf.fit(X_train, Y_train) 
preds = lin_clf.predict(X_test.todense())
score = 0
tot = 0
for i, pred in enumerate(preds):
    if(TrumpTest.Sentiment[i] == TrumpTest.Sentiment[i]):
        tot+=1
    if(TrumpTest.Sentiment[i] == pred):
        score+=1

print 'The score for Vectorizer: TF-IDF', 'Model: Linear SVC', score, tot

The score for Vectorizer: TF-IDF Model: Linear SVC 43 68


In [46]:
HillaryTweets

Unnamed: 0,date,emoticons,favorites,geo,hashtags,id,mentions,permalink,processed_text,retweets,text,username
0,2016-10-01 15:51,,1216,,,"""782352194473459713""",,https://twitter.com/mitchellvii/status/7823521...,hillary attacked trump for (allegedly) calling...,878,"""Hillary attacked Trump for (allegedly) callin...",mitchellvii
3,2016-10-01 13:14,,415,,#BasementDwellers,"""782312789968814080""",,https://twitter.com/MikePenceVP/status/7823127...,"hillary called trump supporter ""deplorable"" an...",471,"""Hillary called Trump Supporters ""deplorable"" ...",MikePenceVP
5,2016-10-01 16:59,,0,,,"""782369482610122752""",,https://twitter.com/mykal57/status/78236948261...,social experiment: go to donald trump 's page ...,0,"""SOCIAL EXPERIMENT: Go to Donald Trump 's page...",mykal57
6,2016-10-01 16:59,,0,,,"""782369463198887936""",,https://twitter.com/monteromo08/status/7823694...,clinton's remark on young voter fuel new trump...,0,"""Clinton's remarks on young voters fuel new Tr...",monteromo08
7,2016-10-01 16:36,,842,,,"""782363604200849408""",,https://twitter.com/asamjulian/status/78236360...,"if trump get 15,000 to hillary 's 800 in the s...",609,"""If Trump gets 15,000 to Hillary 's 800 in the...",asamjulian
8,2016-10-01 16:59,,3,,,"""782369461810724864""",,https://twitter.com/JustNana620/status/7823694...,a daughterâs heartfelt letter on her republi...,3,"""A daughter’s heartfelt letter on her Republic...",JustNana620
10,2016-10-01 16:59,,23,,,"""782369446601953280""",,https://twitter.com/blainesearson34/status/782...,i don't want donald trump to win but at the sa...,7,"""I don't want Donald Trump to win but at the s...",blainesearson34
13,2016-10-01 16:59,,16,,,"""782369397419671552""",,https://twitter.com/halsteadg048/status/782369...,jesse watters on usa editorial board endorseme...,17,"""Jesse Watters on USA Editorial Board Endorsem...",halsteadg048
15,2016-10-01 16:59,,8,,,"""782369393829224448""",,https://twitter.com/Tom_Francois/status/782369...,"hillary : "" trump is pathetic! taking his fath...",10,"""Hillary : "" Trump is pathetic! Taking his fat...",Tom_Francois
16,2016-10-01 16:59,,3,"Country Club, FL",#ImWithHer,"""782369391568621569""",@PolitiFactWisc,https://twitter.com/kirk_boundy/status/7823693...,lying trump say hillary clinton 'gave up' one-...,5,"""Lying Trump says Hillary Clinton 'gave up' on...",kirk_boundy


In [10]:
data_train['testColumn'] = data_train.Sentiment

In [13]:
data_train = data_train.drop('testColumn', axis=1)