In [1]:
import os
import re as regex
from collections import Counter
from time import time

import gensim
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score


In [3]:
class TwitterData_Initialize():
    """Loads a tweet CSV and holds the state shared by the later pipeline stages.

    The class-level attributes act as defaults: the stage subclasses in this
    notebook define their own ``__init__`` and only copy part of the state, so
    any attribute they do not set falls back to the values below.
    """

    data = []            # tweets as loaded from the CSV (a DataFrame after initialize())
    processed_data = []  # working copy that the later stages transform
    wordlist = []        # vocabulary, filled in by a later stage

    data_model = None    # feature matrix (or a cached one read from disk)
    data_labels = None   # labels belonging to data_model
    is_testing = False   # True when the file was loaded without an "emotion" column

    def initialize(self, csv_file, is_testing_set=False, from_cached=None):
        """Read the tweets from ``csv_file`` or a prebuilt model from ``from_cached``.

        Args:
            csv_file: path of a CSV with a header row. Training files are read
                as (id, emotion, text); test files as (id, text), limited to
                the first 4000 rows.
            is_testing_set: when True the file is treated as unlabeled test data.
            from_cached: optional path of a CSV holding an already built data
                model. When given, only ``data_model`` is loaded and
                ``csv_file`` is not read.
        """
        # Record the flag first so it is also correct on the cached path.
        self.is_testing = is_testing_set

        if from_cached is not None:
            self.data_model = pd.read_csv(from_cached)
            return

        if not is_testing_set:
            tweets = pd.read_csv(csv_file, header=0, names=["id", "emotion", "text"])
            keep = tweets["emotion"].isin(["positive", "negative", "neutral"])
        else:
            tweets = pd.read_csv(csv_file, header=0, names=["id", "text"],
                                 dtype={"id": "int64", "text": "str"}, nrows=4000)
            keep = tweets["id"].notnull() & tweets["text"].notnull()

        # Take real copies: a boolean-filtered frame is a slice of `tweets`, and
        # writing to it later raises pandas' SettingWithCopyWarning. A separate
        # copy for processed_data also keeps the loaded tweets unchanged.
        self.data = tweets.loc[keep, :].copy()
        self.processed_data = self.data.copy()
        self.wordlist = []
        self.data_model = None
        self.data_labels = None

In [4]:
# Load the labelled training tweets from the CSV and preview the first five rows.
data = TwitterData_Initialize()
data.initialize("kaggletrain.csv")
data.processed_data.head(5)

Unnamed: 0,id,emotion,text
0,635769805279248384,negative,Not Available
1,635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...
2,635950258682523648,neutral,"Mar if you have an iOS device, you should down..."
3,636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...
4,636100906224848896,positive,Not sure how to start your publication on iOS?...


## Data distribution
The first thing that can be done as soon as the data is loaded is to look at the data distribution. The training set has the following distribution:


In [5]:
# Count the tweets of each sentiment and draw them as a bar chart.
# NOTE(review): `plotly` and `graph_objs` are not imported in the visible part
# of the notebook - presumably a cell that is not shown provides them; confirm.
df = data.processed_data
neg, neu, pos = (
    int((df["emotion"] == label).sum())
    for label in ("negative", "neutral", "positive")
)

sentiment_bars = graph_objs.Bar(
    x=["negative","neutral","positive"],
    y=[neg, neu, pos],
)
dist = [sentiment_bars]
distribution_layout = graph_objs.Layout(title="Sentiment type distribution in training set")
plotly.offline.iplot({"data":dist, "layout":distribution_layout})

In [None]:
## Preprocessing steps
The target of the following preprocessing is to create a **Bag-of-Words** representation of the data. The steps will execute as follows:
1. Cleansing
<ol style="list-style-type:decimal"><li>Remove URLs</li>
<li>Remove usernames (mentions)</li>
<li>Remove tweets with *Not Available* text</li>
<li>Remove special characters</li>
<li>Remove numbers</li></ol>
1. Text processing
<ol style="list-style-type:decimal">
<li>Tokenize</li>
<li>Transform to lowercase</li>
<li>Stem</li></ol>
1. Build word list for Bag-of-Words

### Cleansing
For the purpose of cleansing, I created the ```TwitterCleanuper``` class. It consists of methods that carry out all of the tasks shown in the list above. Most of them are implemented using regular expressions.
The class exposes its interface through the ```iterate()``` method, which yields every cleanup method in the proper order.

In [6]:
class TwitterCleanuper:
    """Regex-based cleanup steps for the "text" column of a tweets DataFrame.

    Every step takes a DataFrame and returns a new one. The frame passed in is
    never modified, so the steps are safe to run on filtered slices and do not
    trigger pandas' chained-assignment warnings.
    """

    # Characters stripped by remove_special_chars().
    SPECIAL_CHARS = ",:\"=&;%$@^*(){}[]|/\\><-!?.'#"

    URL_PATTERN = regex.compile(r"https?://[^\s]+[\s]?")
    USERNAME_PATTERN = regex.compile(r"@[^\s]+[\s]?")
    NUMBER_PATTERN = regex.compile(r"\s?[0-9]+\.?[0-9]*")
    # One character class covers the whole set, so a single pass is enough.
    SPECIAL_CHARS_PATTERN = regex.compile("[" + regex.escape(SPECIAL_CHARS) + "]")

    def iterate(self):
        """Yield the cleanup methods in the order they have to be applied.

        URLs and usernames come first because their patterns rely on
        characters (":", "/", "@") that remove_special_chars() strips.
        """
        yield from (self.remove_urls,
                    self.remove_usernames,
                    self.remove_na,
                    self.remove_special_chars,
                    self.remove_numbers)

    @staticmethod
    def remove_by_regex(tweets, regexp):
        """Return a copy of ``tweets`` with every match of ``regexp`` deleted from "text"."""
        cleaned_text = tweets["text"].replace(regexp, "", regex=True)
        return tweets.assign(text=cleaned_text)

    def remove_urls(self, tweets):
        """Delete http/https URLs together with one trailing whitespace character."""
        return TwitterCleanuper.remove_by_regex(tweets, self.URL_PATTERN)

    def remove_na(self, tweets):
        """Drop the rows whose text is exactly "Not Available"."""
        return tweets[tweets["text"] != "Not Available"]

    def remove_special_chars(self, tweets):  # it unrolls the hashtags to normal words
        """Delete every character listed in SPECIAL_CHARS (this turns "#tag" into "tag")."""
        return TwitterCleanuper.remove_by_regex(tweets, self.SPECIAL_CHARS_PATTERN)

    def remove_usernames(self, tweets):
        """Delete @mentions together with one trailing whitespace character."""
        return TwitterCleanuper.remove_by_regex(tweets, self.USERNAME_PATTERN)

    def remove_numbers(self, tweets):
        """Delete digit runs (optionally with a decimal part) and one leading whitespace character."""
        return TwitterCleanuper.remove_by_regex(tweets, self.NUMBER_PATTERN)

The loaded tweets can now be cleaned.

In [7]:
class TwitterData_Cleansing(TwitterData_Initialize):
    """Pipeline stage that runs a cleanuper's steps over the loaded tweets."""

    def __init__(self, previous):
        """Take over the working data of the ``previous`` stage.

        Args:
            previous: an earlier stage object exposing ``processed_data``.
        """
        self.processed_data = previous.processed_data
        # Carry the test-set flag along; otherwise the class-level default
        # (False) would silently apply and cleanup() would treat test data
        # like training data.
        self.is_testing = getattr(previous, "is_testing", False)

    def cleanup(self, cleanuper):
        """Apply every method yielded by ``cleanuper.iterate()`` in order.

        For test data the ``remove_na`` step is skipped, so no rows are
        dropped there (presumably so every test id keeps a row - confirm).

        Args:
            cleanuper: object whose ``iterate()`` yields callables that take a
                DataFrame and return the cleaned DataFrame.
        """
        cleaned = self.processed_data
        for cleanup_method in cleanuper.iterate():
            if self.is_testing and cleanup_method.__name__ == "remove_na":
                continue
            cleaned = cleanup_method(cleaned)

        self.processed_data = cleaned

# My first_test

In [8]:
# Wrap the loaded data in the cleansing stage, run every cleanup step and
# preview the result. Note that `data` is rebound to the new stage object.
data = TwitterData_Cleansing(data)
data.cleanup(TwitterCleanuper())
data.processed_data.head(5)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,id,emotion,text
1,635930169241374720,neutral,IOS App Transport Security Mm need to check if...
2,635950258682523648,neutral,Mar if you have an iOS device you should downl...
3,636030803433009153,negative,my phone does not run on latest IOS which may ...
4,636100906224848896,positive,Not sure how to start your publication on iOS ...
5,636176272947744772,neutral,Two Dollar Tuesday is here with Forklift Quick...


### Tokenization & stemming
For the text processing, the ```nltk``` library is used. First, the tweets are tokenized using ```nltk.word_tokenize``` and then stemming is done using **PorterStemmer**.


In [9]:
class TwitterData_TokenStem(TwitterData_Cleansing):
    """Pipeline stage that tokenizes the tweet text and stems the tokens.

    ``tokenize()`` has to run before ``stem()``: stemming maps over the
    elements of each "text" cell and therefore expects a list of tokens.
    """

    def __init__(self, previous):
        """Take over the working data of the ``previous`` stage."""
        self.processed_data = previous.processed_data
        # Keep the test-set flag; otherwise the class-level default applies.
        self.is_testing = getattr(previous, "is_testing", False)

    def stem(self, stemmer=None):
        """Lowercase and stem every token in the "text" column.

        Args:
            stemmer: object with a ``stem(word)`` method; a fresh
                ``nltk.PorterStemmer`` is created when omitted.
        """
        if stemmer is None:
            stemmer = nltk.PorterStemmer()

        stemmed = self.processed_data["text"].map(
            lambda tokens: [stemmer.stem(token.lower()) for token in tokens])
        self.processed_data = self.processed_data.assign(text=stemmed)

    def tokenize(self, tokenizer=nltk.word_tokenize):
        """Replace "text" with its token list and keep a copy in "tokenized_text".

        Args:
            tokenizer: callable turning a string into a list of tokens.
        """
        tokens = self.processed_data["text"].map(tokenizer)
        # "tokenized_text" gets its own list objects so that later changes to
        # the "text" lists do not leak into it.
        self.processed_data = self.processed_data.assign(
            text=tokens,
            tokenized_text=tokens.map(lambda row_tokens: list(row_tokens)))


In [10]:
# Move on to the token/stem stage. tokenize() must run before stem(), because
# stem() maps over the token list stored in "text".
data = TwitterData_TokenStem(data)
data.tokenize()
data.stem()
data.processed_data.head(5)

Unnamed: 0,id,emotion,text,tokenized_text
1,635930169241374720,neutral,"[io, app, transport, secur, mm, need, to, chec...","[IOS, App, Transport, Security, Mm, need, to, ..."
2,635950258682523648,neutral,"[mar, if, you, have, an, io, devic, you, shoul...","[Mar, if, you, have, an, iOS, device, you, sho..."
3,636030803433009153,negative,"[my, phone, doe, not, run, on, latest, io, whi...","[my, phone, does, not, run, on, latest, IOS, w..."
4,636100906224848896,positive,"[not, sure, how, to, start, your, public, on, ...","[Not, sure, how, to, start, your, publication,..."
5,636176272947744772,neutral,"[two, dollar, tuesday, is, here, with, forklif...","[Two, Dollar, Tuesday, is, here, with, Forklif..."


# Building the wordlist


In [11]:
# Tally how often every stemmed token occurs over all tweets.
words = Counter()
for idx, tokens in data.processed_data["text"].items():
    words.update(tokens)

words.most_common(5)

[('the', 3744), ('to', 2477), ('i', 1667), ('a', 1620), ('on', 1557)]

The most common words (as expected) are the typical English stopwords. We will filter them out; however, as the purpose of this analysis is to determine sentiment, words like "not" and "n't" can influence it greatly. With this in mind, these words will be whitelisted.


In [12]:
# Remove the English stopwords from the counts, except for the whitelisted
# negations, which carry sentiment.
# NOTE(review): the stopword list is unstemmed while the counted tokens are
# stemmed, so stemmed stopwords (e.g. "thi" in the output below) are not removed.
whitelist = ["n't", "not"]
stopwords = nltk.corpus.stopwords.words("english")
for idx, stop_word in enumerate(stopwords):
    if stop_word in whitelist:
        continue
    # Same as `del` on a Counter: a missing key is ignored.
    words.pop(stop_word, None)
words.most_common(5)

[('may', 1027), ('tomorrow', 764), ('day', 526), ('go', 499), ('thi', 495)]

Still, there are some words that seem to occur too many times, so I filter them out as well. After some analysis, the lower bound was set to 3.

The wordlist is also saved to the csv file, so the same words can be used for the testing set.


In [13]:
class TwitterData_Wordlist(TwitterData_TokenStem):
    """Pipeline stage that builds (or reloads) the vocabulary for the bag-of-words."""

    whitelist = ["n't","not"]  # stopwords that are kept
    wordlist = []

    def __init__(self, previous):
        """Take over the working data of the ``previous`` stage."""
        self.processed_data = previous.processed_data
        # Keep the test-set flag; otherwise the class-level default applies.
        self.is_testing = getattr(previous, "is_testing", False)

    def build_wordlist(self, min_occurrences=3, max_occurences=500, stopwords=None,
                       whitelist=None, wordlist_path="data\\wordlist.csv"):
        """Fill ``self.wordlist`` with words occurring between the two bounds.

        If ``wordlist_path`` already exists, the words are read from it, so a
        test set reuses the vocabulary built from the training set. Otherwise
        the words are counted from the "text" token lists, stopwords are
        removed and the result is written to ``wordlist_path``.

        Args:
            min_occurrences: exclusive lower bound on a word's count.
            max_occurences: exclusive upper bound on a word's count.
            stopwords: iterable of words to drop; the NLTK English stopword
                list is loaded when omitted.
            whitelist: stopwords to keep anyway; defaults to ``self.whitelist``.
            wordlist_path: CSV file used to cache the word list.
        """
        self.wordlist = []
        whitelist = self.whitelist if whitelist is None else whitelist

        if os.path.isfile(wordlist_path):
            # keep_default_na=False: otherwise tokens such as "null" or "nan"
            # are parsed as float NaN instead of being kept as words.
            word_df = pd.read_csv(wordlist_path, keep_default_na=False)
            counts = word_df["occurrences"]
            in_range = (counts > min_occurrences) & (counts < max_occurences)
            self.wordlist = list(word_df.loc[in_range, "word"])
            return

        if stopwords is None:
            stopwords = nltk.corpus.stopwords.words("english")

        words = Counter()
        for tokens in self.processed_data["text"]:
            words.update(tokens)

        for stop_word in stopwords:
            if stop_word not in whitelist:
                del words[stop_word]  # Counter ignores missing keys on delete

        kept = [(word, count) for word, count in words.most_common()
                if min_occurrences < count < max_occurences]
        word_df = pd.DataFrame(kept, columns=["word", "occurrences"])

        target_dir = os.path.dirname(wordlist_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        word_df.to_csv(wordlist_path, index_label="idx")
        self.wordlist = [word for word, _ in kept]


In [14]:
# Move on to the wordlist stage and build the vocabulary. build_wordlist()
# reads the word list from "data\\wordlist.csv" when that file already exists
# and writes it there otherwise.
data = TwitterData_Wordlist(data)
data.build_wordlist()

In [15]:
# Plot the first rows of the saved wordlist as a horizontal bar chart.
# Note: `words` is rebound here from the Counter used above to a DataFrame.
words = pd.read_csv("data\\wordlist.csv")
top_rows = words.loc[0:10, ["word", "occurrences"]]  # label slice: rows 0..10 inclusive
x_words = list(top_rows["word"])[::-1]
y_occ = list(top_rows["occurrences"])[::-1]

top_words_bars = graph_objs.Bar(
    x=y_occ,
    y=x_words,
    orientation="h"
)
dist = [top_words_bars]
plotly.offline.iplot({"data":dist, "layout":graph_objs.Layout(title="Top words in built wordlist")})

### Bag-of-words
The data is now ready to be transformed into a bag-of-words representation.


In [16]:
class TwitterData_BagOfWords(TwitterData_Wordlist):
    """Pipeline stage that turns the token lists into a binary bag-of-words matrix."""

    def __init__(self, previous):
        """Take over the working data and the wordlist of the ``previous`` stage."""
        self.processed_data = previous.processed_data
        self.wordlist = previous.wordlist
        # Keep the test-set flag; otherwise the class-level default (False)
        # applies and build_data_model() looks for an "emotion" column.
        self.is_testing = getattr(previous, "is_testing", False)

    def build_data_model(self):
        """Build one row per tweet with a 0/1 "<word>_bow" column per wordlist entry.

        For training data the first column, "label", holds the tweet's emotion;
        test data has no label column.

        Returns:
            (data_model, data_labels): the feature DataFrame (with a fresh
            0..n-1 index) and a Series of labels, which is empty for test data.
        """
        bow_columns = [word + "_bow" for word in self.wordlist]

        # Set membership keeps the per-word lookup cheap.
        token_sets = [set(tokens) for tokens in self.processed_data["text"]]
        rows = [[1 if word in tokens else 0 for word in self.wordlist]
                for tokens in token_sets]

        if self.is_testing:
            labels = []
            columns = bow_columns
        else:
            labels = list(self.processed_data["emotion"])
            rows = [[label] + row for label, row in zip(labels, rows)]
            columns = ["label"] + bow_columns

        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        return self.data_model, self.data_labels

In [17]:
# Move on to the bag-of-words stage and build the feature matrix; `bow` has
# the label in its first column, followed by one 0/1 column per word.
data = TwitterData_BagOfWords(data)
bow, labels = data.build_data_model()
bow.head(5)

Unnamed: 0,label,go_bow,thi_bow,wa_bow,not_bow,im_bow,see_bow,time_bow,get_bow,like_bow,...,topless_bow,flop_bow,scari_bow,attract_bow,pr_bow,sne_bow,harder_bow,sole_bow,rafe_bow,nc_bow
0,neutral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,neutral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,negative,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,positive,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,neutral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Number of tweets containing each word, per sentiment label.
# NOTE(review): `plotly` and `graph_objs` are not imported in the visible part
# of the notebook - presumably a cell that is not shown provides them; confirm.
grouped = bow.groupby(["label"]).sum()
words_to_visualize = []
sentiments = ["positive","negative","neutral"]
# Collect the 7 most common words of every sentiment, skipping duplicates.
for sentiment in sentiments:
    # sort_values() returns a new Series, so the row taken from `grouped` is
    # never sorted in place.
    words = grouped.loc[sentiment, :].sort_values(ascending=False)
    for w in words.index[:7]:
        if w not in words_to_visualize:
            words_to_visualize.append(w)

# Visualize it: one bar series per sentiment over the collected words.
plot_data = []
for sentiment in sentiments:
    plot_data.append(graph_objs.Bar(
            # Strip only the trailing "_bow" marker, so a word that itself
            # contains "_" keeps its full text.
            x = [w.rsplit("_", 1)[0] for w in words_to_visualize],
            y = [grouped.loc[sentiment,w] for w in words_to_visualize],
            name = sentiment
    ))

plotly.offline.iplot({
        "data":plot_data,
        "layout":graph_objs.Layout(title="Most common words across sentiments")
    })
    


# Classification


The following utility function will train the classifier and show the F1, precision, recall and accuracy scores.

In [20]:
def test_classifier(X_train, y_train, X_test, y_test, classifier):
    """Fit ``classifier`` on the training split, score it on the test split and log the results.

    Precision, recall and F1 are computed per label (``average=None``), in the
    sorted order of the labels found in ``y_train``; accuracy is a single value.
    Fitting and prediction times are logged as well.

    Returns:
        (precision, recall, accuracy, f1)
    """
    log("")
    log("===============================================")
    log("Testing " + str(type(classifier).__name__))

    started = time()
    list_of_labels = sorted(list(set(y_train)))
    model = classifier.fit(X_train, y_train)
    log("Learing time {0}s".format(time() - started))

    started = time()
    predictions = model.predict(X_test)
    log("Predicting time {0}s".format(time() - started))

    # Shared settings for the three per-label metrics.
    per_label = dict(average=None, pos_label=None, labels=list_of_labels)
    precision = precision_score(y_test, predictions, **per_label)
    recall = recall_score(y_test, predictions, **per_label)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, **per_label)

    report_lines = (
        "=================== Results ===================",
        "            Negative     Neutral     Positive",
        "F1       " + str(f1),
        "Precision" + str(precision),
        "Recall   " + str(recall),
        "Accuracy " + str(accuracy),
        "===============================================",
    )
    for line in report_lines:
        log(line)

    return precision, recall, accuracy, f1

def log(x):
    """Print ``x`` to standard output.

    Single place through which the notebook's messages go, so the output can
    later be redirected (e.g. to a log file) by changing only this function.
    """
    message = x
    print(message)

## Experiment 1: BOW + Naive Bayes
 



Here I used 8-fold cross-validation.

In [23]:
# Cross-validate a Bernoulli Naive Bayes classifier on the bag-of-words model:
# every column after the first is a feature, the first column is the label.
# NOTE(review): `cv` and `BernoulliNB` are not defined or imported in the
# visible part of the notebook - presumably provided by cells not shown; confirm.
nb_acc = cv(BernoulliNB(), bow.iloc[:,1:], bow.iloc[:,0])

Crossvalidating BernoulliNB...
Crosvalidation completed in 4.4375975131988525s
Accuracy: [ 0.54639175  0.48820059  0.28023599  0.31415929  0.32743363  0.50073855
  0.47119645  0.53106509]
Average accuracy: 0.432427668406


In [26]:
# Run the whole pipeline again on the training data, this time including the
# extra features, and preview the resulting data model.
# NOTE(review): `TwitterData_ExtraFeatures` is not defined in the visible part
# of the notebook - presumably a cell that is not shown provides it; confirm.
# NOTE(review): this cell reads "data\\train.csv" while the first load used
# "kaggletrain.csv" - verify that both point at the intended training file.
data = TwitterData_ExtraFeatures()
data.initialize("data\\train.csv")
data.build_features()
data.cleanup(TwitterCleanuper())
data.tokenize()
data.stem()
data.build_wordlist()
data_model, labels = data.build_data_model()
data_model.head(5)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,label,number_of_uppercase,number_of_exclamation,number_of_question,number_of_ellipsis,number_of_hashtags,number_of_mentions,number_of_quotes,number_of_urls,number_of_positive_emo,...,topless_bow,flop_bow,scari_bow,attract_bow,pr_bow,sne_bow,harder_bow,sole_bow,rafe_bow,nc_bow
0,neutral,2,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,neutral,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,negative,2,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,positive,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,neutral,4,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Experiment 2: extended features + Random Forest
As a second attempt at the classification, I used **Random Forest**.

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the rows for testing, stratified on the label (first column),
# then train and score a random forest on the remaining features.
# NOTE(review): `seed` is not defined in the visible part of the notebook -
# presumably set in a cell that is not shown; confirm.
features = data_model.iloc[:, 1:]
target = data_model.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, train_size=0.7, stratify=target, random_state=seed)

forest = RandomForestClassifier(random_state=seed,n_estimators=403,n_jobs=-1)
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, forest)


Testing RandomForestClassifier
Learing time 6.9144287109375s
Predicting time 0.21802711486816406s
            Negative     Neutral     Positive
F1       [ 0.24501425  0.47944007  0.70340909]
Precision[ 0.47777778  0.49192101  0.63163265]
Recall   [ 0.16475096  0.46757679  0.79358974]
Accuracy 0.575291948371


# It looks better, however it's still not much above accuracy of the random classifier and barely better than Naive Bayes classifier.



# Test data classification


In [42]:
# Run the same pipeline on the unlabeled test tweets and preview the data model.
# NOTE(review): `TwitterData` is not defined in the visible part of the
# notebook - presumably a cell that is not shown provides it; confirm.
# NOTE(review): no build_data_model() call is visible before `data_model` is
# read; verify that an earlier step of `TwitterData` populates it.
test_data = TwitterData()
test_data.initialize("kaggletest.csv", is_testing_set=True)
test_data.build_features()
test_data.cleanup(TwitterCleanuper())
test_data.tokenize()
test_data.stem()
test_data.build_wordlist()
test_data.data_model.head(5)

Unnamed: 0,original_id,number_of_uppercase,number_of_exclamation,number_of_question,number_of_ellipsis,number_of_hashtags,number_of_mentions,number_of_quotes,number_of_urls,number_of_positive_emo,...,topless_bow,flop_bow,scari_bow,attract_bow,pr_bow,sne_bow,harder_bow,sole_bow,rafe_bow,nc_bow
0,628949369883000832,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,628976607420645377,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,629023169169518592,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,629179223232479232,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,629186282179153920,1,0,1,0,2,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
