In [14]:
import pandas as pd
import numpy as np
import re
import string

In [3]:
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


# First Attempt

First, we extracted only single words from the training data and saved their sentiment into a dictionary. From our data exploration, we noticed that entire sentences could consist of mostly neutral words, but one key positive or negative term will completely change the sentiment of the sentence. So, for each test sentence, we calculated the sentiment by extracting all the words and just using the sentiment of the word with the most extreme sentiment. It's also worth noting that all of the words in the test data are in the corpus.

In [4]:
# Extracts one-word phrases using a regular expression
regexp = re.compile(' ')
single_words = train[~train.Phrase.str.contains(' ')]
single_words = single_words[single_words.Phrase.str.contains('^[a-zA-Z]+$')]
single_words.Phrase = single_words.Phrase.str.lower()

# Creates a dictionary mapping words -> sentiment
phrase_iter = single_words.Phrase.values
sent_iter = single_words.Sentiment.values
corpus = dict(zip(phrase_iter, sent_iter))

The below code finds the word with the most extreme sentiment and sets the entire sentence's sentiment to be equal to that value.

In [7]:
test_sentence = test[:1]

''' Splits the phrase into a list of words '''
def get_words(phrase):
    words = phrase.split()
    new_words = []
    for w in words:
        new_words.append(w.lower().strip('.,;!?"()'))
    
    return new_words

''' Gets the most extreme sentiment value in the sentence and returns it '''
def get_sentiment(phrase):
    words = get_words(phrase)
    most_sentiment = 2
    
    for w in words:
        if w in corpus:
            sentiment = corpus[w]
            if abs(sentiment - 2) >= abs(most_sentiment - 2):
                most_sentiment = sentiment
        
    return most_sentiment

# Predicts the first 5 sentiment values
test.Phrase.apply(get_sentiment)[:5]

0    4
1    4
2    2
3    4
4    4
Name: Phrase, dtype: int64

After applying this to the training data, we actually got a relatively high accuracy.

In [5]:
train_predictions = train.Phrase.apply(get_sentiment)
print "Train accuracy: " + str(len(train[train.Sentiment == train_predictions]) / float(len(train)))

Train accuracy: 0.53044982699


In [8]:
# Generates the csv file
predictions = test.Phrase.apply(get_sentiment)
submission = pd.DataFrame({
        "PhraseId": test["PhraseId"],
        "Sentiment": predictions
    })

submission.to_csv("submission_iteration1.csv", index=False)

Using this model, we achieved an accuracy on Kaggle of 54.682%.

# Second Attempt

In an attempt to improve our model, we tried applying the Naive Bayes model to the data. In order to calculate $P(c\:|\:w)$, the probability of the words belonging to a particular sentiment class, we calculated $P(c)$, the probability of each sentiment class occurring in the data, and $P(w\: |\: c)$, the probability of a particular word appearing in a sentiment class.

In [13]:
# Calculates the probability of any given phrase having a particular sentiment
P_c = []
for i in range(5):
    prob = len(train[train.Sentiment == i]) / float(len(train))
    P_c.append(prob)
    
print "Pr{Sentiment 0}: " + str(P_c[0])[:6]
print "Pr{Sentiment 1}: " + str(P_c[1])[:6]
print "Pr{Sentiment 2}: " + str(P_c[2])[:6]
print "Pr{Sentiment 3}: " + str(P_c[3])[:6]
print "Pr{Sentiment 4}: " + str(P_c[4])[:6]

Pr{Sentiment 0}: 0.0453
Pr{Sentiment 1}: 0.1747
Pr{Sentiment 2}: 0.5099
Pr{Sentiment 3}: 0.2109
Pr{Sentiment 4}: 0.0589


In [17]:
# Extracts just the sentences from the dataset and combines them into mega-strings
sentences_dict = train['SentenceId'].drop_duplicates()
sentences = train.iloc[sentences_dict.keys()]
sentences.head()

# For each sentiment's mega-string, counts the number of times each word appears
text = []
for i in range(5):
    d = {}
    temp = sentences[sentences.Sentiment == i]
    words_in_class = ' '.join(temp.Phrase.values).lower().split()
    for w in words_in_class:
        new_word = w.lower().strip('.,;!?"()')
        if not new_word.strip(string.ascii_letters) and new_word != '':
            d[new_word] = d.get(new_word,0) + 1

    text.append(d)

print "How many times does the word \"exciting\" appear in sentiment 4 phrases?"
print text[4]['exciting']

How many times does the word "exciting" appear in sentiment 4 phrases?
8


In [19]:
# Calculates the probability of a word being in a particular class for every word in the corpus
P_wc = []
alpha = 1.0 # Laplace smoothing
for d in text:
    n_words_in_class = sum(d.values())
    new_dict = {}
    for w in corpus:
        if w in d:
            new_dict[w] = (d[w] + alpha) / (n_words_in_class + len(corpus))
        else:
            new_dict[w] = alpha / len(corpus)
    P_wc.append(new_dict)
    
print "Probability of the word \"yellow\" appearing in a sentiment 2 phrase: "
print P_wc[2]['yellow']

Probability of the word "yellow" appearing in a sentiment 2 phrase: 
5.01957634776e-05


Now that the right probability values have been calculated, the Naive Bayes method must first extract all of the words in the phrase. For each sentiment class, the algorithm gets $P(w\:|\:c)$ for each word in the phrase and multiplies all of the probabilities together. This product then gets multiplied by $P(c)$. The overall equation looks like the following:

$c = \max_\limits{c\:\in\:C}\:\Big( P(c)\: \cdot \prod_\limits{x\:\in\:X} P(x\:|\:c)\Big)$

In [22]:
''' Calculates the sentiment of a phrase using a Naive bayes classifier '''
def naive_bayes(phrase):
    words = get_words(phrase)
    P_cw = 0 # Probability that the phrase has a sentiment c 
    
    for s in range(5):
        # Probability of the sentiment occurring
        prob = P_c[s]
        
        # Probability of the word appearing in the sentiment class
        for w in words:
            if w in corpus:
                prob *= P_wc[s][w]
            
        if prob > P_cw:
            P_cw = prob
            sentiment = s
        
    return sentiment

After applying the Naive Bayes classifier to the training data, we got about the same accuracy as for the first attempt.

In [23]:
train_predictions = train.Phrase.apply(naive_bayes)
print "Train accuracy: " + str(len(train[train.Sentiment == train_predictions]) / float(len(train)))

Train accuracy: 0.522516980648


In [24]:
# Generates the csv file
predictions = test.Phrase.apply(naive_bayes)
submission = pd.DataFrame({
        "PhraseId": test["PhraseId"],
        "Sentiment": predictions
    })

submission.to_csv("submission_iteration2.csv", index=False)

The score we got on Kaggle for this submission was 52.682%, which is slightly lower than before.