In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tag.perceptron import PerceptronTagger
import pattern.en

ImportError: No module named pattern.en

In [3]:
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


# First Attempt

First, we extracted only single words from the training data and saved their sentiment into a dictionary. From our data exploration, we noticed that entire sentences could consist of mostly neutral words, but one key positive or negative term will completely change the sentiment of the sentence. So, for each test sentence, we calculated the sentiment by extracting all the words and just using the sentiment of the word with the most extreme sentiment. It's also worth noting that all of the words in the test data are in the corpus.

In [84]:
# Extracts one-word phrases using a regular expression
single_words = train[~train.Phrase.str.contains(' ')]
single_words = single_words[single_words.Phrase.str.contains('^[a-zA-Z]+$')]
single_words.Phrase = single_words.Phrase.str.lower()

# Creates a dictionary mapping words -> sentiment
phrase_iter =single_words.Phrase.values
sent_iter = single_words.Sentiment.values
corpus = dict(zip(phrase_iter, sent_iter))

In [85]:
import re

def prepend_NOT(match):
    """
    A function that feeds into a regular expression substitution function
    that prepends all words after a negation word (i.e. "didn't" and
    "not") with "NOT_".
    """
    match = match.group()
    words = match.split(" ")
    negation = words[0]
    del words[0]
    new_words = ["NOT_" + word for word in words]
    return negation + " " + " ".join(new_words)


def substitute_negations(phrase):
    """
    Replaces input phrase with the same phrase, except prepending a "NOT_"
    for every word after a negation word (i.e. "didn't" and "not"). This
    can only occur in phrases with more than one word.
    """
    # negation_words is a list of regular expressions
    negation_words = [r"not", r"n't", r"no"]
    
    # negation_words then gets turned into a regular expression string
    negation_words = [r"(" + word + r")" for word in negation_words]
    negation_words = (r"|").join(negation_words)
    
    negations_re = re.compile(r"(" + negation_words + r")[A-z ']*")
    substitution = negations_re.sub(prepend_NOT, phrase)
    
    if substitution == "":
        return phrase
    return substitution


def add_NOT_to_negations(df):
    """
    Replaces each phrase in the dataframe with the same phrase, but
    replacing every word after a negation word (i.e. "didn't" and "not")
    with "NOT_" prepended to the word. This can only occur in phrases
    with more than one word.
    """
    data = df
    data["Negations"] = data["Phrase"].apply(lambda x: substitute_negations(x))
    return data

train = add_NOT_to_negations(train)
test = add_NOT_to_negations(test)

In [86]:
# Generates a corpus of all the negated words
not_corpus = {}
for w in corpus:
    words = w.split()
    if len(words) == 1:
        not_corpus["NOT_" + w] = abs(4 - corpus[w])
    elif len(words) == 2:
        not_corpus["NOT_" + words[0] + " NOT_" + words[1]] = abs(4 - corpus[w])
    
print "not good = negative sentiment: " + str(not_corpus["NOT_good"])
print "not bad = positive sentiment: " + str(not_corpus["NOT_bad"])

not good = negative sentiment: 1
not bad = positive sentiment: 4


In [87]:
# Calculates the probability of any given phrase having a particular sentiment
P_c = []
for i in range(5):
    prob = len(train[train.Sentiment == i]) / float(len(train))
    P_c.append(prob)
    
print "Pr{Sentiment 0}: " + str(P_c[0])[:6]
print "Pr{Sentiment 1}: " + str(P_c[1])[:6]
print "Pr{Sentiment 2}: " + str(P_c[2])[:6]
print "Pr{Sentiment 3}: " + str(P_c[3])[:6]
print "Pr{Sentiment 4}: " + str(P_c[4])[:6]

Pr{Sentiment 0}: 0.0453
Pr{Sentiment 1}: 0.1747
Pr{Sentiment 2}: 0.5099
Pr{Sentiment 3}: 0.2109
Pr{Sentiment 4}: 0.0589


In [88]:
# Extracts just the sentences from the dataset and combines them into mega-strings
sentences_dict = train['SentenceId'].drop_duplicates()
sentences = train.iloc[sentences_dict.keys()]
sentences.head()

# For each sentiment's mega-string, counts the number of times each word appears
text = []
for i in range(5):
    d = {}
    temp = sentences[sentences.Sentiment == i]
    words_in_class = ' '.join(temp.Phrase.values).lower().split()
    for w in words_in_class:
        new_word = w.lower().strip('.,;!?"()')
        if not new_word.strip(string.ascii_letters) and new_word != '':
            d[new_word] = d.get(new_word,0) + 1

    text.append(d)

print "How many times does the word \"exciting\" appear in sentiment 4 phrases?"
print text[4]['exciting']

How many times does the word "exciting" appear in sentiment 4 phrases?
8


In [89]:
# Calculates the probability of a word being in a particular class for every word in the corpus
P_wc = []
alpha = 1.0 # Laplace smoothing
for d in text:
    n_words_in_class = sum(d.values())
    new_dict = {}
    for w in corpus:
        if w in d:
            new_dict[w] = (d[w] + alpha) / (n_words_in_class + alpha*len(corpus))
        else:
            new_dict[w] = alpha / len(corpus)
    for w in not_corpus:
        if w in d:
            new_dict[w] = (d[w] + alpha) / (n_words_in_class + alpha*len(corpus))
        else:
            new_dict[w] = alpha / len(corpus)
    P_wc.append(new_dict)
    
print "Probability of the word \"yellow\" appearing in a sentiment 2 phrase: "
print P_wc[2]['yellow']

Probability of the word "yellow" appearing in a sentiment 2 phrase: 
5.01957634776e-05


Now that the right probability values have been calculated, the Naive Bayes method must first extract all of the words in the phrase. For each sentiment class, the algorithm gets $P(w\:|\:c)$ for each word in the phrase and multiplies all of the probabilities together. This product then gets multiplied by $P(c)$. The overall equation looks like the following:

$c = \max_\limits{c\:\in\:C}\:\Big( P(c)\: \cdot \prod_\limits{x\:\in\:X} P(x\:|\:c)\Big)$

In [107]:
''' Splits the phrase into a list of words '''
def get_words(phrase):
    words = phrase.split()
    new_words = []
    for w in words:
        new_words.append(w.lower().strip('.,;!?"()'))
    
    return new_words

''' Calculates the sentiment of a phrase using a Naive bayes classifier '''
def naive_bayes(phrase):
    if phrase in corpus:
        return corpus[phrase]
    if phrase in not_corpus:
        return not_corpus[phrase]
    
    P_cw = 0 # Probability that the phrase has a sentiment c
    words = get_words(phrase)
    
    for s in range(5):
        # Probability of the sentiment occurring
        prob = P_c[s]
        
        # Probability of the word appearing in the sentiment class
        for w in words:
            if w in corpus or w in not_corpus:
                prob *= P_wc[s][w]
            
        if prob > P_cw:
            P_cw = prob
            sentiment = s
            
    if sentiment == 2:
        pattern_sent = pattern.en.sentiment(phrase)[0]
        if abs(pattern_sent) > 0.1:
            sentiment = int(round(pattern_sent * 2)) + sentiment
        
    return sentiment

After applying the Naive Bayes classifier to the training data, we got about the same accuracy as for the first attempt.

In [108]:
train["Predictions"] = train.Phrase.apply(naive_bayes)
print "Train accuracy: " + str(len(train[train.Sentiment == train.Predictions]) / float(len(train)))

Train accuracy: 0.54651416122


In [109]:
# Generates the csv file
predictions = test.Phrase.apply(naive_bayes)
submission = pd.DataFrame({
        "PhraseId": test["PhraseId"],
        "Sentiment": predictions
    })

submission.to_csv("final_iteration.csv", index=False)

The score we got on Kaggle for this submission was 52.682%, which is slightly lower than before.

# Part of Speech Tagging

Using NLTK's tagging package, we created a new Pandas series that stores the part of speech of each word in the sentence. Though we weren't able to to do much with that data, we think it will come in handy for a future iteration.

In [4]:
# Load the tagger for faster tagging
tagger = PerceptronTagger()

def tag_all_phrases(df):
    data = df
    data["POS"] = data["Phrase"].apply(
        lambda x: [tag[1] for tag in \
                   nltk.tag._pos_tag(nltk.word_tokenize(x), None, tagger)] )
    return data

def get_words(df):
    data = df[~(df['Phrase'].str.contains(' '))]
    return data

def extract_single_POS_from_words(df):
    data = df
    data["POS2"] = data["POS"].apply(lambda x: x[0])
    return data

def make_phrase_POS_columns(data, parts_of_speech):
    df = data
    for POS in parts_of_speech:
        df[POS] = [np.asarray([1 if p==POS else 0 for p in POS_list], dtype=int)\
                   for POS_list in df['POS']]
    return df