In [214]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag # need to open Python interpreter, run nltk.download('averaged_perceptron_tagger') to get tag set
import pandas as pd
from tqdm import tqdm
from numpy import mean

In [215]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID,TEXT,LABEL
0,7850790573542594519,If you love good films don't ever buy this pei...,2
1,9392069522632994700,The 33 percent of the nations nitwits that sti...,2
2,5083704536542443514,I saw Anatomy years ago -- dubbed at a friends...,1
3,12418349755186772171,Dark Remains is a home run plain and simple. T...,1
4,12144957944004619479,Feh. This movie started out in an interesting ...,2


In [216]:
from collections import Counter
Counter(df['LABEL']) # Sentiment data is balanced, but lots of non-reviews here

Counter({2: 18970, 1: 19276, 0: 32071})

In [217]:

def data_cleaning(text_list):
    # use lemmatizer to convert a word to its base form.
    # this will reduce our vocabulary and make our vectors denser
    lemmatizer = WordNetLemmatizer()

    # use tokenizer to split a string of text into tokens.
    # The Tweet tokenizer is built for a similar domain, and is is better than just splitting the text
    tokenizer = TweetTokenizer()

    process_texts=[]
    for text in tqdm(text_list):
        # case may be a usefule feature for datsets with with lots of ANGRY ALL CAPS, but I don't think that
        # outweighs normal use of case in this sataset. Better to normalize.
        tokens = tokenizer.tokenize(text.lower()) 
        lemmatized_tokens=[]
        
        pos_tags = pos_tag(tokens)
        for each_token, tag in pos_tags: # Normalize tags to feed to lemmatizer
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('RB'):
                pos ='r'
            else:
                pos = 'a' # I don't think the distinction between satellite adjectives and adjectives is very meaningful here, so let's send them all to 'a'
            lemmatized_token = lemmatizer.lemmatize(each_token, pos)
            lemmatized_tokens.append(lemmatized_token)

        process_texts.append(' '.join(lemmatized_tokens))
    return process_texts

stops = ['the', 'to', 'and', 'a', 'in', 'it', 'is', 'I', 'that','this', 'had', 'on', 'for', 'were', 'was', 'are','he','she','it','they','them','his','her'
        'their','they''we','us','our','has','have','be','mine','of','who','whom','will'] # NLTK list too expansive, decresed performance. Made a custom list.
estimators=[('cleaner', FunctionTransformer(data_cleaning)),
            ('vectorizer', TfidfVectorizer(ngram_range=(1,3),stop_words=stops))] # tf-idf will add some nuance to the model
preprocessing_pipeline = Pipeline(estimators)

In [218]:
# Break data down into a training set and a validation set
x_train, x_valid, y_train, y_valid= train_test_split(df['TEXT'].astype(str), df['LABEL'], test_size=0.1, random_state=7)
x_train=preprocessing_pipeline.fit_transform(x_train)

100%|██████████| 63285/63285 [03:47<00:00, 277.57it/s]


In [219]:
# Create a Naive Bayes model and fit training data
print('fitting model')
model = MultinomialNB()
model.fit(x_train, y_train)
x_valid=preprocessing_pipeline.transform(x_valid)

fitting model


100%|██████████| 7032/7032 [00:24<00:00, 283.69it/s]


In [220]:
# Evaluate model - mean accuracy
print(f'Test Score: {model.score(x_valid, y_valid)}')

Test Score: 0.906712172923777


In [223]:
# put training and validation sets back together to pass to model
import scipy.sparse as sp
x = sp.vstack((x_train, x_valid), format='csr')
y = pd.concat([y_train, y_valid])

In [224]:
model = MultinomialNB() # I'm pressed for time, so let's use something simple.
model.fit(x, y)

In [225]:
# let's try this thing out. Read test data into datframe, pass it into preprocessing pipeline.
df_test = pd.read_csv('test.csv')
x_test=preprocessing_pipeline.transform(df_test['TEXT'].astype(str)) 


100%|██████████| 17580/17580 [13:30<00:00, 21.68it/s]   


In [226]:
pred = model.predict(x_test)

assert len(pred) == len(df_test)
data = {
    'ID': df_test['ID'],
    'LABEL': pred
}
df_result = pd.DataFrame(data=data)
df.to_csv('submission.csv', index=False) # making the submission file

In [227]:
# compare results to solution file
df_gold = pd.read_csv('solution.csv')
(df_gold['LABEL'] == df_result['LABEL']).astype(int).mean() # mean accuracy

0.9127986348122867

In [228]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df_gold['LABEL'], df['LABEL'])

array([[7732,  212,  184],
       [ 190, 4075,  538],
       [  53,  356, 4240]])

In [229]:
from sklearn.metrics import f1_score, precision_score, accuracy_score
f1_w = f1_score(df_gold['LABEL'], df['LABEL'], average='weighted')
p_score_w = precision_score(df_gold['LABEL'], df['LABEL'], average='weighted')
a_score = accuracy_score(df_gold['LABEL'], df['LABEL'])

f1_m = f1_score(df_gold['LABEL'], df['LABEL'], average='macro')
p_score_m = precision_score(df_gold['LABEL'], df['LABEL'], average='macro')

f1_mi = f1_score(df_gold['LABEL'], df['LABEL'], average='micro')
p_score_mi = precision_score(df_gold['LABEL'], df['LABEL'], average='micro')

f1_avg_meta = mean([f1_w,f1_m,f1_mi])
p_score_avg_meta = mean([p_score_w,p_score_m,p_score_mi])


print(f"Weighted\nF1: {f1_w}\nPrecision: {p_score_w}\nAccuracy: {a_score}\n\nMacro\nF1: {f1_m}\nPrecision: {p_score_m}\nAccuracy: {a_score}\n\nMicro\nF1: {f1_mi}\nPrecision: {p_score_mi}\nAccuracy: {a_score}\n\nF1 Meta: {f1_avg_meta}\nPrecision Meta: {p_score_avg_meta}")

Weighted
F1: 0.9130491924253523
Precision: 0.9140106845361778
Accuracy: 0.9127986348122867

Macro
F1: 0.9018131201839136
Precision: 0.9005630795842544
Accuracy: 0.9127986348122867

Micro
F1: 0.9127986348122867
Precision: 0.9127986348122867
Accuracy: 0.9127986348122867

F1 Meta: 0.9092203158071842
Precision Meta: 0.9091241329775729


In [230]:
'''
4-grams

Weighted
F1: 0.9142707753902352
Precision: 0.9148633956771518
Accuracy: 0.9142207053469852

Macro
F1: 0.9029814288808939
Precision: 0.9025071115573824
Accuracy: 0.9142207053469852

Micro
F1: 0.9142207053469852
Precision: 0.9142207053469852
Accuracy: 0.9142207053469852

F1 Meta: 0.9104909698727047
Precision Meta: 0.9105304041938398
'''

'\n4-grams\n\nWeighted\nF1: 0.9142707753902352\nPrecision: 0.9148633956771518\nAccuracy: 0.9142207053469852\n\nMacro\nF1: 0.9029814288808939\nPrecision: 0.9025071115573824\nAccuracy: 0.9142207053469852\n\nMicro\nF1: 0.9142207053469852\nPrecision: 0.9142207053469852\nAccuracy: 0.9142207053469852\n\nF1 Meta: 0.9104909698727047\nPrecision Meta: 0.9105304041938398\n'