In [1121]:
import pandas as pd
import numpy as numpy
import glob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

from itertools import islice

In [1122]:
pos_root = './pos'
neg_root = './neg'

pos_files = glob.glob(pos_root+'/*.txt')
neg_files = glob.glob(neg_root+'/*.txt')

pos_corpus = []
for pos_file in pos_files:
    pos_corpus.append(open(pos_file).read())
neg_corpus = []
for neg_file in neg_files:
    neg_corpus.append(open(neg_file).read())

# remove unnecessary spaces
for i in range(len(pos_corpus)):
    pos_corpus[i] = TreebankWordDetokenizer().detokenize(word_tokenize(pos_corpus[i]))
    #" ".join(pos_corpus[i].split())
for i in range(len(neg_corpus)):
    neg_corpus[i] = TreebankWordDetokenizer().detokenize(word_tokenize(neg_corpus[i]))
    #" ".join(neg_corpus[i].split())
    
pos_data = pd.DataFrame(zip(pos_corpus, ['pos' for i in range(len(pos_corpus))]), columns=['text', 'sentiment'])
neg_data = pd.DataFrame(zip(neg_corpus, ['neg' for i in range(len(neg_corpus))]), columns=['text', 'sentiment'])

data = pd.concat([pos_data, neg_data])
data = data.sample(frac=1).reset_index(drop=True) # shuffle the rows

In [1123]:
data.head(5)

Unnamed: 0,text,sentiment
0,I have been trying to find a way to easily car...,pos
1,Everything your read about the picture quality...,neg
2,Researched a lot before I bought this TV and f...,pos
3,Memorex DVD+R's are the best . Price keeps dro...,pos
4,If you have a cable modem for broadband connec...,neg


### Lemmatization

In [1124]:
X_lem = data['text'].copy()
y_lem = data['sentiment'].copy()

lem = WordNetLemmatizer()

for i in range(len(X_lem)):
    words = word_tokenize(X_lem[i])
    pos_labels = pos_tag(words)
    
    for j in range(len(words)):
        
        pos_label = pos_labels[j][1][0].lower()
#         pos_refs = {'n': ['NN', 'NNS', 'NNP', 'NNPS'],
#                     'v': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
#                     'r': ['RB', 'RBR', 'RBS'],
#                     'a': ['JJ', 'JJR', 'JJS']}

        if pos_label == 'j':
            pos_label = 'a'    # 'j' <--> 'a' reassignment for adjectives
        
        if pos_label in ['r']:  # for adverbs it's a bit different.
            try:
                # when a word doesn't have pertainym, use lemma
                pertainym = wordnet.synset(words[j]+'.r.1').lemmas()[0].pertainyms()
                if pertainym:
                    words[j] = pertainym[0].name()
                else:
                    words[j] = wordnet.synset(words[j]+'.r.1').lemmas()[0].name()
            except: # when a word doesn't have lemma, use original word
                words[j] = words[j]
        
        elif pos_label in ['a', 's', 'v']: # for adjectives and verbs
            words[j] = lem.lemmatize(words[j], pos=pos_label)
        
        else:   # for nouns and everything else as it is the default kwarg
            words[j] = lem.lemmatize(words[j])
            
    X_lem[i] = TreebankWordDetokenizer().detokenize(words)

In [1207]:
print(X[3])
print('---------------------------------------------------------------------')
print(X_lem[3])

Memorex DVD+R's are the best . Price keeps dropping and never have a problem
---------------------------------------------------------------------
Memorex DVD+R's be the best . Price keep drop and never have a problem


### Apply TF-IDF transformation with all combinations of lemmatization and stop words removal

In [1125]:
X = data['text'].copy()
y = data['sentiment'].copy()

vectorizer = TfidfVectorizer(lowercase=True)
vectorizer.fit(X)
X_tfidf = vectorizer.transform(X)

In [1126]:
vectorizer = TfidfVectorizer(lowercase=True)
vectorizer.fit(X_lem)
X_lem_tfidf = vectorizer.transform(X_lem)

In [1127]:
stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", 
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 
                  'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 
                  'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 
                  'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
                  'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 
                  'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 
                  'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 
                  'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 
                  'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'own', 'same', 
                  'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'should', "should've", 'now', 'd', 
                  'll', 'm', 'o', 're', 've', 'y', 'ma']

vectorizer = TfidfVectorizer(stop_words=stopwords_list, lowercase=True)
vectorizer.fit(X_lem)
X_lem_sw_tfidf = vectorizer.transform(X_lem)

In [1128]:
vectorizer = TfidfVectorizer(stop_words=stopwords_list, lowercase=True)
vectorizer.fit(X)
X_sw_tfidf = vectorizer.transform(X)

### Pointwise Mutual Information

In [1129]:
all_words_sw = [word.lower()
                for review in X
                for word in review.replace('.', '').replace(',', '').split()
                if not word.lower() in stopwords_list
               ]

finder = BigramCollocationFinder.from_words(all_words_sw)
bgm = BigramAssocMeasures()
score = bgm.mi_like  # metric options: pmi or mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

list(islice(collocations.items(), 30)) # return word pairs with highest scores

[('customer_service', 14.187192118226601),
 ('caller_id', 11.834319526627219),
 ('escort_radar', 10.285714285714286),
 ('&_quot;', 8.937931034482759),
 ('tech_support', 8.28132727127702),
 ('does_not', 7.880766601696834),
 ('highly_recommend', 6.740720221606648),
 ('altec_lansing', 6.4935064935064934),
 ('sound_quality', 6.331670195763461),
 ('battery_life', 5.889407407407408),
 ('waste_money', 5.797083485235144),
 ('mp3_player', 5.5826083695815205),
 ('fiber_optic', 5.142857142857143),
 ('belt_clip', 4.71889400921659),
 ('do_not', 4.44265687870339),
 ('timely_manner', 4.166666666666667),
 ('bells_whistles', 4.0),
 ('mowing_lawn', 4.0),
 ('nvidia_quadrofx', 3.7925925925925927),
 ('ip_address', 3.6134868421052633),
 ('windows_xp', 3.2015289525048796),
 ('harman_kardon', 3.2),
 ('polycom_communicator', 3.125),
 ('ique_3600', 3.0625),
 ('(pci_express)', 3.0),
 ('horror_stories', 3.0),
 ('circuit_city', 2.9761904761904763),
 ('dvd_player', 2.860169491525424),
 ('nvidia_geforce', 2.85833333

### Compare models and fine tune the best model

In [1130]:
def print_scores(scores):
    k = len(scores['test_precision_macro'])
    print('test_precision_macro:    ' + str(sum(scores['test_precision_macro']) / k))
    print('test_recall_macro:       ' + str(sum(scores['test_recall_macro']) / k))
    print('test_f1_macro:           ' + str(sum(scores['test_f1_macro']) / k))
    print('test_precision_weighted: ' + str(sum(scores['test_precision_weighted']) / k))
    print('test_recall_weighted:    ' + str(sum(scores['test_recall_weighted']) / k))
    print('test_f1_weighted:        ' + str(sum(scores['test_f1_weighted']) / k))
    
scoring = ['precision_macro', 'recall_macro', 'f1_macro',
           'precision_weighted', 'recall_weighted', 'f1_weighted']

#### No lemmatization nor stop words removal

In [1131]:
print('MultinomialNB')
nb_model = MultinomialNB()

scores = cross_validate(nb_model, X_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)
print('-------------------------------------------')


print('LogisticRegression')
lr_model = LogisticRegression()

scores = cross_validate(lr_model, X_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)
print('-------------------------------------------')


print('SGDClassifier')
sgd_model = SGDClassifier()

scores = cross_validate(sgd_model, X_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)
print('-------------------------------------------')


print('RandomForestClassifier')
rf_model = RandomForestClassifier()

scores = cross_validate(rf_model, X_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)
print('-------------------------------------------')

MultinomialNB
test_precision_macro:    0.8095240096178978
test_recall_macro:       0.8074999999999999
test_f1_macro:           0.8071715969042519
test_precision_weighted: 0.8095240096178978
test_recall_weighted:    0.8075000000000001
test_f1_weighted:        0.8071715969042519
-------------------------------------------
LogisticRegression
test_precision_macro:    0.8217443569197549
test_recall_macro:       0.8215
test_f1_macro:           0.8214654015692169
test_precision_weighted: 0.8217443569197549
test_recall_weighted:    0.8215000000000001
test_f1_weighted:        0.8214654015692169
-------------------------------------------
SGDClassifier
test_precision_macro:    0.8076322554685197
test_recall_macro:       0.8065
test_f1_macro:           0.8063215088534814
test_precision_weighted: 0.8076322554685197
test_recall_weighted:    0.8065
test_f1_weighted:        0.8063215088534814
-------------------------------------------
RandomForestClassifier
test_precision_macro:    0.804352716731236

#### Only lemmatization

In [1132]:
lr_model = LogisticRegression()

scores = cross_validate(lr_model, X_lem_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

test_precision_macro:    0.8079471286458528
test_recall_macro:       0.8074999999999999
test_f1_macro:           0.807434632573008
test_precision_weighted: 0.8079471286458528
test_recall_weighted:    0.8074999999999999
test_f1_weighted:        0.807434632573008


#### Lemmatization + stop words removal

In [1133]:
lr_model = LogisticRegression()

scores = cross_validate(lr_model, X_lem_sw_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

test_precision_macro:    0.8147274192737092
test_recall_macro:       0.8140000000000001
test_f1_macro:           0.8138971996152471
test_precision_weighted: 0.814727419273709
test_recall_weighted:    0.8140000000000001
test_f1_weighted:        0.8138971996152471


#### Only stop words removal

In [1134]:
lr_model = LogisticRegression()

scores = cross_validate(lr_model, X_sw_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

test_precision_macro:    0.8242027197314539
test_recall_macro:       0.8234999999999999
test_f1_macro:           0.8233981437841214
test_precision_weighted: 0.8242027197314539
test_recall_weighted:    0.8234999999999999
test_f1_weighted:        0.8233981437841214


#### Only unigram + bigram

In [1195]:
vectorizer_ngram = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_features=10000)
vectorizer_ngram.fit(X)
X_ngram_tfidf = vectorizer_ngram.transform(X)

lr_model = LogisticRegression()

scores = cross_validate(lr_model, X_ngram_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

test_precision_macro:    0.8323720189285286
test_recall_macro:       0.8320000000000001
test_f1_macro:           0.8319502431526324
test_precision_weighted: 0.8323720189285286
test_recall_weighted:    0.8320000000000001
test_f1_weighted:        0.8319502431526324


#### Best model

In [1196]:
vectorizer_ngram_final = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_features=11000)
vectorizer_ngram_final.fit(X)
X_ngram_tfidf_final = vectorizer_ngram_final.transform(X)

lr_model_final = LogisticRegression(C=12)

scores = cross_validate(lr_model_final, X_ngram_tfidf_final, y, scoring=scoring, cv=5)
print_scores(scores)

test_precision_macro:    0.8506426090164687
test_recall_macro:       0.85
test_f1_macro:           0.8499248086747737
test_precision_weighted: 0.8506426090164687
test_recall_weighted:    0.85
test_f1_weighted:        0.8499248086747737


### Most informative features

In [1197]:
lr_model = LogisticRegression(C=12)
lr_model.fit(X_ngram_tfidf_final, y)

feature_names = vectorizer_ngram_final.get_feature_names() 
coefs_with_fns = sorted(zip(lr_model.coef_[0], feature_names)) 
coef_word=pd.DataFrame(coefs_with_fns)
coef_word.columns='coefficient','word'
most_pos = coef_word.sort_values(by='coefficient', ascending=True).head(20).reset_index(drop=True)
most_neg = coef_word.sort_values(by='coefficient', ascending=False).head(20).reset_index(drop=True)
pd.concat([most_pos, most_neg], axis=1)

Unnamed: 0,coefficient,word,coefficient.1,word.1
0,-6.953901,not,6.973223,great
1,-4.587459,return,5.042313,excellent
2,-4.380018,poor,4.608211,price
3,-4.14323,back,4.54338,perfect
4,-3.73764,terrible,3.904549,the best
5,-3.681787,not work,3.894718,highly
6,-3.648012,after,3.722976,best
7,-3.627613,returned,3.719372,memory
8,-3.458124,bad,3.536671,as
9,-3.404955,waste,3.480402,for
