In [81]:
import os.path as op
import numpy as np
import string

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

#import nltk
from nltk import SnowballStemmer
from nltk import pos_tag

In [82]:
# Load data
print("Loading dataset")

from glob import glob
filenames_neg = sorted(glob(op.join('..', 'data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('..', 'data', 'imdb1', 'pos', '*.txt')))
filenames_stop_words = sorted(glob(op.join('..','data','english.stop')))


stop_words_list = [open(f).read() for f in filenames_stop_words][0].split()

texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))
print("punctuation = ", string.punctuation)

Loading dataset
2000 documents
punctuation =  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [168]:
### Preprocess

def text_process(texts, stem = False, tag = False):
    """
        Takes in a list of texts, then performs the following:
        1. Remove all punctuation for each text
        2. Remove all stopwords for each text
        3. Returns the cleaned texts
        
        if stem == True, change each word to its stem.
        if tag == True, we'll keep only noun, adverbs, verbs and adjectives.
    """
    cleaned_texts = []
    keep_cat = ['NN','ADV','VB','JJ']
    for text in texts :
        list_text = []
        # Check characters to see if they are in punctuation
        nopunc = [char for char in text if char not in string.punctuation]
    
        # Join the characters again to form the string.
        nopunc = ''.join(nopunc)
    
        # Now just remove any stopwords and stem words
        list_words = nopunc.split()
        if stem == False :
            if tag == False :
                list_text = [word for word in list_words if word.lower() not in stop_words_list] #Question 2
            else :
                for word in nopunc.split() :
                    print(pos_tag([stemmer.stem(word)]))
                list_text = [word for word in list_words if word.lower() not in stop_words_list] #Question 4
                #print(list_text)
        else :
            if tag == False :
                list_text = [stemmer.stem(word) for word in list_words if word.lower() not in stop_words_list] #Question 3
            else :
                list_text = [stemmer.stem(word) for word in list_words if (word.lower() not in stop_words_list and pos_tag([word])[0][1] not in keep_cat)] #Question 3
                
                    
        new_text = ' '.join(list_text)
        cleaned_texts.append(new_text)
       
    return cleaned_texts


In [169]:
stemmer = SnowballStemmer("english")
vectorizer = CountVectorizer()

In [170]:
texts_processed = text_process(texts, stem = False, tag = False)
X = vectorizer.fit_transform(texts_processed)

In [171]:
texts_processed_stem = text_process(texts, stem = True, tag = False)
X_stem = vectorizer.fit_transform(texts_processed_stem)

In [172]:
texts_processed_stem_tag = text_process(texts, stem = True, tag = True)
X_stem_tag = vectorizer.fit_transform(texts_processed_stem_tag)

In [173]:
print(X.shape)
print(X_stem.shape)
print(X_stem_tag.shape)

(2000, 47026)
(2000, 31375)
(2000, 12700)


In [177]:
#Naive Bayesien
from sklearn.naive_bayes import MultinomialNB
print("Testing Naive Bayesien with cross validation 5-folds: ")
 

clf = MultinomialNB()

print("Mean accuracy with no stem and no tag = {acc} %".format(acc = 100*cross_val_score(clf,X,y, cv = 5).mean()))

print("Mean accuracy with stem and no tag = {acc} %".format(acc = 100*cross_val_score(clf,X_stem,y, cv = 5).mean()))
   
print("Mean accuracy with stem and tag = {acc} %".format(acc = 100*cross_val_score(clf,X_stem_tag,y, cv = 5).mean()))


Testing Naive Bayesien with cross validation 5-folds: 
Mean accuracy with no stem and no tag = 80.74999999999999 %
Mean accuracy with stem and no tag = 80.4 %
Mean accuracy with stem and tag = 76.55000000000001 %


In [179]:
# Linear SVC
from sklearn.svm import LinearSVC
print("Testing Linear SVC : ")

for c in [0.1, 0.5, 1, 1.5, 3] :
    print("====== Testing for c = %f ======"%c)
    clf = LinearSVC(C = c)
    print("Mean accuracy with no stem and no tag = {acc} %".format(acc = 100*cross_val_score(clf,X,y, cv = 5).mean()))
    print("Mean accuracy with stem and no tag = {acc} %".format(acc = 100*cross_val_score(clf,X_stem,y, cv = 5).mean()))
    print("Mean accuracy with stem and tag = {acc} %".format(acc = 100*cross_val_score(clf,X_stem_tag,y, cv = 5).mean()))


Testing Linear SVC : 
Mean accuracy with no stem and no tag = 82.5 %
Mean accuracy with stem and no tag = 81.64999999999999 %
Mean accuracy with stem and tag = 73.5 %
Mean accuracy with no stem and no tag = 82.4 %
Mean accuracy with stem and no tag = 81.4 %
Mean accuracy with stem and tag = 73.05 %
Mean accuracy with no stem and no tag = 82.25 %
Mean accuracy with stem and no tag = 81.30000000000001 %
Mean accuracy with stem and tag = 72.89999999999999 %
Mean accuracy with no stem and no tag = 82.15 %
Mean accuracy with stem and no tag = 81.25 %
Mean accuracy with stem and tag = 72.85000000000001 %
Mean accuracy with no stem and no tag = 82.2 %
Mean accuracy with stem and no tag = 81.0 %
Mean accuracy with stem and tag = 72.8 %


In [180]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
print("Testing LogisticRegression : ")
clf = LogisticRegression()
print("Mean accuracy with no stem and no tag = {acc} %".format(acc = 100*cross_val_score(clf,X,y, cv = 5).mean()))
print("Mean accuracy with stem and no tag = {acc} %".format(acc = 100*cross_val_score(clf,X_stem,y, cv = 5).mean()))   
print("Mean accuracy with stem and tag = {acc} %".format(acc = 100*cross_val_score(clf,X_stem_tag,y, cv = 5).mean()))


Testing LogisticRegression : 
Mean accuracy with no stem and no tag = 83.05 %
Mean accuracy with stem and no tag = 82.5 %
Mean accuracy with stem and tag = 74.85000000000001 %


In [200]:
# Gradient Boosting
from catboost import CatBoostClassifier
print("Testing catboost : ")
clf = CatBoostClassifier(max_depth = 5, loss_function = 'CrossEntropy', learning_rate = 0.4, n_estimators = 100, early_stopping_rounds = 10)
print("Mean accuracy with no stem and no tag = {acc} %".format(acc = 100*cross_val_score(clf,X.toarray(),list(y), cv = 5).mean()))


Testing catboost : 
0:	learn: 0.6684747	total: 1.39s	remaining: 2m 17s
1:	learn: 0.6424381	total: 2.16s	remaining: 1m 45s
2:	learn: 0.6283759	total: 2.94s	remaining: 1m 35s
3:	learn: 0.6120352	total: 3.61s	remaining: 1m 26s
4:	learn: 0.5984274	total: 4.26s	remaining: 1m 20s
5:	learn: 0.5868165	total: 4.88s	remaining: 1m 16s
6:	learn: 0.5819870	total: 5.49s	remaining: 1m 13s
7:	learn: 0.5721067	total: 6.21s	remaining: 1m 11s
8:	learn: 0.5581243	total: 6.89s	remaining: 1m 9s
9:	learn: 0.5524790	total: 7.55s	remaining: 1m 7s
10:	learn: 0.5425520	total: 8.36s	remaining: 1m 7s
11:	learn: 0.5374270	total: 9.04s	remaining: 1m 6s
12:	learn: 0.5302918	total: 9.79s	remaining: 1m 5s
13:	learn: 0.5197620	total: 10.4s	remaining: 1m 4s
14:	learn: 0.5155247	total: 11s	remaining: 1m 2s
15:	learn: 0.5106465	total: 11.7s	remaining: 1m 1s
16:	learn: 0.5047827	total: 12.3s	remaining: 1m
17:	learn: 0.4952802	total: 13s	remaining: 59s
18:	learn: 0.4896470	total: 13.7s	remaining: 58.5s
19:	learn: 0.4780154	t

62:	learn: 0.2547655	total: 45.5s	remaining: 26.7s
63:	learn: 0.2532133	total: 46s	remaining: 25.9s
64:	learn: 0.2512954	total: 46.7s	remaining: 25.1s
65:	learn: 0.2499573	total: 47.4s	remaining: 24.4s
66:	learn: 0.2462821	total: 48.1s	remaining: 23.7s
67:	learn: 0.2454267	total: 48.7s	remaining: 22.9s
68:	learn: 0.2430993	total: 49.4s	remaining: 22.2s
69:	learn: 0.2415570	total: 50.1s	remaining: 21.5s
70:	learn: 0.2398205	total: 50.8s	remaining: 20.7s
71:	learn: 0.2360458	total: 51.4s	remaining: 20s
72:	learn: 0.2330051	total: 52.1s	remaining: 19.3s
73:	learn: 0.2321253	total: 52.8s	remaining: 18.5s
74:	learn: 0.2278213	total: 53.5s	remaining: 17.8s
75:	learn: 0.2263960	total: 54.1s	remaining: 17.1s
76:	learn: 0.2255254	total: 54.7s	remaining: 16.3s
77:	learn: 0.2212046	total: 55.3s	remaining: 15.6s
78:	learn: 0.2188028	total: 56s	remaining: 14.9s
79:	learn: 0.2167146	total: 56.7s	remaining: 14.2s
80:	learn: 0.2161038	total: 57.3s	remaining: 13.4s
81:	learn: 0.2135779	total: 57.9s	rem

24:	learn: 0.4223960	total: 18.3s	remaining: 54.8s
25:	learn: 0.4110480	total: 19s	remaining: 54s
26:	learn: 0.4013853	total: 19.6s	remaining: 53.1s
27:	learn: 0.3954669	total: 20.3s	remaining: 52.3s
28:	learn: 0.3809666	total: 21.1s	remaining: 51.7s
29:	learn: 0.3786347	total: 21.8s	remaining: 50.8s
30:	learn: 0.3714637	total: 22.4s	remaining: 50s
31:	learn: 0.3628323	total: 23.2s	remaining: 49.2s
32:	learn: 0.3565521	total: 23.8s	remaining: 48.4s
33:	learn: 0.3504441	total: 24.4s	remaining: 47.4s
34:	learn: 0.3454462	total: 25s	remaining: 46.4s
35:	learn: 0.3408817	total: 25.6s	remaining: 45.6s
36:	learn: 0.3396113	total: 26.3s	remaining: 44.8s
37:	learn: 0.3382656	total: 26.9s	remaining: 43.9s
38:	learn: 0.3366654	total: 27.6s	remaining: 43.2s
39:	learn: 0.3321549	total: 28.2s	remaining: 42.4s
40:	learn: 0.3261214	total: 28.9s	remaining: 41.5s
41:	learn: 0.3206507	total: 29.6s	remaining: 40.9s
42:	learn: 0.3141157	total: 30.2s	remaining: 40s
43:	learn: 0.3121350	total: 30.9s	remaini

86:	learn: 0.1962429	total: 1m 38s	remaining: 14.7s
87:	learn: 0.1932555	total: 1m 39s	remaining: 13.6s
88:	learn: 0.1914235	total: 1m 40s	remaining: 12.4s
89:	learn: 0.1892403	total: 1m 41s	remaining: 11.3s
90:	learn: 0.1883797	total: 1m 42s	remaining: 10.1s
91:	learn: 0.1873010	total: 1m 43s	remaining: 8.97s
92:	learn: 0.1870717	total: 1m 43s	remaining: 7.83s
93:	learn: 0.1864175	total: 1m 44s	remaining: 6.69s
94:	learn: 0.1854578	total: 1m 45s	remaining: 5.55s
95:	learn: 0.1845976	total: 1m 46s	remaining: 4.43s
96:	learn: 0.1839029	total: 1m 46s	remaining: 3.31s
97:	learn: 0.1834249	total: 1m 47s	remaining: 2.19s
98:	learn: 0.1831782	total: 1m 48s	remaining: 1.09s
99:	learn: 0.1829576	total: 1m 48s	remaining: 0us
Mean accuracy with no stem and no tag = 78.60000000000001 %
