In [81]:
import os.path as op
import numpy as np
import string

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

#import nltk
from nltk import SnowballStemmer
from nltk import pos_tag

In [82]:
# Load data
print("Loading dataset")

from glob import glob
filenames_neg = sorted(glob(op.join('..', 'data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('..', 'data', 'imdb1', 'pos', '*.txt')))
filenames_stop_words = sorted(glob(op.join('..','data','english.stop')))


stop_words_list = [open(f).read() for f in filenames_stop_words][0].split()

texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))
print("punctuation = ", string.punctuation)

Loading dataset
2000 documents
punctuation =  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [168]:
### Preprocess

def text_process(texts, stem = False, tag = False):
    """
        Takes in a list of texts, then performs the following:
        1. Remove all punctuation for each text
        2. Remove all stopwords for each text
        3. Returns the cleaned texts
        
        if stem == True, change each word to its stem.
        if tag == True, we'll keep only noun, adverbs, verbs and adjectives.
    """
    cleaned_texts = []
    keep_cat = ['NN','ADV','VB','JJ']
    for text in texts :
        list_text = []
        # Check characters to see if they are in punctuation
        nopunc = [char for char in text if char not in string.punctuation]
    
        # Join the characters again to form the string.
        nopunc = ''.join(nopunc)
    
        # Now just remove any stopwords and stem words
        list_words = nopunc.split()
        if stem == False :
            if tag == False :
                list_text = [word for word in list_words if word.lower() not in stop_words_list] #Question 2
            else :
                for word in nopunc.split() :
                    print(pos_tag([stemmer.stem(word)]))
                list_text = [word for word in list_words if word.lower() not in stop_words_list] #Question 4
                #print(list_text)
        else :
            if tag == False :
                list_text = [stemmer.stem(word) for word in list_words if word.lower() not in stop_words_list] #Question 3
            else :
                list_text = [stemmer.stem(word) for word in list_words if (word.lower() not in stop_words_list and pos_tag([word])[0][1] not in keep_cat)] #Question 3
                
                    
        new_text = ' '.join(list_text)
        cleaned_texts.append(new_text)
       
    return cleaned_texts


In [169]:
stemmer = SnowballStemmer("english")
vectorizer = CountVectorizer()

In [170]:
texts_processed = text_process(texts, stem = False, tag = False)
X = vectorizer.fit_transform(texts_processed)

In [171]:
texts_processed_stem = text_process(texts, stem = True, tag = False)
X_stem = vectorizer.fit_transform(texts_processed_stem)

In [172]:
texts_processed_stem_tag = text_process(texts, stem = True, tag = True)
X_stem_tag = vectorizer.fit_transform(texts_processed_stem_tag)

In [173]:
print(X.shape)
print(X_stem.shape)
print(X_stem_tag.shape)

(2000, 47026)
(2000, 31375)
(2000, 12700)


In [175]:
#Naive Bayesien
from sklearn.naive_bayes import MultinomialNB
print("Testing Naive Bayesien with cross validation 5-folds: ")
 

clf = MultinomialNB()

print("Mean accuracy with no stem and no tag = {acc} %", 100*cross_val_score(clf,X,y, cv = 5).mean())
    
print("Mean accuracy with stem and no tag = {acc} %", 100*cross_val_score(clf,X_stem,y, cv = 5).mean())
   
print("Mean accuracy with stem and tag = {acc} %", 100*cross_val_score(clf,X_stem_tag,y, cv = 5).mean())


Testing Naive Bayesien with cross validation 5-folds: 
Mean accuracy with no stem and no tag = {acc} % 80.74999999999999
Mean accuracy with stem and no tag = {acc} % 80.4
Mean accuracy with stem and tag = {acc} % 76.55000000000001


In [79]:
# Linear SVC
from sklearn.svm import LinearSVC
print("Testing Linear SVC : ")



for c in [0.1, 0.5, 1, 1.5, 3] :
    clf = LinearSVC(C = c)
    print(cross_val_score(clf,X,y, cv = 5).mean())

clf = LinearSVC(C = 0.1)
print(cross_val_score(clf,X_stem,y, cv = 5).mean())

Testing Linear SVC : 
0.825
0.8240000000000001
0.8225
0.8215
0.8220000000000001
0.8164999999999999


In [78]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
print("Testing LogisticRegression : ")
clf = LogisticRegression()
print(cross_val_score(clf,X,y, cv = 5).mean())
print(cross_val_score(clf,X_stem,y, cv = 5).mean())

Testing LogisticRegression : 
0.8305
0.825
