In [24]:
import os
from sklearn.model_selection import train_test_split

pos_examples = [open('../data/train/pos/' + f).read() for f in os.listdir('../data/train/pos')]
neg_examples = [open('../data/train/neg/' + f).read() for f in os.listdir('../data/train/neg')]

X = pos_examples + neg_examples
y = [1 if i < len(pos_examples) else 0 for i in range(len(pos_examples) + len(neg_examples))]

X_train, X_validate, y_train, y_validate = train_test_split(X, y, train_size=0.8, test_size=0.2)


In [89]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer = CountVectorizer(binary=False, analyzer='word', ngram_range=(1, 2)).fit(X_train)

training_features = vectorizer.transform(X_train)
validating_features = vectorizer.transform(X_validate)

sum_words = training_features.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

print(len(vectorizer.get_feature_names()))
print(words_freq[:100])


1384170
[('movi', 40162), ('film', 38285), ('one', 21849), ('like', 17694), ('it', 16081), ('time', 12506), ('good', 12291), ('thi', 12065), ('make', 11756), ('charact', 11262), ('get', 11217), ('watch', 11154), ('see', 11139), ('would', 10750), ('even', 10329), ('stori', 10261), ('realli', 9309), ('well', 8714), ('scene', 8313), ('look', 8007), ('much', 7852), ('show', 7760), ('end', 7641), ('could', 7521), ('bad', 7477), ('peopl', 7414), ('go', 7366), ('great', 7266), ('also', 7239), ('first', 7237), ('love', 7184), ('think', 7087), ('way', 7041), ('play', 6997), ('act', 6983), ('made', 6579), ('thing', 6506), ('know', 5956), ('say', 5938), ('seem', 5754), ('work', 5679), ('come', 5543), ('th', 5529), ('plot', 5515), ('two', 5512), ('in', 5492), ('actor', 5410), ('year', 5380), ('seen', 5313), ('mani', 5289), ('want', 5275), ('take', 5196), ('never', 5187), ('littl', 5127), ('best', 5117), ('life', 5075), ('tri', 4999), ('man', 4843), ('ever', 4784), ('better', 4606), ('give', 4570),

In [26]:
from naive_bayes import BernoulliNaiveBayes
from sklearn.metrics import classification_report, accuracy_score

#### EXPERIMENT 0: Bernoulli vs. Multinomial Naive Bayes with and without stop words

In [90]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

def clean(X, params={'lemmatize': True, 'stem': True}):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    X_clean = []
    negative = set(['not', 'isnt', 'no', 'n\'t', 'never', 'can\'t', 'won\'t', 'don\'t', 'havn\'t', 'didn\'t', 'hasn\'t', 'wouldn\'t', 'couldn\'t', 'shouldn\'t'])
    for x_i in X:
        x_i = re.sub(r'<.*?>', '', x_i)
        tokens = word_tokenize(x_i)
        clean = [w for w in tokens if w not in stop_words]
        if params['stem'] and not params['lemmatize']:
            clean = [stemmer.stem(w) for w in clean]
        if params['lemmatize'] and not params['stem']:
            clean = [lemmatizer.lemmatize(w) for w in clean]
        if params['stem'] and params['lemmatize']:
            clean = [lemmatizer.lemmatize(stemmer.stem(w)) for w in clean]
        negated = ['not_' + clean[i] if clean[i-1] in negative else clean[i] for i in range(len(clean))]
        remove_negative = [w for w in negated if w not in negative]
        clean_2 = " ".join(remove_negative)
        X_clean.append(' '.join(clean))
    return X_clean

#X_train, X_validate = clean(X_train), clean(X_validate)

num_words = vectorizer.vocabulary_.__len__()/2

for x in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]:
    print('USING {}% of most frequent word features.'.format(x*100))
    features = int(num_words*x)
    print(features)
    vectorizer = CountVectorizer(binary=False, analyzer='word', max_features = features, stop_words=None, ngram_range=(1, 2))
    training_features = vectorizer.fit_transform(X_train)
    tfidf = TfidfTransformer()
    normalizer = Normalizer()
    train_tfidf = normalizer.fit_transform(tfidf.fit_transform(training_features))
    validating_features = vectorizer.transform(X_validate)
    validate_tfidf = normalizer.transform(tfidf.transform(validating_features))
    model_lr = LogisticRegression(C=10)
    model_lr.fit(train_tfidf, y_train)
    predictions = model_lr.predict(validate_tfidf)
    accuracy = accuracy_score(y_validate, predictions)
    model_multi_nb = MultinomialNB()
    model_multi_nb.fit(train_tfidf, y_train)
    predictions_2 = model_multi_nb.predict(validate_tfidf)
    accuracy_2 = accuracy_score(y_validate, predictions_2)
    
    results['bernoulli'].append(accuracy_2)
    results['multinomial'].append(accuracy_2)
    
    print('LR: {}\tMultinomial: {}'.format(accuracy, accuracy_2))

USING 5.0% of most frequent word features.
34604




LR: 0.899	Multinomial: 0.8828
USING 10.0% of most frequent word features.
69208




LR: 0.9004	Multinomial: 0.8846
USING 15.0% of most frequent word features.
103812




LR: 0.8998	Multinomial: 0.8864
USING 20.0% of most frequent word features.
138417




LR: 0.9018	Multinomial: 0.8846
USING 25.0% of most frequent word features.
173021




LR: 0.902	Multinomial: 0.8852
USING 30.0% of most frequent word features.
207625




LR: 0.902	Multinomial: 0.8846
USING 35.0% of most frequent word features.
242229




LR: 0.903	Multinomial: 0.8852
USING 40.0% of most frequent word features.
276834




LR: 0.9018	Multinomial: 0.8854
USING 45.0% of most frequent word features.
311438




LR: 0.9016	Multinomial: 0.8856
USING 50.0% of most frequent word features.
346042




LR: 0.9014	Multinomial: 0.8858


#### EXPERIMENT 1: Stemming vs. lemmatizing and removal of html tags

In [44]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords


for params in [(False, False), (True, False), (False, True), (True, True)]:
    clean_params = {
        'lemmatize': params[0],
        'stem': params[1]
    }
    X_train_clean, X_validate_clean = clean(X_train, clean_params), clean(X_validate, clean_params)
    vectorizer = CountVectorizer(binary=False, analyzer='word')
    t_features = vectorizer.fit_transform(X_train_clean)
    v_features = vectorizer.transform(X_validate_clean)
    
    print('NUM FEATURES: {}'.format(vectorizer.vocabulary_.__len__()))
    
    model_bernoulli_nb = BernoulliNB()
    model_bernoulli_nb.fit(t_features, y_train)
    predictions = model_bernoulli_nb.predict(v_features)
    accuracy = accuracy_score(y_validate, predictions)
    
    model_multi_nb = MultinomialNB()
    model_multi_nb.fit(t_features, y_train)
    predictions_2 = model_multi_nb.predict(v_features)
    accuracy_2 = accuracy_score(y_validate, predictions_2)
    
    print('BERNOULLI PARAMS: {} ACCURACY: {}'.format(clean_params, accuracy))
    print('MULTINOMIAL PARAMS: {} ACCURACY: {}'.format(clean_params, accuracy_2))
    
    


NUM FEATURES: 69267
BERNOULLI PARAMS: {'lemmatize': False, 'stem': False} ACCURACY: 0.854
MULTINOMIAL PARAMS: {'lemmatize': False, 'stem': False} ACCURACY: 0.8612
NUM FEATURES: 65460
BERNOULLI PARAMS: {'lemmatize': True, 'stem': False} ACCURACY: 0.8548
MULTINOMIAL PARAMS: {'lemmatize': True, 'stem': False} ACCURACY: 0.8616
NUM FEATURES: 53407
BERNOULLI PARAMS: {'lemmatize': False, 'stem': True} ACCURACY: 0.8514
MULTINOMIAL PARAMS: {'lemmatize': False, 'stem': True} ACCURACY: 0.8628
NUM FEATURES: 53296
BERNOULLI PARAMS: {'lemmatize': True, 'stem': True} ACCURACY: 0.8522
MULTINOMIAL PARAMS: {'lemmatize': True, 'stem': True} ACCURACY: 0.8622


#### EXPERIMENT 2