In [None]:
import os
import nltk
import numpy as np
import operator
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score

In [None]:
def read_corpus(dataset, data_dir='aclImdb'):
    """Load the review texts and sentiment labels for one split of the corpus.

    The reviews are expected one per file under ``<data_dir>/<dataset>/pos``
    and ``<data_dir>/<dataset>/neg``.

    Parameters
    ----------
    dataset : str
        Name of the split sub-directory, e.g. ``'train'`` or ``'test'``.
    data_dir : str, optional
        Root directory of the corpus. Defaults to ``'aclImdb'`` relative to
        the current working directory.

    Returns
    -------
    corpus : list of str
        Raw text of every review, all positive reviews first, then negative.
    labels : list of int
        ``1`` for each positive review and ``0`` for each negative one, in the
        same order as ``corpus``.

    Raises
    ------
    FileNotFoundError
        If one of the expected directories does not exist.
    """
    corpus = []
    labels = []
    for rev, label in (('pos', 1), ('neg', 0)):
        rev_dir = os.path.join(data_dir, dataset, rev)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # corpus order (and therefore model results) reproducible across runs.
        for file in sorted(os.listdir(rev_dir)):
            # Decode explicitly as UTF-8 instead of relying on the platform's
            # locale encoding, which can fail or mis-decode non-ASCII text.
            with open(os.path.join(rev_dir, file), 'r', encoding='utf-8') as f:
                corpus.append(f.read())
            labels.append(label)
    return corpus, labels

In [None]:
def run_ml_model(model, vectorizer, corpus_train, corpus_test, y_train, y_test):
    """Vectorize the corpora, train ``model`` and evaluate it on the test set.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    vectorizer : vectorizer
        Object exposing ``fit_transform(docs)`` and ``transform(docs)``.
        It is fitted in place, so its vocabulary can be inspected afterwards.
    corpus_train, corpus_test : list of str
        Raw training and test documents.
    y_train, y_test : sequence
        Labels aligned with ``corpus_train`` and ``corpus_test``.

    Returns
    -------
    accuracy : float
        Fraction of test documents classified correctly.
    coeffs : array-like or None
        One weight per vocabulary entry: ``model.coef_[0]`` when the model has
        ``coef_``; otherwise, for a two-class model exposing
        ``feature_log_prob_``, the per-feature log-probability difference
        (second class minus first class); ``None`` if neither is available.
    """
    # Learn the vocabulary (and any IDF statistics) from the training
    # documents only, so nothing about the test set leaks into the features.
    # This also removes the dependence on a fixed 25000-document split.
    X_train = vectorizer.fit_transform(corpus_train)
    X_test = vectorizer.transform(corpus_test)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Not every estimator exposes ``coef_``; getattr with a default avoids an
    # AttributeError after the (potentially slow) fit has already succeeded.
    coef = getattr(model, 'coef_', None)
    if coef is not None:
        coeffs = coef[0]
    else:
        log_prob = getattr(model, 'feature_log_prob_', None)
        if log_prob is not None and len(log_prob) == 2:
            coeffs = log_prob[1] - log_prob[0]
        else:
            coeffs = None

    # accuracy_score expects (y_true, y_pred).
    return accuracy_score(y_test, y_pred), coeffs

Let's load our training and test corpora.

In [None]:
# Read each split into a list of review texts plus a parallel list of 0/1 labels.
corpus_train, y_train = read_corpus(dataset='train')
corpus_test, y_test = read_corpus(dataset='test')

In the first attempt at text classification, let's use logistic regression and the bag of words approach.

In [None]:
# Baseline: logistic regression on raw token counts (bag of words).
# The default iteration cap (100) is frequently too low for a high-dimensional
# count matrix and ends with a ConvergenceWarning, so allow more iterations.
model = LogisticRegression(max_iter=1000)
vectorizer = CountVectorizer()

We can get the model accuracy and the associated weights by running the `run_ml_model` function. See function definition above for what it does.

In [None]:
# Train and evaluate; keeps the test accuracy and the per-feature weights.
accuracy, weights = run_ml_model(
    model=model,
    vectorizer=vectorizer,
    corpus_train=corpus_train,
    corpus_test=corpus_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Accuracy value returned by run_ml_model for the test corpus.
accuracy

What are the words that are the strongest predictors of positive and negative reviews?

In [None]:
# The 20 lowest-weighted words, i.e. the strongest predictors of a negative
# review (label 0), ordered from most negative weight upwards.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
feature_names = np.asarray(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
# Index only the 20 entries needed instead of materialising the whole
# vocabulary in sorted order first.
tuple(feature_names[np.argsort(weights)[:20]].tolist())

In [None]:
# The 20 highest-weighted words, i.e. the strongest predictors of a positive
# review (label 1), in ascending order of weight (strongest last).
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
feature_names = np.asarray(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
# Index only the 20 entries needed instead of materialising the whole
# vocabulary in sorted order first.
tuple(feature_names[np.argsort(weights)[-20:]].tolist())

*On your own*: run a range of machine learning models to find the best performing one. In most cases you'll only have to modify the `model` and/or the `vectorizer`.

a) Use n-grams instead of the bag of words approach.

In [None]:
# TODO: build a vectorizer that emits n-grams (see CountVectorizer's
# `ngram_range` option), re-run run_ml_model and compare the accuracy.

b) Set limits on minimum and maximum document frequency. E.g. keep only words that appear in at least 5 reviews, and remove words that appear in more than 95% of reviews.

In [None]:
# TODO: build a vectorizer with document-frequency limits (see the `min_df`
# and `max_df` options), re-run run_ml_model and compare the accuracy.

c) Use TF-IDF instead of the bag of words or n-grams approach.

In [None]:
# Replace the count-based vectorizer with a TF-IDF one; `model` is unchanged.
vectorizer = TfidfVectorizer()

# TODO: re-run run_ml_model with this vectorizer and compare the accuracy.

d) Remove stop words (most commonly used words in a language).

In [None]:
# NOTE: the NLTK stop-word lists are a separate download
# (nltk.download('stopwords')) — confirm it is installed before use.
from nltk.corpus import stopwords

# TODO: pass a stop-word list to the vectorizer and re-run run_ml_model.

e) Use stemming/lemmatization.

In [None]:
# NOTE: WordNetLemmatizer relies on the NLTK 'wordnet' data
# (nltk.download('wordnet')) — confirm it is installed before use.
from nltk.stem import PorterStemmer, WordNetLemmatizer

# TODO: stem or lemmatize the tokens before vectorizing and re-run run_ml_model.

f) Use a Naive Bayes model instead of logistic regression.

In [None]:
# Replace the classifier with multinomial Naive Bayes; `vectorizer` keeps
# whatever value the most recently executed cell assigned to it.
model = naive_bayes.MultinomialNB()

# TODO: re-run run_ml_model with this model and compare the accuracy.