In [1]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# English stop words from NLTK. tokenizer() filters its tokens against this
# collection, and the same object is pickled to disk in a later cell.
stop = stopwords.words('english')
# NOTE(review): `porter` is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
porter = PorterStemmer()

def tokenizer(text):
    """Turn a raw review into a list of cleaned tokens.

    Processing steps, in order:

    1. Remove anything that looks like a markup tag (``<...>``).
    2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``.
    3. Lowercase the text and collapse every run of non-word characters
       into a single space.
    4. Append the emoticons, with the ``-`` "nose" removed.
    5. Split on whitespace and drop stop words.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens that are not contained in the module-level ``stop``
        collection. Word tokens are lowercase; emoticons keep their case.
    """
    # Raw strings: '\)', '\(' and '\W' are not valid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text. The pattern lists upper-case 'D' and
    # 'P', so matching against text.lower() could never find ':D' or ':P'.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Explicit ' ' separator: without it the first emoticon is glued onto
    # the last word whenever the cleaned text ends in a word character.
    text = (re.sub(r'[\W]+', ' ', text.lower())
            + ' ' + ' '.join(emoticons).replace('-', ''))
    return [w for w in text.split() if w not in stop]

def stream_docs(path, encoding='utf-8'):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per row.

    The first line is treated as a header and skipped. Every following
    non-empty line is expected to end with a comma followed by a
    single-digit label (``<text>,<digit>``). The text part is returned
    verbatim, including any surrounding quotes.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    encoding : str, optional
        Text encoding used to read the file. Defaults to ``'utf-8'`` so the
        result does not depend on the platform's locale.

    Yields
    ------
    tuple of (str, int)
        The review text and its integer label.

    Raises
    ------
    ValueError
        If the last character of a data line is not a digit.
    """
    with open(path, 'r', encoding=encoding) as csv_file:
        # The default keeps an empty file from raising StopIteration inside
        # the generator, which Python turns into a RuntimeError (PEP 479).
        next(csv_file, None)
        for line in csv_file:
            # Strip the line terminator first so the final line parses the
            # same way whether or not the file ends with a newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines, e.g. at end of file
            text, label = line[:-2], int(line[-1])
            yield text, label

In [2]:
# Sanity check: pull the first (text, label) pair from a fresh generator.
next(stream_docs(path='./movie_data.csv'))

('"Just watched this movie over the weekend, and I must say I thoroughly enjoyed it. The 2 Italo American actors are excellent as usual (Michael Imperioli and John Ventimiglia). It is obvious that the director was influenced by 2 great films of the past directed by Italians. Primarily he was influenced by Dino Risi and his film IL SORPASSO. It is the story of 2 young men who meet by chance and become friends. One is extroverted and the other is introverted. They enjoy the whole day together and by the end of the day, the shy one learns that there is more to life than his usual routine monotony. The same thing happens to Albert De Santi. Unfortunately, IL SORPASSO has a very similar ending and this apparently influenced the director of ON THE RUN because he uses the same technique but with a twist. I had expected something but was surprised to see that it turned out to be the opposite. If you watch both movies you will understand. The other film that influenced the director is AFTER HOU

In [3]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from an iterator of (text, label) pairs.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` tuples; it is advanced in place.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, labels)`` as two parallel lists. If the stream ends before
        ``size`` items were read, the items collected so far are returned
        as a shorter final batch instead of being discarded. If a batch was
        requested (``size > 0``) but the stream was already exhausted,
        ``(None, None)`` is returned.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            # Keep whatever was read; dropping it would silently lose data.
            break
        docs.append(text)
        y.append(label)
    if size > 0 and not docs:
        return None, None
    return docs, y

In [11]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing-based vectorizer with 2**21 feature slots. It uses the tokenizer()
# defined above and needs no fitting, so transform() can be called on each
# mini-batch directly.
vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

# Logistic-regression classifier trained incrementally via partial_fit().
# NOTE(review): `n_iter` and `loss='log'` appear to be spellings from older
# scikit-learn releases (later ones use `max_iter` / `'log_loss'`) — confirm
# against the installed version before upgrading.
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
# Single shared generator: every later get_minibatch() call continues from
# wherever the previous one stopped reading.
doc_stream = stream_docs(path='./movie_data.csv')


In [12]:
import pyprind

# Training schedule. Named once so the progress bar and the loop cannot
# drift apart (the batch count used to be repeated as a bare literal).
N_BATCHES = 45
BATCH_SIZE = 1000

pbar = pyprind.ProgBar(N_BATCHES)

# partial_fit() is given the full label set on every call.
classes = np.array([0, 1])
for _ in range(N_BATCHES):
    X_train, y_train = get_minibatch(doc_stream, size=BATCH_SIZE)
    if not X_train:
        # Stream ran dry before all scheduled batches were consumed.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:38


In [6]:
# Evaluate on the next 5,000 documents read from the shared doc_stream.
# NOTE(review): get_minibatch can return (None, None) once the stream is
# exhausted, which would make vect.transform fail here — confirm the file
# still has rows left at this point.
X_test, y_test = get_minibatch(doc_stream, size=5000)
# X_test is rebound from a list of raw texts to the vectorized matrix.
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.868


In [7]:
# Fold the evaluation batch into the model as well. X_test must already be
# the vectorized matrix from the evaluation cell, not the raw text list.
# Re-running this cell applies another update with the same data.
clf = clf.partial_fit(X_test, y_test)

## Serializing fitted scikit-learn estimators

After training the logistic regression model as shown above, we now save the classifier along with the stop words as serialized objects to our local disk so that we can use the fitted classifier in our web application later. The Porter stemmer and the HashingVectorizer are not pickled in the cell below.

In [10]:
import pickle
import os

# Target directory for the serialized objects.
dest = os.path.join('movieclassifier', 'pkl_objects')
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory in between.
os.makedirs(dest, exist_ok=True)

# Context managers make sure each file is flushed and closed as soon as the
# dump finishes, instead of leaving the handle open until garbage collection.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as pkl_file:
    pickle.dump(stop, pkl_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as pkl_file:
    pickle.dump(clf, pkl_file, protocol=4)

Using the pickle.dump method, we serialized the classifier object to save its current state so that we don't have to retrain the classifier every time. We also serialized the stop words from the NLTK library so that we don't have to install the NLTK vocabulary on our server.

We don't need to pickle the HashingVectorizer, since it does not need to be fitted.
Instead, we can create a new Python script file, from which we can import the
vectorizer into our current Python session.