In [None]:
# loading the data into a frame
import pyprind 
import pandas as pd 
import os 

# Read every review file of the aclImdb dataset into one DataFrame with the
# columns 'review' (raw text) and 'sentiment' (1 = positive, 0 = negative).
basepath = '/users/toul/downloads/aclImdb'  # adjust to the local dataset location
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}

# Collect the rows in a plain list and build the frame once at the end:
# DataFrame.append copied the whole frame on every call (quadratic) and no
# longer exists in pandas >= 2.0.
records = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            # Explicit encoding so the result does not depend on the platform default.
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                records.append((infile.read(), labels[l]))
            pbar.update()

df = pd.DataFrame(records, columns=['review', 'sentiment'])

In [None]:
# shuffling the data frame using the permutation function and checking to see if it has been properly loaded 
import numpy as np 
# Seed the global NumPy RNG so the shuffled order is reproducible.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
# Persist the shuffled reviews (without the index column) for the streaming cells.
df.to_csv('./movie_data.csv', index=False)
df.head(3)

In [None]:
# creating a bag-of-words model: allows us to represent text as numerical feature vectors. 
# 1. We create a vocabulary of unique tokens--for example, words--from the entire set of documents 
# 2. We construct a feature vector from each document that contains the counts of how often each word occurs
# in the particular document.
# Each vector will consist mostly of zeros, hence the name sparse vector.


In [None]:
# TRANSFORMING WORDS INTO FEATURE VECTORS 
# stores the data into a dictionary
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer 
# Three toy documents used to illustrate the bag-of-words representation.
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'THe sun is shining and the weather is sweet'])
# Learn the vocabulary and turn the documents into a sparse count matrix.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [None]:
# checking 
print(count.vocabulary_)

In [None]:
print(bag.toarray())

In [None]:
# raw term frequencies : tf(t,d) - The number of times a term t occurs in a document d 
# Also, known as the 1-gram or unigram model where each item or token in the voc. represents a single word 
# n-gram: contiguous sequence of items in NLP: words, letters, or symbols 
# n-grams of size 3 or 4 => good for filtering spam messages
# 1-gram: "the", "sun", "is", "shining"
# 2-gram: "the sun", "sun is", "is shining" => ngram_range=(2,2)



In [None]:
# ASSESSING WORD RELEVANCY VIA TERM FREQUENCY-INVERSE DOCUMENT FREQUENCY 
# term frequency-inverse document frequency; downweights frequently occurring words in the feature vectors 
# defined as the product of term frequency and inverse document frequency 

from sklearn.feature_extraction.text import TfidfTransformer 
# Show the tf-idf weights of the toy documents with two decimals.
np.set_printoptions(precision=2)
tfidf = TfidfTransformer()
term_counts = count.fit_transform(docs)
print(tfidf.fit_transform(term_counts).toarray())

In [None]:
# Cleaning Text Data 
# stripping all of the unwanted characters from the document
df.loc[0, 'review'][-50:]

In [None]:
import re
def preprocessor(text):
    """Clean a raw review and keep its emoticons.

    HTML tags are removed, the text is lowercased, every run of non-word
    characters is collapsed to a single space, and the emoticons found in
    the original text (with the '-' nose removed) are appended at the end.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text followed by the space-separated emoticons.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons are collected before lowercasing so ':D' and ':P' still match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_str = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be fused to the last word
    # whenever the text ends in a word character ("... really" + ":)").
    if emoticon_str and cleaned and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_str

In [None]:
# in general it isn't good to use regex to parse HTML mark-up 
# checking that the preprocessor works 

preprocessor(df.loc[0, 'review'][-50:])

In [None]:
# Clean every review in place: the raw text in df['review'] is overwritten with
# the output of `preprocessor`.
df['review'] = df['review'].apply(preprocessor)

In [None]:
# PROCESSING DOCUMENTS INTO TOKENS 
# splitting the words apart 
# word-stemming: the process of transforming a word into its root form, e.g. with the Porter stemmer from NLTK
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with ``porter``."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

tokenizer_porter('runners like running and thus they run')

In [None]:
# see http://www.nltk.org/book/ for a book that covers the stemming algorithm 
# other popular stemming algorithms: the Snowball stemmer and the Lancaster stemmer 

# stop-word removal: words that are extremely common in all sorts of txts 
import nltk 
nltk.download('stopwords')
# load and apply the english stopword

from nltk.corpus import stopwords 
# English stop-word list from the NLTK 'stopwords' corpus.
stop = stopwords.words('english')
# Stem the example sentence, keep at most its last ten tokens, and drop every
# token that appears in the stop-word list.
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]


In [None]:
# TRAINING A LOGISTIC REGRESSION MODEL FOR DOCUMENT CLASSIFICATION 

#training dataset
# Split by POSITION, not by index label: the frame was shuffled with reindex(),
# so its labels are in random order and a label slice such as .loc[:25000]
# would cut at wherever label 25000 happens to sit. Label slices are also
# inclusive, which would put that row into both sets.
x_train = df['review'].iloc[:25000].values
y_train = df['sentiment'].iloc[:25000].values
# test dataset
x_test = df['review'].iloc[25000:].values
y_test = df['sentiment'].iloc[25000:].values


In [None]:
# using GridSearchCV to find the optimal set of parameters for the LR model using 5-fold stratified cross-validation 
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer 
# The vectorizer does no cleaning of its own (no accent stripping, no
# lowercasing, no preprocessor); tokenizer and stop words are chosen by the grid.
tfidf = TfidfVectorizer(strip_accents = None, lowercase=False, preprocessor=None)

# Two sub-grids over the same tokenizer / stop-word / regularisation choices:
#   1. tf-idf features with the vectorizer's default idf weighting and norm
#   2. raw term frequencies (idf weighting and normalisation switched off)
param_grid = [{'vect__ngram_range': [(1,1)], 
              'vect__stop_words': [stop, None], 
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]}, 
              {'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}
             ]

# The grid contains penalty='l1', which the default 'lbfgs' solver of current
# scikit-learn releases rejects; 'liblinear' supports both 'l1' and 'l2'.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(x_train, y_train)

In [None]:
# printing the best parameter set
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

In [None]:
# report the average 5-fold cross-validation accuracy of the best model on the training set 
# and its classification accuracy on the test dataset

print('CV accuracy: %.3f' % gs_lr_tfidf.best_score_)


In [None]:
# Pipeline configured with the best parameter combination found by the grid search.
clf = gs_lr_tfidf.best_estimator_ 
# Score that pipeline on the held-out reviews.
print('Test accuracy: %.3f' % clf.score(x_test, y_test))

In [None]:
# WORKING WITH BIGGER DATA--ONLINE ALGORITHMS AND OUT-OF-CORE LEARNING 
# streaming the movie data from the local drive and then training an LR model 

import numpy as np 
import re 
from nltk.corpus import stopwords 
stop = stopwords.words('english')
def tokenizer(text):
    """Clean a raw review and split it into tokens without stop words.

    HTML tags are removed, the text is lowercased, runs of non-word
    characters are collapsed to spaces, and the emoticons found in the
    original text (with the '-' nose removed) are appended as extra tokens.
    Tokens contained in the module-level ``stop`` list are dropped.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        The remaining tokens, emoticons last.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Match emoticons on the original-case text: after lowercasing, ':D' and
    # ':P' become ':d' / ':p' and no longer match the pattern.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_str = ' '.join(emoticons).replace('-', '')
    # The explicit separator keeps the first emoticon from being fused to the
    # last word; split() discards any surplus whitespace.
    tokens = (cleaned + ' ' + emoticon_str).split()
    tokenized = [w for w in tokens if w not in stop]
    return tokenized

# generator function 
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    Every data line is expected to end in ``,<digit>``: the final character
    is parsed as the integer label and everything before the separating
    comma is yielded unchanged as the text (CSV quoting is not undone).
    The first line is treated as a header and skipped.

    NOTE(review): this assumes one record per physical line; a quoted field
    containing a newline would not be handled - confirm against the data.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        Review text and its label.
    """
    with open(path, 'r', encoding='utf-8') as infile:
        # Default of None: an empty file simply yields nothing, instead of a
        # StopIteration escaping the generator as a RuntimeError.
        next(infile, None)  # skip the header
        for line in infile:
            # Strip the newline first so a last line without one parses too.
            row = line.rstrip('\n')
            if not row:
                continue  # ignore blank lines
            text, label = row[:-2], int(row[-1])
            yield text, label
            


In [None]:
# checking stream docs function 
next(stream_docs(path='./movie_data.csv'))

In [None]:
# getting mini-batches 
def get_minibatch(doc_stream, size):
    """Take up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists. If the stream ends before
        ``size`` documents were read, the shorter batch collected so far is
        returned. ``(None, None)`` is returned only when the stream was
        already exhausted and no document could be read.
    """
    docs, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)
    # Previously a partially filled final batch was thrown away; now only a
    # truly empty read signals the end of the stream.
    if exhausted and not docs:
        return None, None
    return docs, y
        
# not using CountVectorizer b/c it requires holding the whole vocabulary in memory 
# HashingVectorizer instead hashes the tokens with the 32-bit MurmurHash3 function 
from sklearn.feature_extraction.text import HashingVectorizer 
from sklearn.linear_model import SGDClassifier
# Stateless vectorizer: tokens are hashed into 2**21 feature columns.
# NOTE: HashingVectorizer lowercases its input by default before calling the
# tokenizer, so `tokenizer` receives already-lowercased text here.
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
# Logistic regression trained by SGD. `n_iter` was removed from SGDClassifier
# in scikit-learn 0.21 (replaced by `max_iter`), and the logistic loss is
# named 'log_loss' since 1.1 ('log' was removed in 1.3).
clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)
# Generator over the shuffled reviews written to disk earlier.
doc_stream = stream_docs(path='./movie_data.csv')

In [None]:
# now using out-of-core learning ("core" refers to main memory, not the CPU)
# very memory efficient and fast
import pyprind
# Progress bar sized to the number of mini-batches below.
pbar = pyprind.ProgBar(45)
# All labels that can occur; handed to partial_fit via `classes`.
classes = np.array([0, 1])
# Train incrementally on up to 45 mini-batches of 1000 documents each.
for _ in range(45):
    x_train, y_train = get_minibatch(doc_stream, size=1000)
    # A falsy x_train means get_minibatch could not deliver a batch: stop early.
    if not x_train:
        break
    # Replace the raw texts with their hashed feature matrix.
    x_train = vect.transform(x_train)
    clf.partial_fit(x_train, y_train, classes=classes)
    pbar.update()
    

In [None]:
# Take the next 5000 documents from the same stream (those the training loop
# did not consume) and use them as the test set.
x_test, y_test = get_minibatch(doc_stream, size=5000)
x_test = vect.transform(x_test)
print('Accuracy: %.3f' % clf.score(x_test, y_test))

In [None]:
# Update the classifier with the test batch as well.
clf = clf.partial_fit(x_test, y_test)

In [None]:
# common extension of the bag-of-words model is the Latent Dirichlet Allocation 
# another one is the word2vec by google 



In [None]:
# EMBEDDING A MACHINE LEARNING MODEL INTO A WEB APPLICATION 
# used for spam detection in submission forms, search engines, recommendation systems for media or shopping portals
# and many more


In [None]:
# SERIALIZING FITTED SCIKIT-LEARN ESTIMATORS 
# Model persistence => pickle library

import pickle 
import os 
# Serialize the stop-word list and the trained classifier into
# movieclassifier/pkl_objects.
dest = os.path.join('movieclassifier', 'pkl_objects')
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(dest, exist_ok=True)
# Context managers make sure each file is flushed and closed after the dump.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as f:
    pickle.dump(stop, f, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as f:
    pickle.dump(clf, f, protocol=4)

# created a movie directory
# pkl_objects => subdirectory to save serialized Python to the local drive.
# can use the joblib library as well 


In [None]:
import pickle
import re
import os 
from vectorizer import vect 
# NOTE(review): the path is relative to the working directory; the pickle was
# written to 'movieclassifier/pkl_objects', so this presumably runs from
# inside 'movieclassifier' - confirm.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb') as f:
    clf = pickle.load(f)