# NLP with the "Hello World" dataset IMDb

This notebook implements logistic regression and SGD to train the classification of sentiment of movie reviews.

Both models were saved in pickle, so they can later be used to predict unlabeled data, that I will do latar by building a Flask app.

In [18]:
import pyprind
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer


In [None]:
# Extract reviews to a csv file

basepath = '/Users/pedrojunqueira/Downloads/aclImdb'

In [None]:
labels = {'pos':1,'neg':0}

In [None]:
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos','neg'):
        path = os.path.join(basepath,s,l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path,file),
                     'r',encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]],
                          ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

In [None]:
# export this to csv and shufle the rows

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('/Users/pedrojunqueira/Google Drive/Documents/#18 DataMaster/Jupyter Projects/movie_data.csv', 
          index=False, encoding='utf-8')

In [11]:
# read from csv
df = pd.read_csv('../movie_data.csv')


In [12]:
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [None]:
# data clening
df.loc[0, 'review'][-50:]

In [13]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
' '.join(emoticons).replace('-', ''))
    return text

In [14]:
# test function
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [15]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [16]:
df['review'] = df['review'].apply(preprocessor)

In [None]:
df.review[:10]

In [None]:
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [None]:
# tokenize root words
tokenizer_porter('runners like running and thus they run')

In [None]:
# nltk stop words
stop = stopwords.words('english')

In [None]:
nltk.download('stopwords')

In [None]:
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

In [None]:
# process and train the data in batches
# create a tokenizer function
stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized
    
    

In [None]:
# generator for stream docs

def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label
            

In [None]:
# test generator
next(stream_docs(path='../movie_data.csv'))

In [None]:
# mini batch function
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y
    

In [None]:
# train utilising and SGD and updating weights in batches


vect = HashingVectorizer(decode_error='ignore',
n_features=2**21, preprocessor=None, tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1)


In [None]:
doc_stream = stream_docs(path='../movie_data.csv')

In [None]:
# process the batches
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
    

In [None]:
# create text set
X_test, y_test = get_minibatch(doc_stream, size=5000)
# vectorize test input
X_test = vect.transform(X_test)

In [None]:
# results
print(f'Accuracy: {clf.score(X_test, y_test)}')

In [None]:
# test with unlabeled data
label = {0:'negative', 1:'positive'}
example = ['I love this movie is amazing']
X = vect.transform(example)

In [None]:
print(f'prediction: {label[clf.predict(X)[0]]}\nprobability: {clf.predict_proba(X).max()*100:.2f}%')

In [None]:
dest = os.path.join('pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)

In [2]:
cur_dir = os.getcwd()
stop = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb'))
clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'),'rb'))
#loaded_model = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'),'rb'))

In [3]:
# Run the pre process functions
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

vect = HashingVectorizer(decode_error='ignore',
n_features=2**21, preprocessor=None, tokenizer=tokenizer)

# test with unlabeled data and pre process data
label = {0:'negative', 1:'positive'}
example = ['I love this movie is amazing']
X = vect.transform(example)

In [4]:
clf.predict(X)[0]

1

In [5]:
clf.predict_proba(X).max()

0.9588732697403057

In [6]:
# test with unlabeled data and pre process data
label = {0:'negative', 1:'positive'}
example = ['I do not like this movie']
X = vect.transform(example)

# train the full dataset. It may take around 45 minutes to run all the grid search and optimisation




In [25]:
# Create dataset for input Train and Test

X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values




In [20]:
# create fucntion tokenizer porter

porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [23]:
# prepare the pipeline processors

# tfidf vectorise to transform text in vector based on their frequency on sentence and corpus
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False,
                        preprocessor=None)


# define the grid of hyperparameter to validate the best model
param_grid = [ {'vect__ngram_range': [(1,1)], 'vect__stop_words': [stop, None],
                'vect__tokenizer': [tokenizer, tokenizer_porter],
                'clf__penalty': ['l1', 'l2'],
                'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None], 'vect__tokenizer': [tokenizer,tokenizer_porter], 
               'vect__use_idf':[False],
               'vect__norm':[None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}]



# build the pipeline of the logistic regression model
# first transform the vector then train on different hyperparameters

lr_tfidf = Pipeline([('vect', tfidf), ('clf',
                                       LogisticRegression(random_state=0, solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy',
                           cv=5, verbose=2, n_jobs= -1)




In [26]:
# Train model

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 81.8min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 719.8min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [30]:
gs_lr_tfidf.best_params_

{'clf__C': 10.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1),
 'vect__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'd

In [31]:
gs_lr_tfidf.best_score_

0.8952441902323907

In [32]:
clf_full = gs_lr_tfidf.best_estimator_

In [33]:
clf_full.score(X_test,y_test)

0.89688

In [42]:
X = ['this movie is the best movie I ever saw']

In [43]:
clf_full.predict(X)[0]

1

In [44]:
clf_full.predict_proba(X).max()

0.9889758731868111

In [50]:
dest = os.path.join('pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(clf_full, open(os.path.join(dest, 'clf_full.pkl'), 'wb'), protocol=4)

In [51]:
clf_test = pickle.load(open(os.path.join('pkl_objects', 'clf_full.pkl'),'rb'))


In [65]:

print(f'{clf_test.predict_proba(X).max():.4f}')
print(f'{clf_test.predict(X)[0]}')


0.9890
1


In [67]:
clf_full == clf_test

True