# 8. Applying Machine Learning to Sentiment Analysis



In [6]:
import pandas as pd
import os, sys
from tqdm.notebook import tqdm

BASEPATH = "data/raw/aclImdb"
labels = {"pos": 1, "neg": 0}

df = pd.DataFrame()

data = {"review": [], "sentiment": []}

for s in ("test", "train"):
    for l in ("pos", "neg"):
        path = os.path.join(BASEPATH, s, l)
        for file in tqdm(sorted(os.listdir(path)), desc="Getting files"):
            with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                txt = infile.read()
                data["review"].append(txt)
                data["sentiment"].append(labels[l])
            pbar.update()

df = pd.DataFrame(data)
df.head(1)


Getting files:   0%|          | 0/12500 [00:00<?, ?it/s]

Getting files:   0%|          | 0/12500 [00:00<?, ?it/s]

Getting files:   0%|          | 0/12500 [00:00<?, ?it/s]

Getting files:   0%|          | 0/12500 [00:00<?, ?it/s]

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1


In [7]:
import torch
import pandas as pd
import numpy as np

df_original = pd.read_csv("data/raw/imdb_reviews.csv")
df = df_original.sample(frac=1, ignore_index=True)
df.head()

Unnamed: 0,review,sentiment
0,Wrestlemania 14 is not often looked as one of ...,1
1,"It's hard to rate films like this, because do ...",1
2,Homicide: The Movie proved to be a good wrap-u...,1
3,"I did it too. When i first saw the band, i dis...",1
4,If You can watch a film without worrying about...,0


In [9]:
import re


def preprocessor(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    text = (re.sub("[\W]+", " ", text.lower())) + " ".join(emoticons).replace("-", "")
    return text


reviews_processed = [preprocessor(review) for review in df["review"].values]

df["review"] = reviews_processed
df.head()

Unnamed: 0,review,sentiment
0,wrestlemania 14 is not often looked as one of ...,1
1,it s hard to rate films like this because do y...,1
2,homicide the movie proved to be a good wrap up...,1
3,i did it too when i first saw the band i dismi...,1
4,if you can watch a film without worrying about...,0


In [10]:
X_train = df["review"].iloc[:25_000]
y_train = df["sentiment"].iloc[:25_000]

X_test = df["review"].iloc[25_000:]
y_test = df["sentiment"].iloc[25_000:]

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

log_reg = LogisticRegression()
X_train_processed = CountVectorizer().fit_transform(X_train[:10_000])
log_reg.fit(X_train_processed[:10_000], y_train[:10_000])
log_reg.score(X_train_processed[:10_000], y_train[:10_000])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9996

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


def tokenizer_porter(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split()]


def tokenizer(text):
    return text.split()


stop_words = stopwords.words("english")

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

small_param_grid = [
    {
        "vect__ngram_range": [(1, 1)],
        "vect__stop_words": [None],
        "vect__tokenizer": [tokenizer, tokenizer_porter],
        "clf__penalty": ["l2"],
        "clf__C": [1.0, 10.0],
    },
    {
        "vect__ngram_range": [(1, 1)],
        "vect__stop_words": [stop_words, None],
        "vect__tokenizer": [tokenizer, tokenizer_porter],
        "clf__penalty": ["l2"],
        "clf__C": [1.0, 10.0],
    },
]

lr_tfidf = Pipeline([("vect", tfidf), ("clf", LogisticRegression(solver="liblinear"))])

gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf, param_grid=small_param_grid, cv=4, verbose=2, n_jobs=-1
)

gs_lr_tfidf.fit(X_train, y_train)

print(f"Best Parameters: {gs_lr_tfidf.best_params_}")
print(f"Best Training Score: {gs_lr_tfidf.best_estimator_.score(X_train, y_train)}")
print(f"Best Test Score: {gs_lr_tfidf.best_estimator_.score(X_test, y_test)}")


Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV 1/2] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f952ab92670>;, score=0.883 total time=   7.2s
[CV 2/2] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f8dfb472670>;, score=0.886 total time=   7.7s
[CV 2/2] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7effdb63a670>;, score=0.875 total time=   8.1s
[CV 1/2] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fd23df02670>;, score=0.888 total time=   8.1s




[CV 1/2] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most'



[CV 1/2] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f51cea7f670>;, score=0.881 total time= 4.7min




[CV 2/2] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most'

# Online Algorithms and Out-of-Core Learning

In [18]:
import numpy as np
import re
from nltk.corpus import stopwords

stop_words = stopwords.words("english")


def tokenizer(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text.lower())
    text = re.sub("[\W]+", " ", text.lower()) + " ".join(emoticons).replace("-", "")
    tokenized = [w for w in text.split() if w not in stop_words]
    return tokenized


def stream_docs(path):
    with open(path, "r", encoding="utf-8") as csv:
        next(csv)
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label


def get_mini_batch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None

    return docs, y


from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(
    decode_error="ignore", n_features=2**21, preprocessor=None, tokenizer=tokenizer
)

clf = SGDClassifier(loss="log", random_state=42)
doc_stream = stream_docs(path="data/raw/imdb_reviews.csv")

import pyprind

pbar = pyprind.ProgBar(45,stream=sys.stdout)

classes = np.array([0, 1])

for _ in range(45):
    X_train, y_train = get_mini_batch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

X_test, y_test = get_mini_batch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print(f"Accuracy: {clf.score(X_test, y_test)}")


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:26


# Topic Modelling with Latent Drichlet Allocation

