In [6]:
# Preparing data

import pyprind
import pandas as pd
import os
import sys


# dataset: http://ai.stanford.edu/~amaas/data/sentiment/
# Root directory of the extracted archive. It must be bound to `basepath`:
# the loading loop joins this name with the split and label directory names.
basepath = '../datasets/aclImdb_v1/aclImdb'

In [12]:
# Progress bar with 50000 steps (presumably one per review file).
pbar = pyprind.ProgBar(50000, stream=sys.stdout)

labels = {'pos': 1, 'neg': 0}
data = []

# Walk <basepath>/<split>/<label>/ and read each file in sorted name order,
# pairing its full text with the numeric label of the directory it came from.
for split_name in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        review_dir = os.path.join(basepath, split_name, label_name)
        for review_file in sorted(os.listdir(review_dir)):
            review_path = os.path.join(review_dir, review_file)
            with open(review_path, 'r', encoding='utf-8') as handle:
                review_text = handle.read()
            data.append([review_text, labels[label_name]])
            pbar.update()

df = pd.DataFrame(data, columns=['review', 'sentiment'])

In [14]:
import numpy as np

# Seed NumPy's global generator so the shuffle below is repeatable.
np.random.seed(0)

# Reorder the rows by a random permutation of the index, then write them to
# a CSV with a header row and without the index column.
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [15]:
# Reload the saved CSV (this replaces the in-memory frame and its shuffled
# index with a fresh default index) and show the first three rows.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

In [16]:
# (rows, columns) of the reloaded frame.
df.shape

In [19]:
# Create bag-of-words

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()

# Three toy documents. The last entry is ONE string: the two adjacent
# literals are joined by implicit concatenation (there is no comma between
# them), so the array has three elements, not four.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    (
        'The sun is shining, the weather is sweet,'
        'and one and one is two'
    ),
])

# Learn the vocabulary and build the document-term count matrix in one step.
bag = count.fit_transform(docs)

# Show the term -> column mapping, then the dense count matrix.
for shown in (count.vocabulary_, bag.toarray()):
    print(shown)

In [22]:
# Word frequency and relevancy

from sklearn.feature_extraction.text import TfidfTransformer

# idf weighting on, smoothed idf, rows scaled to unit L2 norm.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

# Print floats with two decimals so the matrix stays readable.
np.set_printoptions(precision=2)

# Raw term counts first, then the tf-idf weighting of those counts.
term_counts = count.fit_transform(docs)
tfidf_weights = tfidf.fit_transform(term_counts)
print(tfidf_weights.toarray())

In [21]:
# Cleaning the data

# Last 50 characters of the review stored under index label 0.
df.loc[0, 'review'][-50:]

In [26]:
import re


def preprocessor(text):
    """Strip markup and punctuation from a review while keeping emoticons.

    Tag-like spans (``<...>``) are removed, emoticons such as ``:)``, ``;-(``
    or ``=D`` are collected, the remaining text is lowercased with every run
    of non-word characters collapsed into a single space, and the collected
    emoticons (with the ``-`` "nose" dropped) are appended at the end.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text followed by the space-separated emoticons. When no
        emoticon is found the cleaned text is returned as is.
    """
    # Raw strings: `\)`, `\(` and `\W` are regex escapes, not string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would fuse with the last word
    # whenever the text ends in a word character
    # ("good :) movie" -> "good movie:)").
    if emoticon_suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

In [30]:
# Try the cleaner on the tail of a stored review and on a synthetic string
# that contains a tag and several emoticons.
print(preprocessor(df.loc[0, 'review'][-50:]))
print(preprocessor("</a>This :) is :( a test :-)!"))

In [31]:
# Replace every raw review with its cleaned form (overwrites the column).
df['review'] = df['review'].apply(preprocessor)

In [36]:
# Word stemming

def tokenizer(text):
    """Split ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens


# Example call. NOTE(review): if this line shares a cell with the code that
# follows, its value is not displayed (only a cell's last expression is).
tokenizer('runners like running and thus they run')

from nltk.stem.porter import PorterStemmer

# One shared stemmer instance, reused by every call below.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and Porter-stem each resulting token."""
    return list(map(porter.stem, text.split()))


# Same sample sentence as for the plain tokenizer, now stemmed.
tokenizer_porter('runners like running and thus they run')

In [37]:
import nltk

# Fetch NLTK's 'stopwords' resource, which `stopwords.words(...)` reads later.
nltk.download('stopwords')

In [38]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

# Stem the sample sentence, then keep only the tokens that are not in the
# English stop-word list.
stemmed_tokens = tokenizer_porter('a runner likes running and runs a lot')
[token for token in stemmed_tokens if token not in stop]

In [39]:
# Training a logistic regression model

# Positional split: the first 25000 rows train, the remainder test.
# Label-based slicing with `.loc[:25000]` includes the row labelled 25000, so
# that row would land in BOTH sets; slicing the value arrays by position
# keeps the two sets disjoint.
n_train = 25000
reviews = df['review'].values
sentiments = df['sentiment'].values

X_train, X_test = reviews[:n_train], reviews[n_train:]
y_train, y_test = sentiments[:n_train], sentiments[n_train:]

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with accent stripping, lowercasing and preprocessing switched
# off; tokenizer, stop words and idf/norm settings are set by the grid below.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Settings that both sub-grids have in common.
_shared_grid = {
    'vect__ngram_range': [(1, 1)],
    'clf__penalty': ['l2'],
    'clf__C': [1.0, 10.0],
}

small_param_grid = [
    # Default tf-idf weighting: plain vs. Porter-stemmed tokens.
    {
        **_shared_grid,
        'vect__stop_words': [None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
    },
    # idf weighting and normalisation disabled: with vs. without stop words.
    {
        **_shared_grid,
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'vect__use_idf': [False],
        'vect__norm': [None],
    },
]

# Vectorizer feeding a logistic regression; step names match the grid keys.
lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='liblinear')),
])

# 5-fold cross-validated search scored by accuracy, using all available cores.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=small_param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,
)
gs_lr_tfidf.fit(X_train, y_train)

In [41]:
# Parameter combination with the best mean cross-validation score.
print(f'Best parameter set:: {gs_lr_tfidf.best_params_}')

In [42]:
# Mean cross-validated accuracy of the best parameter combination.
print(f'CV Accuracy: {gs_lr_tfidf.best_score_:.3f}')

# Take the best pipeline found by the search and score it on the test split.
clf = gs_lr_tfidf.best_estimator_
print(f'Test Accuracy: {clf.score(X_test, y_test):.3f}')

In [43]:
# Out-of-core learning

import numpy as np
import re
from nltk.corpus import stopwords

# English stop words; the `tokenizer` defined right below filters them out.
stop = stopwords.words('english')


def tokenizer(text):
    """Clean a raw review and split it into tokens without stop words.

    Tag-like spans (``<...>``) are removed, emoticons such as ``:)``, ``;-(``
    or ``=D`` are collected, the remaining text is lowercased with every run
    of non-word characters collapsed into a space, the emoticons (with the
    ``-`` "nose" dropped) are appended, and the result is split on
    whitespace. Tokens found in the module-level ``stop`` list are discarded.

    Args:
        text: Raw review text.

    Returns:
        List of word tokens followed by the emoticon tokens.
    """
    # Raw strings: `\)`, `\(` and `\W` are regex escapes, not string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # The explicit space keeps the first emoticon from fusing with the last
    # word ("good :) movie" must not produce the token "movie:)"); split()
    # ignores any surplus whitespace this introduces.
    combined = cleaned + ' ' + emoticon_suffix
    return [w for w in combined.split() if w not in stop]

In [44]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a headered CSV file.

    The first line is treated as a header and skipped. Every following line
    is split at its LAST comma: everything before it is returned verbatim as
    the text (CSV quoting included), everything after it is parsed as the
    integer label.

    Limitation: the file is read line by line, so a quoted field containing
    an embedded newline is not supported.

    Args:
        path: Path of the UTF-8 encoded CSV file.

    Yields:
        Tuples ``(text, label)`` with ``text`` a ``str`` and ``label`` an
        ``int``.

    Raises:
        ValueError: If the part after the last comma is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration inside the generator (a RuntimeError under PEP 479).
        next(csv_file, None)
        for line in csv_file:
            # Strip the line terminator instead of assuming a fixed-width
            # ",<digit>\n" tail, so a last line without a newline still parses.
            line = line.rstrip('\r\n')
            if not line:
                continue
            text, _, label = line.rpartition(',')
            yield text, int(label)


# Smoke test: pull the first (text, label) pair from the saved CSV.
next(stream_docs(path='movie_data.csv'))

In [45]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Args:
        doc_stream: Iterator yielding ``(text, label)`` pairs.
        size: Maximum number of pairs to take.

    Returns:
        ``(docs, y)``: parallel lists of texts and labels. The batch holds
        fewer than ``size`` items when the stream runs out part-way through.
        ``(None, None)`` is returned only when the stream was already
        exhausted and not a single document could be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                return None, None
            # Stream ran dry mid-batch: hand back what was collected instead
            # of discarding it.
            break
        docs.append(text)
        y.append(label)
    return docs, y

In [55]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing vectorizer with 2**21 output features; tokenization is delegated to
# the `tokenizer` function and undecodable bytes are ignored.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer
)
# Linear classifier trained by SGD on the logistic (log) loss, seeded for
# repeatability.
clf = SGDClassifier(loss='log_loss', random_state=1)
# Generator over the saved CSV; later cells consume it batch by batch.
doc_stream = stream_docs(path='movie_data.csv')

In [56]:
import pyprind

pbar = pyprind.ProgBar(45, stream=sys.stdout)

# partial_fit has to be told the full label set, because a single batch is
# not guaranteed to contain both classes.
classes = np.array([0, 1])

# Up to 45 incremental updates of 1000 documents each; stop early as soon as
# the stream yields no batch.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if X_train:
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()
    else:
        break

In [None]:
# `X_test` from the earlier train/test split is an array of raw text, which
# the classifier cannot consume. Draw the held-out batch from the document
# stream instead and hash it with the same vectorizer used for training.
# NOTE(review): assumes 5000 documents remain after the 45 x 1000 training
# batches - confirm against the size of movie_data.csv.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print(f'Accuracy: {clf.score(X_test, y_test):.3f}')

# With the evaluation done, fold the held-out batch into the model as well.
clf = clf.partial_fit(X_test, y_test)

In [60]:
# LDA

import pandas as pd

# Reload the raw reviews. The rename only has an effect when the CSV header
# uses the column names '0' and '1'; otherwise the frame passes through
# unchanged.
df = (
    pd.read_csv('movie_data.csv', encoding='utf-8')
    .rename(columns={'0': 'review', '1': 'sentiment'})
)

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

# Term counts for topic modelling: drop English stop words, ignore terms
# whose document frequency exceeds 10% (max_df=.1) and cap the vocabulary at
# 5000 terms.
count = CountVectorizer(
    stop_words='english',
    max_df=.1,
    max_features=5000
)
X = count.fit_transform(df['review'].values)

In [62]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 10-topic LDA model with the batch learning method on all cores;
# random_state fixes the seed.
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=123,
    learning_method='batch',
    n_jobs=-1
)
# fit_transform returns one row of topic weights per document.
X_topics = lda.fit_transform(X)

In [63]:
# Shape of the fitted topic-term matrix.
lda.components_.shape

In [64]:
n_top_words = 5
feature_names = count.get_feature_names_out()

# For every topic, list the n_top_words terms with the largest weights,
# heaviest first.
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {(topic_idx + 1)}:')
    # argsort is ascending, so walk the last n_top_words entries backwards.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    top_words = [feature_names[i] for i in top_indices]
    print(' '.join(top_words))

In [65]:
# Rank documents by their weight for topic column 5 (the sixth topic),
# strongest first. The "horror" label is the author's reading of that topic.
topic_weights = X_topics[:, 5]
horror = topic_weights.argsort()[::-1]

# Show the first 300 characters of the three highest-ranked reviews.
for iter_idx, movie_idx in enumerate(horror[:3]):
    print(f'\nHorror movie #{(iter_idx + 1)}:')
    excerpt = df['review'][movie_idx][:300]
    print(excerpt, '...')