In [169]:
import pandas as pd
import numpy as np
from pathlib import Path
import nltk
from nltk.stem.porter import PorterStemmer
import string
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from collections import Counter
import gc

In [170]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\automacao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\automacao\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

We are going to use some approaches for sentiment analysis in order to study their performances.

# Dataset
### Large Movie Review Dataset
Downloaded from http://ai.stanford.edu/~amaas/data/sentiment/

# Summary

- [Approach 1](#approach1): Bag of Words (word **occurrences**)
    - Sklearn CountVectorizer (NLTK stopwords) -> Multinomial Naive Bayes
    - [Result](#approach1_result)
    
    
- [Approach 2](#approach2): Bag of Words (word **frequencies**)
    - Sklearn CountVectorizer (NLTK stopwords) -> TfidfTransformer (**without IDF**) -> Multinomial Naive Bayes
    - [Result](#approach2_result)
    
    
- [Approach 3](#approach3): Bag of Words (word **frequencies** + inverse document frequencies - TF-IDF)
    - Sklearn CountVectorizer (NLTK stopwords) -> TfidfTransformer (**with IDF**) -> Multinomial Naive Bayes
    - [Result](#approach3_result)
    

- [Approach 4](#approach4): Bag of Words (word **occurrences**)
    - Sklearn CountVectorizer (NLTK stopwords) -> Bernoulli Naive Bayes
    - [Result](#approach4_result)


- [Approach 5](#approach5): Bag of Words (word **frequencies**)
    - Sklearn CountVectorizer (NLTK stopwords + min/maxdf) -> TfidfTransformer (**without IDF**) -> Multinomial Naive Bayes
    - [Result](#approach5_result)

TODO - approaches:
- Use CountVectorizer with custom NLTK processing.
- NLTK preprocessing and numpy arrays. Problem: data does not fit in memory.
- NLTK preprocessing and incremental learning with naive bayes.

<a id='approach1'></a>
# Approach 1

#### Bag Of Words (word occurrences)
Sklearn CountVectorizer (NLTK stopwords) -> Multinomial Naive Bayes

In [171]:
pos_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/pos').resolve()
neg_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/neg').resolve()
pos_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/pos').resolve()
neg_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/neg').resolve()

In [172]:
#Read train data:
pos_train_raw = list()
for file in pos_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_train_raw.append(f.read())
neg_train_raw = list()
for file in neg_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_train_raw.append(f.read())

In [173]:
#Read test data:
pos_test_raw = list()
for file in pos_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_test_raw.append(f.read())

neg_test_raw = list()
for file in neg_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_test_raw.append(f.read())

In [174]:
countvec = CountVectorizer(stop_words=nltk.corpus.stopwords.words('english'), binary=True).fit(pos_train_raw + neg_train_raw)

#Prepare train data:
X_train = countvec.transform(pos_train_raw + neg_train_raw)

n_pos_samples_train = len(pos_train_raw)
n_neg_samples_train = len(neg_train_raw)
y_train = np.concatenate((np.ones((n_pos_samples_train)), np.zeros((n_neg_samples_train))))

#Fit model:
naivebayes = MultinomialNB()
naivebayes.fit(X_train, y_train)

MultinomialNB()

<a id='approach1_result'></a>
#### Result

In [175]:
#Prepare test data:
n_pos_samples_test = len(pos_test_raw)
n_neg_samples_test = len(neg_test_raw)

X_test = countvec.transform(pos_test_raw + neg_test_raw)
y_test = np.concatenate((np.ones((n_pos_samples_test)), np.zeros((n_neg_samples_test))))
y_pred = naivebayes.predict(X_test)

print('Precision: {}'.format(np.round(precision_score(y_test, y_pred), decimals=4)))
print('Recall: {}'.format(np.round(recall_score(y_test, y_pred), decimals=4)))
print('Accuracy: {}'.format(np.round(accuracy_score(y_test, y_pred), decimals=4)))

#Generate list of results for visual comparison:
results = list()
results.append({
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'accuracy': accuracy_score(y_test, y_pred)
})

Precision: 0.872
Recall: 0.7862
Accuracy: 0.8354


<a id='approach2'></a>
# Approach 2

#### Bag Of Words (word frequencies)
Sklearn CountVectorizer (NLTK stopwords) -> TfidfTransformer (without IDF) -> Naive Bayes

In [176]:
pos_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/pos').resolve()
neg_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/neg').resolve()
pos_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/pos').resolve()
neg_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/neg').resolve()

In [177]:
#Read train data:
pos_train_raw = list()
for file in pos_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_train_raw.append(f.read())
neg_train_raw = list()
for file in neg_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_train_raw.append(f.read())

In [178]:
#Read test data:
pos_test_raw = list()
for file in pos_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_test_raw.append(f.read())

neg_test_raw = list()
for file in neg_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_test_raw.append(f.read())

In [179]:
countvec = CountVectorizer(stop_words=nltk.corpus.stopwords.words('english')).fit(pos_train_raw + neg_train_raw)

#Prepare train data:
X_train = countvec.transform(pos_train_raw + neg_train_raw)
#Using term frequency tranformer:
tfidf = TfidfTransformer(use_idf=False).fit(X_train)
X_train = tfidf.transform(X_train)

n_pos_samples_train = len(pos_train_raw)
n_neg_samples_train = len(neg_train_raw)
y_train = np.concatenate((np.ones((n_pos_samples_train)), np.zeros((n_neg_samples_train))))

#Fit model:
naivebayes = MultinomialNB()
naivebayes.fit(X_train, y_train)

MultinomialNB()

<a id='approach2_result'></a>
#### Result

In [180]:
#Prepare test data:
n_pos_samples_test = len(pos_test_raw)
n_neg_samples_test = len(neg_test_raw)

X_test = countvec.transform(pos_test_raw + neg_test_raw)
X_test = tfidf.transform(X_test)
y_test = np.concatenate((np.ones((n_pos_samples_test)), np.zeros((n_neg_samples_test))))
y_pred = naivebayes.predict(X_test)

print('Precision: {}'.format(np.round(precision_score(y_test, y_pred), decimals=4)))
print('Recall: {}'.format(np.round(recall_score(y_test, y_pred), decimals=4)))
print('Accuracy: {}'.format(np.round(accuracy_score(y_test, y_pred), decimals=4)))

results.append({
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'accuracy': accuracy_score(y_test, y_pred)
})

Precision: 0.876
Recall: 0.8134
Accuracy: 0.8492


<a id='approach3'></a>
# Approach 3

#### Bag Of Words (word frequencies + inverse document frequency - IDF)
Sklearn CountVectorizer (NLTK stopwords) -> TfidfTransformer (with IDF) -> Naive Bayes

In [181]:
pos_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/pos').resolve()
neg_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/neg').resolve()
pos_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/pos').resolve()
neg_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/neg').resolve()

In [182]:
#Read train data:
pos_train_raw = list()
for file in pos_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_train_raw.append(f.read())
neg_train_raw = list()
for file in neg_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_train_raw.append(f.read())

In [183]:
#Read test data:
pos_test_raw = list()
for file in pos_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_test_raw.append(f.read())

neg_test_raw = list()
for file in neg_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_test_raw.append(f.read())

In [184]:
countvec = CountVectorizer(stop_words=nltk.corpus.stopwords.words('english')).fit(pos_train_raw + neg_train_raw)

#Prepare train data:
X_train = countvec.transform(pos_train_raw + neg_train_raw)
#Using term frequency tranformer:
tfidf = TfidfTransformer(use_idf=True).fit(X_train)
X_train = tfidf.transform(X_train)

n_pos_samples_train = len(pos_train_raw)
n_neg_samples_train = len(neg_train_raw)
y_train = np.concatenate((np.ones((n_pos_samples_train)), np.zeros((n_neg_samples_train))))

#Fit model:
naivebayes = MultinomialNB()
naivebayes.fit(X_train, y_train)

MultinomialNB()

<a id='approach3_result'></a>
#### Result

In [185]:
#Prepare test data:
n_pos_samples_test = len(pos_test_raw)
n_neg_samples_test = len(neg_test_raw)

X_test = countvec.transform(pos_test_raw + neg_test_raw)
X_test = tfidf.transform(X_test)
y_test = np.concatenate((np.ones((n_pos_samples_test)), np.zeros((n_neg_samples_test))))
y_pred = naivebayes.predict(X_test)

print('Precision: {}'.format(np.round(precision_score(y_test, y_pred), decimals=4)))
print('Recall: {}'.format(np.round(recall_score(y_test, y_pred), decimals=4)))
print('Accuracy: {}'.format(np.round(accuracy_score(y_test, y_pred), decimals=4)))

results.append({
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'accuracy': accuracy_score(y_test, y_pred)
})

Precision: 0.8687
Recall: 0.7875
Accuracy: 0.8342


<a id='approach4'></a>
# Approach 4

#### Bag Of Words (word occurrences)
Sklearn CountVectorizer (NLTK stopwords) -> Bernoulli Naive Bayes

In [186]:
pos_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/pos').resolve()
neg_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/neg').resolve()
pos_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/pos').resolve()
neg_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/neg').resolve()

In [187]:
#Read train data:
pos_train_raw = list()
for file in pos_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_train_raw.append(f.read())
neg_train_raw = list()
for file in neg_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_train_raw.append(f.read())

In [188]:
#Read test data:
pos_test_raw = list()
for file in pos_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_test_raw.append(f.read())

neg_test_raw = list()
for file in neg_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_test_raw.append(f.read())

In [189]:
countvec = CountVectorizer(stop_words=nltk.corpus.stopwords.words('english'), binary=True).fit(pos_train_raw + neg_train_raw)

#Prepare train data:
X_train = countvec.transform(pos_train_raw + neg_train_raw)

n_pos_samples_train = len(pos_train_raw)
n_neg_samples_train = len(neg_train_raw)
y_train = np.concatenate((np.ones((n_pos_samples_train)), np.zeros((n_neg_samples_train))))

#Fit model:
naivebayes = BernoulliNB()
naivebayes.fit(X_train, y_train)

BernoulliNB()

<a id='approach4_result'></a>
#### Result

In [190]:
#Prepare test data:
n_pos_samples_test = len(pos_test_raw)
n_neg_samples_test = len(neg_test_raw)

X_test = countvec.transform(pos_test_raw + neg_test_raw)
y_test = np.concatenate((np.ones((n_pos_samples_test)), np.zeros((n_neg_samples_test))))
y_pred = naivebayes.predict(X_test)

print('Precision: {}'.format(np.round(precision_score(y_test, y_pred), decimals=4)))
print('Recall: {}'.format(np.round(recall_score(y_test, y_pred), decimals=4)))
print('Accuracy: {}'.format(np.round(accuracy_score(y_test, y_pred), decimals=4)))

results.append({
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'accuracy': accuracy_score(y_test, y_pred)
})

Precision: 0.8734
Recall: 0.7442
Accuracy: 0.8182


<a id='approach5'></a>
# Approach 5

#### Bag Of Words (word frequencies)
Sklearn CountVectorizer (NLTK stopwords + min/maxdf) -> TfidfTransformer (**without IDF**) -> Multinomial Naive Bayes

In [191]:
pos_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/pos').resolve()
neg_train_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/train/neg').resolve()
pos_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/pos').resolve()
neg_test_raw_folder = (Path('..') / 'data' / 'aclImdb_v1/aclImdb/test/neg').resolve()

In [192]:
#Read train data:
pos_train_raw = list()
for file in pos_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_train_raw.append(f.read())
neg_train_raw = list()
for file in neg_train_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_train_raw.append(f.read())

In [193]:
#Read test data:
pos_test_raw = list()
for file in pos_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        pos_test_raw.append(f.read())

neg_test_raw = list()
for file in neg_test_raw_folder.iterdir():
    with open(file, 'r', encoding='utf-8') as f:
        neg_test_raw.append(f.read())

In [194]:
countvec = CountVectorizer(stop_words=nltk.corpus.stopwords.words('english'), binary=False, 
                          min_df=0.005, max_df=0.98).fit(pos_train_raw + neg_train_raw)

#Prepare train data:
X_train = countvec.transform(pos_train_raw + neg_train_raw)
tfidf = TfidfTransformer(use_idf=False).fit(X_train)
X_train = tfidf.transform(X_train)

n_pos_samples_train = len(pos_train_raw)
n_neg_samples_train = len(neg_train_raw)
y_train = np.concatenate((np.ones((n_pos_samples_train)), np.zeros((n_neg_samples_train))))

#Fit model:
naivebayes = BernoulliNB()
naivebayes.fit(X_train, y_train)

BernoulliNB()

<a id='approach5_result'></a>
#### Result

In [195]:
#Prepare test data:
n_pos_samples_test = len(pos_test_raw)
n_neg_samples_test = len(neg_test_raw)

X_test = countvec.transform(pos_test_raw + neg_test_raw)
X_test = tfidf.transform(X_test)
y_test = np.concatenate((np.ones((n_pos_samples_test)), np.zeros((n_neg_samples_test))))
y_pred = naivebayes.predict(X_test)

print('Precision: {}'.format(np.round(precision_score(y_test, y_pred), decimals=4)))
print('Recall: {}'.format(np.round(recall_score(y_test, y_pred), decimals=4)))
print('Accuracy: {}'.format(np.round(accuracy_score(y_test, y_pred), decimals=4)))

results.append({
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'accuracy': accuracy_score(y_test, y_pred)
})

Precision: 0.8544
Recall: 0.8395
Accuracy: 0.8482


Custom preprocessing with NLTK approach:

In [52]:
def preprocess(text, language='english'):
    #Lower case -> remove punctuation -> tokenization -> remove stopwords
    out = text.lower()
    out = "".join([char for char in out if char not in string.punctuation])
    out = nltk.word_tokenize(out)
    out = [word for word in out if word not in nltk.corpus.stopwords.words(language)]
    stemer = PorterStemmer()
    out = [stemer.stem(word) for word in out]
    return out

In [58]:
#Preprocessing to generate tokens:
pos_train = list()
for text in pos_train_raw:
    pos_train.append(preprocess(text))
neg_train = list()
for text in neg_train_raw:
    neg_train.append(preprocess(text))

In [131]:
#Create the vocab set:
vocab = set()
for tokens in pos_train:
    for token in tokens:
        vocab.add(token)
for tokens in neg_train:
    for token in tokens:
        vocab.add(token)

#Generate the mapping (word -> position in X), (position in X -> word):
position_word = dict()
word_position = dict()
count = 0
for word in vocab:
    position_word[count] = word
    word_position[word] = count
    count += 1

In [41]:
gc.collect()

384

In [155]:
#Initialize X and y:
n_pos_samples = len(pos_train)
n_neg_samples = len(neg_train)
n_vocab = len(vocab)
X = np.zeros((n_pos_samples + n_neg_samples, n_vocab), dtype=np.int8)
y = np.concatenate((np.ones((n_pos_samples, 1)), np.zeros((n_neg_samples, 1))))

#Generate the X matrix with word counts:
for idx, tokens in enumerate(pos_train + neg_train):
    word_counts = Counter(tokens)
    for word in word_counts.keys():
        X[idx, word_position[word]] = word_counts[word]

#TODO - option for word occurrences
#TODO - option for TFIDF

In [165]:
from scipy.sparse import csr_matrix

In [258]:
csr_matrix(np.zeros((n_pos_samples + n_neg_samples, n_vocab), dtype=np.int8))

MemoryError: Unable to allocate 2.16 GiB for an array with shape (25000, 92756) and data type int8

In [267]:
np.zeros((n_pos_samples + n_neg_samples, 10000), dtype=np.int8)

MemoryError: Unable to allocate 238. MiB for an array with shape (25000, 10000) and data type int8

In [162]:
naivebayes = MultinomialNB()

In [237]:
naivebayes.fit(X, y)

NameError: name 'X' is not defined

In [224]:
pos_train_tokens = list()
for tokens in pos_train:
    string = ''
    for token in tokens:
        string = string + token + ' '
    string = string.strip()
    pos_train_tokens.append(string)

neg_train_tokens = list()
for tokens in neg_train:
    string = ''
    for token in tokens:
        string = string + token + ' '
    string = string.strip()
    neg_train_tokens.append(string)

In [251]:
countvec = CountVectorizer()
countvec.fit(pos_train_tokens + neg_train_tokens)

CountVectorizer()

In [None]:
def tokenize(text):
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    return tokens

countvec = CountVectorizer(tokenizer=tokenize, stop_words=nltk.corpus.stopwords.words('english'))
countvec.fit(pos_train_raw + neg_train_raw)

  'stop_words.' % sorted(inconsistent))


In [312]:
countvec.transform(pos_train_raw + neg_train_raw).shape

(25000, 74704)

In [284]:
countvec.transform(pos_train_tokens + neg_train_tokens).shape

(25000, 92345)

In [285]:
teste = list()
for word in countvec.vocabulary_.keys():
    if word not in vocab:
        print(word)
        teste.append(word)

it
enemies
homosexuality
nights
ll
credits
standards
fantastic
wouldn
grotesques
at
cutting
onli
props
swings
crocodiles
ones
conventions
nature
films
leads
herself
tari
society
filmed
violence
picture
russwil
beckinsale
suitors
the
didn
kidnapped
imagery
lik
coincidentally
since
ambiguity
knowing
ambiguous
peaceful
vance
1940th
lives
for
than
adequate
ordered
oherlihy
progressive
filipinos
reactions
again
springs
greatness
biographies
surprise
direction
family
1987al
particulare
isn
apache
illinois
1940bi
peli
upside
or
epochs
times
virtuously
pac
popeye
rivals
won
orna
bubblebr
simple
obfuscated
television
doesn
dramas
characters
assembled
officials
sso
briainbr
himself
such
tdefinit
wasn
49484
13516
1697
oppression
looks
079
images
ain
history
boheme
medical
episode
future
lookalike
kids
wel
parenting
requires
travesty
brant
polarbear
unflinchingly
mentions
hasn
possessing
veri
impossible
galaxy
tried
crazy
people
brilli
melville
anthology
nudity
cyborgs
features
ending
actingthor
z

In [281]:
teste = list()
for word in vocab:
    if word not in countvec.vocabulary_:
        print(word)
        teste.append(word)

9
bubblebr
forest¨1936
conventionsa
wellpac
lot´
bg´
you´v
a
character´
parentingwher
endingin
loserto
maybe´
even
§1000
shorti
andalthough
ladybug´
stefan´
moviejust
moviebecaus
£399
¡§rocket
£200
alonzo
£250
lugosiyet
¨thousand
i´d
thirdrat
don´t
¨jurassik
»
we´ll
tribesmenthu
it´d
¨zane
ambiguitythi
scream
k
what´
creditsand
projectbut
70£
youthi
pitt´
polarbearand
old
tamer¨
nobudget
deadonli
dalmatian
beckinsale´
¡§just
4
£9
fulci´
§12
method
you´ll
¾
them
big
2004¡¨
wellmostli
problembrilli
guyhi
futuremor
nearand
butmom
movieand
kidnappedin
brant´
£1775
hima
womenadela
oherlihyto
deathit
insignific
price…but
wagoncomplet
“
attictherel
riemann´
director¡¦
b
w
token
when
99¢
doesn´t
morganbut
featuresthat
zombiesnatch
pointjust
fear
enemiesth
upsidedownor
60´
orna
rivalskeaton
cgiwhich
macarthurlik
episodeher
didn´t
hammi
¨petrifi
¡§at
caus
3
obfuscatedthread
he´
atrociousth
charactersolivia
binso
hasn´tbr
wholik
ta