In [79]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import string, re
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


To get the dataset, in your terminal run the following commands:

wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

gunzip aclImdb_v1.tar.gz

tar -xvf aclImdb_v1.tar

In [55]:
PATH='data/aclImdb/'
names = ['neg','pos']

!ls $PATH/train/pos | head -n 10

0_9.txt
10000_8.txt
10001_10.txt
10002_7.txt
10003_8.txt
10004_8.txt
10005_7.txt
10006_7.txt
10007_7.txt
10008_7.txt


In [56]:
trn,trn_y = texts_labels_from_folders(f'{PATH}train',names)
val,val_y = texts_labels_from_folders(f'{PATH}test',names)

### Bag of words

In [82]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
veczr = CountVectorizer(tokenizer=lambda s: re_tok.sub(r' \1 ', s).split())

In [83]:
trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)

In [91]:
vocab = veczr.get_feature_names();

In [92]:
len(vocab)

75132

### Naive Bayes

We define the log-count ratio $r$ for each word $f$:

$r = \log \frac{\text{ratio of feature $f$ in positive documents}}{\text{ratio of feature $f$ in negative documents}}$

where ratio of feature $f$ in positive documents is the number of times a positive document has a feature divided by the number of positive documents.

In [111]:
def pr(y_i):
    # x - document matrix
    p = x[y==y_i].sum(0)
    # probability of occuring of each word in pos (y_i == 1) or neg documents (y_i == 0)
    return (p+1) / ((y==y_i).sum()+1)

x=trn_term_doc
y=trn_y

# log - count ratio
r = np.log(pr(1)/pr(0))
# bias
b = np.log((y==1).mean() / (y==0).mean())

In [116]:
# Naive Bayes formula
pre_preds = val_term_doc @ r.T + b

preds = pre_preds.T>0
(preds==val_y).mean()

0.81656

In [117]:
# binarized Naive Bayes
x=trn_term_doc.sign()
r = np.log(pr(1)/pr(0))

pre_preds = val_term_doc.sign() @ r.T + b
preds = pre_preds.T>0
(preds==val_y).mean()

0.83016

## Logistic Regression

In [122]:
m = LogisticRegression(C=0.01, dual=True)
m.fit(x, y)
preds = m.predict(val_term_doc)
(preds==val_y).mean()

0.85128

In [123]:
m = LogisticRegression(C=0.01, dual=True)
m.fit(trn_term_doc.sign(), y)
preds = m.predict(val_term_doc.sign())
(preds==val_y).mean()

0.8798