In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai import *
from fastai.text import *
from fastai.vision import *

In [3]:
import sklearn.feature_extraction.text as sklearn_text

In [4]:
?? URLs

[0;31mInit signature:[0m  [0mURLs[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mURLs[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"Global constants for dataset and model URLs."[0m[0;34m[0m
[0;34m[0m    [0mLOCAL_PATH[0m [0;34m=[0m [0mPath[0m[0;34m.[0m[0mcwd[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0mS3[0m [0;34m=[0m [0;34m'https://s3.amazonaws.com/fast-ai-'[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0mS3_IMAGE[0m    [0;34m=[0m [0;34mf'{S3}imageclas/'[0m[0;34m[0m
[0;34m[0m    [0mS3_IMAGELOC[0m [0;34m=[0m [0;34mf'{S3}imagelocal/'[0m[0;34m[0m
[0;34m[0m    [0mS3_NLP[0m      [0;34m=[0m [0;34mf'{S3}nlp/'[0m[0;34m[0m
[0;34m[0m    [0mS3_COCO[0m     [0;34m=[0m [0;34mf'{S3}coco/'[0m[0;34m[0m
[0;34m[0m    [0mS3_MODEL[0m    [0;34m=[0m [0;34mf'{S3}modelzoo/'[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0;31m# main da

It is always good to start working on a sample of your data before you use the full dataset-- this allows for quicker computations as you debug and get your code working. For IMDB, there is a sample dataset already available:

In [5]:
# trn_term_doc = scipy.sparse.load_npz("trn_term_doc.npz")
# val_term_doc = scipy.sparse.load_npz("val_term_doc.npz")

In [6]:
def get_term_doc_matrix(label_list, vocab_len):
    j_indices = []
    indptr = []
    values = []
    indptr.append(0)

    for i, doc in enumerate(label_list):
        feature_counter = Counter(doc.data)
        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))
        
#     return (values, j_indices, indptr)

    return scipy.sparse.csr_matrix((values, j_indices, indptr),
                                   shape=(len(indptr) - 1, vocab_len),
                                   dtype=int)

In [7]:
path = untar_data(URLs.IMDB)
path.ls()

[PosixPath('/Users/lakshay/.fastai/data/imdb/test'),
 PosixPath('/Users/lakshay/.fastai/data/imdb/tmp_clas'),
 PosixPath('/Users/lakshay/.fastai/data/imdb/imdb.vocab'),
 PosixPath('/Users/lakshay/.fastai/data/imdb/unsup'),
 PosixPath('/Users/lakshay/.fastai/data/imdb/README'),
 PosixPath('/Users/lakshay/.fastai/data/imdb/tmp_lm'),
 PosixPath('/Users/lakshay/.fastai/data/imdb/train')]

In [8]:
(path/'train').ls()

[PosixPath('/Users/lakshay/.fastai/data/imdb/train/neg'),
 PosixPath('/Users/lakshay/.fastai/data/imdb/train/pos'),
 PosixPath('/Users/lakshay/.fastai/data/imdb/train/unsupBow.feat'),
 PosixPath('/Users/lakshay/.fastai/data/imdb/train/labeledBow.feat')]

In [9]:
reviews_full = (TextList.from_folder(path)
             #grab all the text files in path
             .split_by_folder(valid='test')
             #split by train and valid folder (that only keeps 'train' and 'test' so no need to filter)
             .label_from_folder(classes=['neg', 'pos']))
             #label them all with their folders

In [10]:
len(reviews_full.train), len(reviews_full.valid)

(25000, 25000)

We will store the vocab in a variable `v` since we will be using it frequently:

In [11]:
v = reviews_full.vocab

In [12]:
v.itos[100:110]

['bad',
 'people',
 'will',
 'other',
 'also',
 'into',
 'first',
 'because',
 'great',
 'how']

In [13]:
%%time
val_term_doc = get_term_doc_matrix(reviews_full.valid.x, len(reviews_full.vocab.itos))

CPU times: user 4.01 s, sys: 87.9 ms, total: 4.1 s
Wall time: 4.11 s


In [14]:
%%time
trn_term_doc = get_term_doc_matrix(reviews_full.train.x, len(reviews_full.vocab.itos))

CPU times: user 4.12 s, sys: 147 ms, total: 4.27 s
Wall time: 4.25 s


In [43]:
x = trn_term_doc
y = reviews_full.train.y


val_y = reviews_full.valid.y.items

In [16]:
x

<25000x38456 sparse matrix of type '<class 'numpy.int64'>'
	with 3716267 stored elements in Compressed Sparse Row format>

In [18]:
positive = y.c2i['pos']
negative = y.c2i['neg']

p1 = np.squeeze(np.asarray(x[y.items==positive].sum(0)))
p0 = np.squeeze(np.asarray(x[y.items==negative].sum(0)))

In [19]:
p1[:20]

array([ 28449,      0,  12500,      0,      0, 342619,  20464,   1338,      7, 173122, 138001, 143763,  89570,  83404,
        76828,  66715,  58510,  47896,  50177,  40451], dtype=int64)

In [20]:
def word_ratio(word):
    return( p0[v.stoi[word]]/p1[v.stoi[word]])

In [37]:
pr1 = (p1+1) / ((y.items==positive).sum()+1)
pr0 = (p0+1) / ((y.items==negative).sum()+1)

In [38]:
r = np.log(pr1/pr0)

In [39]:
r[v.stoi['loved']]

1.1563661500586044

In [40]:
#add a bias for terms which are in the oversampled class

b = log((y.items==positive).mean() / (y.items==negative).mean()); b

0.0

In [41]:
preds = (val_term_doc * r + b)>0

In [44]:
(preds==val_y).mean()

0.8084

In [45]:
x = trn_term_doc.sign()
y = reviews_full.train.y

In [47]:
x.todense()[:10,:10]

matrix([[0, 0, 1, 0, ..., 0, 0, 0, 1],
        [0, 0, 1, 0, ..., 1, 0, 0, 1],
        [0, 0, 1, 0, ..., 1, 0, 0, 1],
        [1, 0, 1, 0, ..., 1, 1, 0, 1],
        ...,
        [1, 0, 1, 0, ..., 0, 0, 0, 1],
        [1, 0, 1, 0, ..., 1, 0, 0, 1],
        [0, 0, 1, 0, ..., 1, 0, 0, 1],
        [1, 0, 1, 0, ..., 0, 0, 0, 1]])

In [48]:
trn_term_doc.todense()[:10,:10]

matrix([[ 0,  0,  1,  0, ...,  0,  0,  0,  2],
        [ 0,  0,  1,  0, ...,  4,  0,  0, 12],
        [ 0,  0,  1,  0, ...,  1,  0,  0,  4],
        [ 4,  0,  1,  0, ...,  4,  1,  0, 19],
        ...,
        [ 3,  0,  1,  0, ...,  0,  0,  0, 10],
        [ 2,  0,  1,  0, ...,  1,  0,  0,  4],
        [ 0,  0,  1,  0, ...,  1,  0,  0,  9],
        [ 2,  0,  1,  0, ...,  0,  0,  0, 16]])

In [50]:
negative = y.c2i['neg']
positive = y.c2i['pos']

In [51]:
p1 = np.squeeze(np.asarray(x[y.items==positive]).sum(0))
p0 = np.squeeze(np.asarray(x[y.items==negative]).sum(0))

In [52]:
r = np.log(pr1/pr0)
b = np.log((y.items==positive).mean() / (y.items==negative).mean())

preds = (val_term_doc.sign() @ r + b) > 0

In [53]:
(preds==val_y).mean()

0.8328