# IMDB sentiment prediction with pytorch

I'm reading the material in the keras book but implementing in pytorch.

I'm using [polarity data set v2.0](https://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz) to predict binary: positive or negative review. The data uncompressed it looks like:

```
data/review_polarity/txt_sentoken/
├── neg
│   ├── cv000_29416.txt
│   ├── cv001_19502.txt
│   ├── cv002_17424.txt
│   ├── cv003_12683.txt
│   ├── cv004_12641.txt
...
```

Instead of trying to learn the ancillary package torchtext, I'm going to use SpaCy to tokenize.

## Support code

In [35]:
import codecs
import os
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def filelist(root):
    """Return a fully-qualified list of filenames under root directory; sort names alphabetically."""
    allfiles = []
    for path, subdirs, files in os.walk(root):
        for name in files:
            allfiles.append(os.path.join(path, name))
    return sorted(allfiles)

def get_text(filename:str):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().
    """
    f = codecs.open(filename, encoding='latin-1', mode='r')
    s = f.read()
    f.close()
    return s

def load_docs(docs_dirname:str):
    """
    Load all .txt files under docs_dirname and return a list of doc strings, one per doc.
    Ignore empty and non ".txt" files.
    """
    docs = []
    for filename in filelist(docs_dirname):#[0:600]:
        if not filename.endswith(".txt"): continue
        # print(f"Process {filename}")
        filetext = get_text(filename)
        if len(filetext)==0: continue
        docs.append( compress_whitespace(filetext) )
    return docs

def words(text:str):
    """
    Given a string, return a list of words normalized as follows.
    Split the string to make words first by using regex compile() function
    and string.punctuation + '0-9\\r\\t\\n]' to replace all those
    char with a space character.
    Split on space to get word list.
    Ignore words < 3 char long.
    Lowercase all words
    Remove English stop words
    """
    ctrl_chars = '\x00-\x1f'
    regex = re.compile(r'[' + ctrl_chars + string.punctuation + '0-9\r\t\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words = nopunct.split(" ")
    words = [w for w in words if len(w) > 2]  # ignore a, an, to, at, be, ...
    words = [w.lower() for w in words]
    words = [w for w in words if w not in ENGLISH_STOP_WORDS]
    return words

def compress_whitespace(s): # collapse things like "\n   \t  " with " "
    return re.sub(r"(\s+)", ' ', s)

## Load movie reviews

In [36]:
import spacy

In [141]:
neg_dirname = "data/review_polarity/txt_sentoken/neg"
pos_dirname = "data/review_polarity/txt_sentoken/pos"
negdocs = load_docs(neg_dirname)
posdocs = load_docs(pos_dirname)
n_negdocs = len(negdocs)
n_posdocs = len(posdocs)
n_negdocs, n_posdocs

(1000, 1000)

In [142]:
nlp = spacy.load("en_core_web_sm") # When I use plain English() it doesn't seem to give POS info

In [143]:
#neg_tokenized = [list(nlp(d)) for d in negdocs] # super slow!
#pos_tokenized = [list(nlp(d)) for d in posdocs]

neg_tokenized = [words(d) for d in negdocs] # much faster!
pos_tokenized = [words(d) for d in posdocs]

In [164]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import numpy as np

vocab_size = 500
all = []
for tokens in neg_tokenized + pos_tokenized:
    all.extend([w for w in tokens])
c = Counter(all)
V = c.most_common(vocab_size)
word_to_idx = dict(V)
V = sorted(word_to_idx.keys())

In [None]:
for neg_tokenized

In [149]:
X = torch.tensor( [[word_to_idx[w] for w in doc_tokens if w in V] for doc_tokens in neg_tokenized+pos_tokenized] )
y = np.array([0]*n_negdocs + [1]*n_posdocs).reshape(-1,1)

ValueError: expected sequence of length 4 at dim 1 (got 3)

In [None]:
X.shape

## Build simple sentiment learner / predictor

In [167]:
import torch
import torch.nn as nn

class Sentiment(nn.Module):
    def __init__(self, vocab_size, nfactors):
        super(Sentiment, self).__init__()
#         self.embedding = 
        self.layers = nn.Sequential(
            nn.Embedding(vocab_size, nfactors),
            nn.Linear(vocab_size*nfactors,128), # 128 neurons
            nn.ReLU(),
            nn.Linear(nfactors,1)
        )
        
    def forward(self, x):
        print(x.shape)
        output = self.layers(x)
        return torch.sigmoid(output)

In [168]:
def train(model, X, y, epochs=350, test_size=0.20, learning_rate = 0.002, print_every=30, loss_fn=nn.BCELoss()):
    n = len(X)
    shuffle_idx = np.random.permutation(range(n))
    X = X[shuffle_idx]
    y = y[shuffle_idx]
    n_valid = int(n*test_size)
    n_train = n - n_valid
    X_train, X_valid = X[0:n_train], X[n_train:]
    y_train, y_valid = y[0:n_train], y[n_train:]
    print(f"{len(X_train)} training and {len(X_valid)} test records")

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.00000002) # model.parameters is just [U,M]

    history = []

    for t in range(epochs):
        y_pred = model(X_train)
        loss = loss_fn(y_pred, y_train)

        optimizer.zero_grad()
        loss.backward() # autograd computes U.grad and M.grad
        optimizer.step()

    history = torch.tensor(history)
    return model, history

In [169]:
nfactors = 64
model = Sentiment(vocab_size, nfactors)
print(model)
train(model, torch.tensor(X), torch.tensor(y), epochs=5, print_every=1)

Sentiment(
  (layers): Sequential(
    (0): Embedding(500, 64)
    (1): Linear(in_features=32000, out_features=128, bias=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)


ValueError: expected sequence of length 155 at dim 1 (got 62)