In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import fastai
from fastai import *
from fastai.text import *

torch.cuda.device(0)
print(torch.cuda.get_device_name(0))
print(f'pytorch {torch.__version__}, fastai {fastai.__version__}')

%reload_ext autoreload
%autoreload 2
%matplotlib inline

GeForce GTX 1050 Ti
pytorch 1.4.0, fastai 1.0.60


# ULMFiT IMDB Sentiment Analysis 

In [2]:
df = pd.read_csv('data/imdb_reviews.csv', index_col=0)
df['sentiment'] = df.positive.map({True: 1, False:0})
df['text'] = df['review']
df = df.drop(['positive', 'negative', 'review'], axis=1)
print(df.shape)
df.head()

(23000, 2)


Unnamed: 0,sentiment,text
0,0,If this is a 2008 product from one of the bigg...
1,1,I don't give a movie or a show ten very often ...
2,1,This is comedy as it once was and comparing th...
3,1,'How to Lose Friends and Alienate People' is a...
4,1,EARTH (2009) ***1/2 Big screen adaptation of t...


Split into training, validation and hold-out test set (`df_test`)

In [3]:
from sklearn.model_selection import train_test_split

# Split data into training and validation set
df_trn, df_val = train_test_split(df, stratify=df['sentiment'],
                                  test_size=0.2, random_state=1)
print(df_trn.shape, df_val.shape)

(18400, 2) (4600, 2)


In [4]:
%%time
path = Path
path = path('data')
df_trn.to_csv(path / 'imdb_train_sample.csv')
df_val.to_csv(path / 'imdb_val_sample.csv')

Wall time: 831 ms


In [5]:
df_trn

Unnamed: 0,sentiment,text
385,0,"I watched this movie, and hoped for something ..."
4234,0,"Come on, let's get real. The Knights of Christ..."
15666,1,"Very slow-paced, but intricately structured an..."
774,0,"A slow, tedious, and one dimensional movie! Go..."
22061,0,"*SPOILERS INCLUDED*\n\n\n\nWith a title like ""..."
...,...,...
1429,1,I have been a fan of Pushing Daisies since the...
20301,0,If you really have to watch this movie because...
11091,1,Stumbling upon this HBO special late one night...
2679,1,"I hadn't seen this film in probably 35 years, ..."



### Tokenization
The first step of processing we make the texts go through is to split the raw sentences into words, or `tokens`.

In [6]:
%%time
data = TextClasDataBunch.from_csv(path, 'imdb_train_sample.csv', 
                                  text_cols='text', label_cols='sentiment')
data.show_batch()

text,target
xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules,1
"xxbos xxmaj okay , so i 'm not a big video game buff , but was the game xxmaj house of the xxmaj dead really famous enough to make a movie from ? xxmaj sure , they went as far as to actually put in quick video game clips throughout the movie , as though justifying any particular scene of violence , but there are dozens and dozens of games",0
"xxbos xxup anchors xxup aweigh sees two eager young sailors , xxmaj joe xxmaj brady ( xxmaj gene xxmaj kelly ) and xxmaj clarence xxmaj doolittle / xxmaj brooklyn ( xxmaj frank xxmaj sinatra ) , get a special four - day shore leave . xxmaj eager to get to the girls , particularly xxmaj joe 's xxmaj lola , neither xxmaj joe nor xxmaj brooklyn figure on the interruption",1
"xxbos xxmaj prior to this release , xxmaj neil labute had this to say about the 1973 original : "" xxmaj it 's surprising how many people say it 's their favorite soundtrack . i 'm like , come on ! xxmaj you may not like the new one , but if that 's your favorite soundtrack , i do n't know if i * want * you to like",0
"xxbos xxmaj god ! xxmaj zorro has been the the subject of about as many movies as xxmaj tarzan , and probably had about as many actors in the title role . \n \n \n \n xxmaj this xxmaj serial is one of my own personal favourites , and as previously stated , it is one of the xxmaj top 5 xxmaj sound xxmaj serials . xxmaj oddly enough",1


Wall time: 44.4 s


### Numericalization into vocab
* Creating unique tokens for words  
* Top 60,000 used by default - unknown token xxunk used for remainders  
* Special characters are also tokenised (spaces, punctuation, new lines)  
* xxbos is the token for beginning of sentence  

In [7]:
# Top 10 words
data.vocab.itos[:10]

['xxunk',
 'xxpad',
 'xxbos',
 'xxeos',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 'the']

In [8]:
# Example tokenised review
data.train_ds[0][0].text[:200]

'xxbos xxup forbidden xxup planet is one of the best examples of xxmaj hollywood xxup sf films . xxmaj its influence was felt for more than a decade . xxmaj however , certain elements relating to how t'

In [9]:
# Example numerical token mapping onto index
data.train_ds[0][0].data[:10]

array([   2,    6, 4042,    6, 1122,   16,   44,   14,    9,  135], dtype=int64)


### For sentiment analysis we are creating two models
1. A language model data_lm (fine-tuned encoder, no labels)  
2. A text classification model data_clas (with labels)

#### 1. Language Model

* Model AWD_LSTM is pretrained on a processed subset of wikipedia wikitext-103
* This RNN model is trained to predict what the next word in the sequence is
* It has a recurrent structure and a hidden state (updated each time it sees a new word), which contains information about the sentence

In [10]:
# Decrease batchsize if GPU can't handle the load
bs = 24       # range 12 - 48

In [11]:
%%time
# Language Model data
data_lm = TextLMDataBunch.from_df(path, df_trn, df_val)

print('Training and validation shape:\n', df_trn.shape, df_val.shape)
data_lm.show_batch(rows=1)

Training and validation shape:
 (18400, 2) (4600, 2)


idx,text
0,"( if you can call it that ) , i get it . \n \n \n \n xxmaj characters are undeveloped , relationships are n't given enough time to be understood . xxmaj in one scene xxmaj sarah says they wo n't fall in love , and the next time we see her she 's talking about how his death really shook her up because they were so close"


Wall time: 42.8 s


In [12]:
data_lm.show_batch(rows=3)

idx,text
0,"abound in this film . \n \n \n \n xxmaj it 's like someone watched xxmaj boogie xxmaj nights and wrote this part to mimic xxmaj little xxmaj bill . xxmaj even the scene where he "" loses his temper "" is the same as when xxmaj little xxmaj bill shoots his wife , down to the facial expression ( or lack thereof ) . xxmaj yes , xxmaj"
1,"guest appearances all the old casts of these shows . xxmaj by the way , what was wrong with xxup cbs doing this reunion , or an eventual series ? xxmaj was n't that the network that carried the "" xxup mtm "" and "" xxmaj rhoda "" shows ? xxbos xxmaj halloween is the story of a boy who was misunderstood as a child . xxmaj he takes out"
2,""" xxmaj dolls "" which are ran on xxup a.i. and are used as grunt infantry . \n \n \n \n xxmaj the whole point of the series is a metaphysical question that gives xxmaj gundam xxmaj wing an edge over a greater portion of the xxmaj anime that makes it 's way here . xxmaj especially since most anime is adapted from popular magazines , such as xxmaj"


In [13]:
# Transfer learning Model AWD_LSTM pre-trained on WikiText103

learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)

In [14]:
%%time
# Find best learning rate from slope
learn.lr_find()
learn.recorder.plot(suggestion=True, skip_end=15)

epoch,train_loss,valid_loss,accuracy,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


RuntimeError: CUDA out of memory. Tried to allocate 726.00 MiB (GPU 0; 4.00 GiB total capacity; 2.68 GiB already allocated; 243.61 MiB free; 2.71 GiB reserved in total by PyTorch)

In [15]:
# Only the last layer is unfrozen during training
learn.summary()

SequentialRNN
Layer (type)         Output Shape         Param #    Trainable 
RNNDropout           [70, 400]            0          False     
______________________________________________________________________
RNNDropout           [70, 1152]           0          False     
______________________________________________________________________
RNNDropout           [70, 1152]           0          False     
______________________________________________________________________
Linear               [70, 42384]          16,995,984 True      
______________________________________________________________________
RNNDropout           [70, 400]            0          False     
______________________________________________________________________

Total params: 16,995,984
Total trainable params: 16,995,984
Total non-trainable params: 0
Optimized with 'torch.optim.adam.Adam', betas=(0.9, 0.99)
Using true weight decay as discussed in https://www.fast.ai/2018/07/02/adam-weight-decay/ 
Loss fu

In [16]:
# Training/fine-tuning final layer to imdb reviews

learn.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time


RuntimeError: CUDA out of memory. Tried to allocate 726.00 MiB (GPU 0; 4.00 GiB total capacity; 2.30 GiB already allocated; 239.61 MiB free; 2.71 GiB reserved in total by PyTorch)

In [None]:
%%time
# Run until valid_loss comes down to training_loss (past this is overfitting to training set)

learn.fit_one_cycle(3, 1e-3, moms=(0.8,0.7))
learn.recorder.plot_losses()

In [None]:
learn.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))
learn.recorder.plot_losses()


* Now, the encoder is fine tuned to IMDB Reviews
* The encoder can be used to predict the next word in a sentence
* The next step is to remove the final layers of the encoder, and replace them with a classification/regression model

In [None]:
data_lm.train_ds.inner_df.shape

In [None]:
data_lm.valid_ds.inner_df.shape

In [None]:
learn.predict("I really loved the film, the plot was")

In [None]:
learn.predict("I hated the film, the plot was")

In [None]:
text = "The food is good and the staff"
words = 40

print(learn.predict(text, words, temperature=0.75))