# HW04: Sentiment Analysis

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.model import fit
from fastai.dataset import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle

In [2]:
bs,bptt = 32,35

## Language modeling

### Data

In [3]:
PATH='./data/sentiment/'

df_imdb = pd.read_csv(f'{PATH}imdb_labelled.txt', sep='\t', header=None, names=['text', 'label'])
df_amzn = pd.read_csv(f'{PATH}amazon_cells_labelled.txt', sep='\t', header=None, names=['text', 'label'])
df_yelp = pd.read_csv(f'{PATH}yelp_labelled.txt', sep='\t', header=None, names=['text', 'label'])

df_all = pd.concat([df_imdb, df_amzn, df_yelp])
n=len(df_all);

print(n)
df_imdb.head()

2748


Unnamed: 0,text,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
os.makedirs(f'{PATH}trn/yes', exist_ok=True)
os.makedirs(f'{PATH}val/yes', exist_ok=True)
os.makedirs(f'{PATH}trn/no', exist_ok=True)
os.makedirs(f'{PATH}val/no', exist_ok=True)
os.makedirs(f'{PATH}all/trn', exist_ok=True)
os.makedirs(f'{PATH}all/val', exist_ok=True)
os.makedirs(f'{PATH}models', exist_ok=True)

In [5]:
for (i,(_,r)) in enumerate(df_all.iterrows()):
    dset = 'trn' if random.random()>0.1 else 'val'
    open(f'{PATH}all/{dset}/{i}.txt', 'w').write(r['text'])

In [6]:
for (i,(_,r)) in enumerate(df_imdb.iterrows()):
    lbl = 'yes' if r.label else 'no'
    dset = 'trn' if random.random()>0.1 else 'val'
    open(f'{PATH}{dset}/{lbl}/{i}.txt', 'w').write(r['text'])

In [7]:
from spacy.symbols import ORTH

my_tok = spacy.load('en')
# my_tok.tokenizer.add_special_case('<SUMM>', [{ORTH: '<SUMM>'}])

def my_spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(x)]

In [8]:
TEXT = data.Field(lower=True, tokenize=my_spacy_tok)
FILES = dict(train='trn', validation='val', test='val')
md = LanguageModelData.from_text_files(f'{PATH}all/', TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)
pickle.dump(TEXT, open(f'{PATH}models/TEXT.pkl','wb'))

In [9]:
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(37, 465, 1, 42944)

In [10]:
TEXT.vocab.itos[:12]

['<unk>', '<pad>', '<eos>', '.', 'the', ',', 'and', 'i', 'a', ' ', 'it', 'is']

In [11]:
' '.join(md.trn_ds[0].text[:150])

"so mediocre in every aspect that it just becomes a dull , uninteresting mess , this is one of the most forgettable movies i 've seen .   <eos> i checked out this place a couple years ago and was not impressed . <eos> probably not in a hurry to go back . <eos> star trek v the final frontier is the worst in the series .   <eos> ordered burger rare came in we 'll done . <eos> nine out of ten for a truly lovely film .   <eos> john garfield , ann revere , lilli plmer , william conrad , canada lee ... and filmed by one of the greatest cinematographers to ever grace the screen .. james wong howe .   <eos> at least think to refill my water before i struggle to wave you over for 10 minutes . <eos> reversible plug works great ."

### Train

In [12]:
em_sz = 200
nh = 500
nl = 3
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [13]:
learner = md.get_model(opt_fn, em_sz, nh, nl,
    dropout=0.05, dropouth=0.1, dropouti=0.05, dropoute=0.02, wdrop=0.2)
# dropout=0.4, dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5
#                dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3

In [14]:
learner.fit(3e-3, 1, wds=1e-6)

epoch      trn_loss   val_loss   
    0      4.454939   4.012359  



[4.012359]

In [15]:
learner.fit(3e-3, 3, wds=1e-6, cycle_len=1, cycle_mult=2)

epoch      trn_loss   val_loss   
    0      4.038786   3.842084  
    1      3.938289   3.662227  
    2      3.809719   3.592     
    3      3.728144   3.461488  
    4      3.620932   3.37203   
    5      3.518939   3.312052  
    6      3.444068   3.316622  



[3.3166218]

In [16]:
learner.save_encoder('adam2_enc')

keep on running this block until you get a good trn and val loss that is not overfitting

In [20]:
learner.fit(3e-3, 1, wds=1e-6, cycle_len=5, cycle_save_name='adam3_10')

epoch      trn_loss   val_loss   
    0      2.705318   2.911269  
    1      2.65063    2.820654  
    2      2.570236   2.768576  
    3      2.477256   2.779352  
    4      2.376802   2.741787  



[2.741787]

In [21]:
learner.save_encoder('adam3_10_enc')

### Test

In [22]:
def proc_str(s): return TEXT.preprocess(TEXT.tokenize(s))
def num_str(s): return TEXT.numericalize([proc_str(s)])

In [23]:
m=learner.model

In [24]:
s="""very, very slow-moving, aimless movie"""

In [25]:
def sample_model(m, s, l=50):
    t = num_str(s)
    m[0].bs=1
    m.eval()
    m.reset()
    res,*_ = m(t)
    print('...', end='')

    for i in range(l):
        n=res[-1].topk(2)[1]
        n = n[1] if n.data[0]==0 else n[0]
        word = TEXT.vocab.itos[n.data[0]]
        print(word, end=' ')
        if word=='<eos>': break
        res,*_ = m(n[0].unsqueeze(0))

    m[0].bs=bs

In [26]:
sample_model(m,s)

....   <eos> 

### Sentiment

In [27]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl','rb'))

In [28]:
class ReviewDataset(torchtext.data.Dataset):
    def __init__(self, path, text_field, label_field, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for label in ['yes', 'no']:
            for fname in glob(os.path.join(path, label, '*.txt')):
                with open(fname, 'r') as f: text = f.readline()
                examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex): return len(ex.text)
    
    @classmethod
    def splits(cls, text_field, label_field, root='.data',
               train='train', test='test', **kwargs):
        return super().splits(
            root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)

In [29]:
REV_LABEL = data.Field(sequential=False)
splits = ReviewDataset.splits(TEXT, REV_LABEL, PATH, train='trn', test='val')

In [30]:
md2 = TextData.from_splits(PATH, splits, bs)

In [31]:
#            dropout=0.3, dropouti=0.4, wdrop=0.3, dropoute=0.05, dropouth=0.2)

In [32]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

def prec_at_6(preds,targs):
    precision, recall, _ = precision_recall_curve(targs==2, preds[:,2])
    print(recall[precision>=0.6][0])
    return recall[precision>=0.6][0]

In [33]:
# dropout=0.4, dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl, 
           dropout=0.1, dropouti=0.65, wdrop=0.5, dropoute=0.1, dropouth=0.3)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
m3.clip=25.

In [34]:
m3.load_encoder(f'adam3_10_enc')
lrs=np.array([1e-4,1e-3,1e-2])

In [35]:
m3.freeze_to(-1)
m3.fit(lrs/2, 1, metrics=[accuracy])
m3.unfreeze()
m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

epoch      trn_loss   val_loss   accuracy   
    0      1.163582   1.09133    0.364583  



epoch      trn_loss   val_loss   accuracy   
    0      1.148581   1.012145   0.536458  



[1.0121454, 0.5364583333333334]

In [36]:
m3.fit(lrs, 2, metrics=[accuracy], cycle_len=4, cycle_save_name='imdb2')

epoch      trn_loss   val_loss   accuracy   
    0      1.129385   1.091952   0.333333  
    1      1.100783   1.036172   0.546875  
    2      1.07859    0.908407   0.682292  
    3      1.063179   0.891331   0.697917  
    4      1.061997   0.911527   0.692708  
    5      1.041195   0.995657   0.536458  
    6      1.008784   0.827958   0.703125  
    7      0.973858   0.811779   0.75      



[0.8117791, 0.75]

In [37]:
prec_at_6(*m3.predict_with_targs())

0.9777777777777777


0.9777777777777777

In [38]:
m3.fit(lrs, 4, metrics=[accuracy], cycle_len=2, cycle_save_name='imdb2')

epoch      trn_loss   val_loss   accuracy   
    0      0.976724   0.8195     0.734375  
    1      0.970603   0.808093   0.708333  
    2      0.969395   0.703919   0.828125  
    3      0.924948   0.664096   0.854167  
    4      0.927131   0.697051   0.838542  
    5      0.900092   0.647931   0.848958  
    6      0.889584   0.656233   0.833333  
    7      0.856229   0.594734   0.885417  



[0.59473425, 0.8854166666666666]

In [39]:
prec_at_6(*m3.predict_with_targs())

0.9888888888888889


0.9888888888888889