# Assignment 2

In [1]:
import os
os.chdir('/home/ubuntu/fastai/courses/dl1/')

In [22]:
# Import Library
import numpy as np
from functools import partial
import pandas as pd

import dill as pickle
import torchtext
import torch
from torch import nn
from torch.nn import functional as F
import spacy
import fastai.nlp
from fastai.learner import fit
from fastai.core import V
from fastai.metrics import accuracy
from fastai.lm_rnn import seq2seq_reg, repackage_var
from fastai.nlp import RNN_Learner
from IPython.display import display
from IPython.lib.display import FileLink

from pathlib import Path

In [3]:
df_train = pd.read_csv('./data/ass2/train.csv')

In [48]:
len(df_train)

5422

In [4]:
df_train.head()

Unnamed: 0,id,text,airline,tweet_location,user_timezone,sentiment
0,0,@JetBlue great flight! Great view! :-) http://...,Delta,,,positive
1,1,"@united they're not, actually. gate agent was ...",United,chicago,,negative
2,2,@AmericanAir No worries they called back 4 hrs...,American,"Dallas, Texas",,negative
3,3,@united thank you. There was one here a few mo...,United,"New York, NY",America/New_York,positive
4,4,@united Brothers luggage was lost on Copa Airl...,United,"Kearney, Nebraska",Central Time (US & Canada),negative


In [5]:
df_test = pd.read_csv('./data/ass2/test.csv')

In [6]:
df_test.head()

Unnamed: 0,id,text,airline,tweet_location,user_timezone
0,0,@USAirways if one with @AmericanAir why can't ...,US Airways,,
1,1,@VirginAmerica You'd think paying an extra $10...,Virgin America,San Diego,Alaska
2,2,"@united according to your DMs, I'm not owed a ...",United,Nottingham,London
3,3,"@USAirways booked an award ticket recently, no...",US Airways,USA,Eastern Time (US & Canada)
4,4,@JetBlue Awesome! #bestairlineever,Delta,NYC,Quito


### Explore data

In [7]:
df_train.sentiment.value_counts(dropna=False)

negative    8242
positive    2145
Name: sentiment, dtype: int64

In [8]:
df_train = df_train.dropna()

In [9]:
df_train = df_train.sample(frac=1, random_state=42)

In [10]:
train_df = df_train[:-1000]
val_df = df_train[-1000:]

In [11]:
print("training size ",train_df.shape)
print("training size ",val_df.shape)

training size  (4422, 6)
training size  (1000, 6)


In [12]:
print("Check training data")
print(train_df.sentiment.value_counts())

Check training data
negative    3408
positive    1014
Name: sentiment, dtype: int64


In [13]:
print("Check validation data")
print(val_df.sentiment.value_counts())

Check validation data
negative    769
positive    231
Name: sentiment, dtype: int64


In [65]:
class DataFrameDataset(torchtext.data.Dataset):

    def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for i, row in df.iterrows():
            label = row.sentiment if not is_test else None
            text = row.text
            examples.append(torchtext.data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)

        if train_df is not None:
            train_data = cls(train_df.copy(), text_field, label_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), text_field, label_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), text_field, label_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [66]:
TEXT_FIELD = torchtext.data.Field(lower=True, tokenize="spacy")
LABEL_FIELD = torchtext.data.Field(sequential=False)

In [61]:
train_ds, val_ds, test_ds = DataFrameDataset.splits(text_field=TEXT_FIELD, 
                                                    label_field=LABEL_FIELD, train_df=train_df, val_df=val_df, test_df=df_test)

In [62]:
TEXT_FIELD.build_vocab(train_ds)

In [63]:
TEXT_FIELD.vocab

<torchtext.vocab.Vocab at 0x7f37761394a8>

In [64]:
LABEL_FIELD.build_vocab(train_ds)

In [65]:
LABEL_FIELD.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index()>,
            {'<unk>': 0, 'negative': 1, 'positive': 2})

In [66]:
next(iter(train_ds)).text

['@americanair',
 'why',
 'am',
 'i',
 'continually',
 'getting',
 'put',
 'on',
 'hold',
 'by',
 'painfully',
 'inexperienced',
 'people',
 'when',
 'calling',
 'your',
 'platinum',
 'desk',
 '?',
 '!']

In [67]:
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train_ds, val_ds, test_ds), batch_sizes=(16, 256, 256), sort_key=lambda x: len(x.text))

In [68]:
train_batchtrain_ba  = next(iter(train_iter))

In [69]:
train_batchtrain_ba .text.shape

torch.Size([36, 16])

In [70]:
PATH = Path('./data/global-sentiment')
PATH.mkdir(exist_ok=True)
bs = 16                        

In [71]:
text_data = fastai.nlp.TextData.from_splits(PATH, (train_ds, val_ds, test_ds), bs=bs, text_name='text', label_name='label')

In [72]:
train_batch, y = next(iter(text_data.trn_dl))

In [73]:
text_data.trn_dl.src

<torchtext.data.iterator.BucketIterator at 0x7f3774ec27f0>

In [74]:
train_batch.shape

torch.Size([36, 16])

In [75]:
train_batch, y = next(iter(text_data.trn_dl))

In [76]:
train_batch.shape

torch.Size([34, 16])

# Using RNN

In [77]:
bs = 64
bptt = 70
em_sz = 200
nh = 500
nl = 3

In [78]:
opt_fn = partial(torch.optim.Adam, betas=(0.7, 0.99))
learner = text_data.get_model(opt_fn, 1500, bptt, emb_sz=em_sz,n_hid=nh, n_layers=nl,
                             dropout=0.1, dropouti=0.4,wdrop=0.5, dropoute=0.05, dropouth=0.3)

In [79]:
learner

SequentialRNN(
  (0): MultiBatchRNN(
    (encoder): Embedding(7748, 200, padding_idx=1)
    (encoder_with_dropout): EmbeddingDropout(
      (embed): Embedding(7748, 200, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDrop(
        (module): LSTM(200, 500)
      )
      (1): WeightDrop(
        (module): LSTM(500, 500)
      )
      (2): WeightDrop(
        (module): LSTM(500, 200)
      )
    )
    (dropouti): LockedDropout(
    )
    (dropouths): ModuleList(
      (0): LockedDropout(
      )
      (1): LockedDropout(
      )
      (2): LockedDropout(
      )
    )
  )
  (1): PoolingLinearClassifier(
    (layers): ModuleList(
      (0): LinearBlock(
        (lin): Linear(in_features=600, out_features=3, bias=True)
        (drop): Dropout(p=0.1)
        (bn): BatchNorm1d(600, eps=1e-05, momentum=0.1, affine=True)
      )
    )
  )
)

In [80]:
learner.reg_fn =partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3

In [81]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2, metrics=[accuracy])

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.399485   0.387968   0.848259  
    1      0.341437   0.275574   0.885938                    
    2      0.205169   0.281269   0.889286                    
    3      0.227581   0.245306   0.903929                    
    4      0.14264    0.237205   0.903125                    
    5      0.075634   0.289302   0.904464                     
    6      0.073863   0.32075    0.902188                     
    7      0.139736   0.307574   0.90567                      
    8      0.09699    0.350185   0.899375                     
    9      0.061942   0.329287   0.905357                     
    10     0.059824   0.350862   0.913839                     
    11     0.040136   0.401867   0.914821                     
    12     0.039077   0.378826   0.907768                     
    13     0.033449   0.410071   0.908929                     
    14     0.037424   0.418922   0.910223                     



[array([0.41892]), 0.9102232142857143]

In [82]:
learner.save_encoder('enc1')

In [83]:
learner.save('model_adam1')

### Run on Test Data

In [84]:
df_test.head()

Unnamed: 0,id,text,airline,tweet_location,user_timezone
0,0,@USAirways if one with @AmericanAir why can't ...,US Airways,,
1,1,@VirginAmerica You'd think paying an extra $10...,Virgin America,San Diego,Alaska
2,2,"@united according to your DMs, I'm not owed a ...",United,Nottingham,London
3,3,"@USAirways booked an award ticket recently, no...",US Airways,USA,Eastern Time (US & Canada)
4,4,@JetBlue Awesome! #bestairlineever,Delta,NYC,Quito


In [107]:
learner.data.test_dl.src.sort = False
learner.data.test_dl.src.sort_within_batch = False
learner.data.test_dl.src.shuffle = False

In [108]:
probs = learner.predict(is_test=True,)

In [109]:
preds = np.argmax(probs, axis=1)

In [110]:
PATH = "data/ass2"
SUBM = f'{PATH}/subm/'
os.makedirs(SUBM, exist_ok=True)

In [111]:
pd.DataFrame({
    'id': df_test['id'],
    'sentiment': [LABEL_FIELD.vocab.itos[p] for p in preds]}).to_csv(f'{SUBM}sub_ass2.csv', index=False)

In [112]:
FileLink(f'{SUBM}sub_ass2.csv')

In [113]:
!kaggle competitions submit -c high-flyers -f data/ass2/subm/sub_ass2.csv -m "Ngo Duy Vu Submission"

Successfully submitted to High Flyers

In [105]:
!pip install update kaggle

Collecting update
  Downloading https://files.pythonhosted.org/packages/9f/c4/dfe8a392edd35cc635c35cd3b20df6a746aacdeb39b685d1668b56bf819b/update-0.0.1-py2.py3-none-any.whl
Collecting style==1.1.0 (from update)
  Downloading https://files.pythonhosted.org/packages/4c/0b/6be2071e20c621e7beb01b86e8474c2ec344a9750ba5315886f24d6e7386/style-1.1.0-py2.py3-none-any.whl
Installing collected packages: style, update
Successfully installed style-1.1.0 update-0.0.1


# Improving Model

In [19]:
#https://www.kaggle.com/crowdflower/first-gop-debate-twitter-sentiment
#https://www.kaggle.com/c/football-sentiment/
#https://www.kaggle.com/crowdflower/twitter-airline-sentiment

In [50]:
fb_df = pd.read_csv('./data/datamining/football/train.csv')
fb_df = fb_df.loc[fb_df['polarity'].isin(["Positive","Negative"])]
fb_df = fb_df[['text', 'polarity']]
fb_df.rename(columns={'polarity': 'sentiment'}, inplace=True)
fb_df

Unnamed: 0,text,sentiment
2,@sport Y el barca que haria sin Messi????? Ni ...,Negative
7,@RoussGame Te as enterado que an robado en la ...,Negative
8,"Qué maravilla que este chico tocado por Dios, ...",Positive
9,@natxinho @jorgeneo @F_Reymundo totalmente! Aq...,Negative
14,@ngomez9 @FirmaEspn @EJerezESPN @kenneth_garay...,Negative
15,¡DE PIE DAMAS Y CABALLEROS! 💃🕴👏\n#HOY se cumpl...,Positive
16,@jotajordi13 Si..pero la liga y la copa os com...,Negative
18,Deportes Cuatro: SOIS UNA MIERDA\n\nEstán tach...,Negative
19,"⚽️🇪🇸| Barça\n\nEl partido de 'Dinho' ese día, ...",Positive
23,¡TREMENDO! 🙌 La brutal exhibición de Koke en e...,Positive


In [55]:
GOP_df = pd.read_csv('./data/datamining/GOP/Sentiment.csv')
GOP_df = GOP_df.loc[GOP_df['sentiment'].isin(["Positive","Negative"])]
GOP_df = GOP_df[['text', 'sentiment']]
GOP_df

Unnamed: 0,text,sentiment
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive
5,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",Positive
6,RT @warriorwoman91: I liked her and was happy ...,Negative
8,Deer in the headlights RT @lizzwinstead: Ben C...,Negative
9,RT @NancyOsborne180: Last night's debate prove...,Negative
10,@JGreenDC @realDonaldTrump In all fairness #Bi...,Negative
11,RT @WayneDupreeShow: Just woke up to tweet thi...,Positive
12,Me reading my family's comments about how grea...,Negative


In [58]:
AL_df = pd.read_csv('./data/datamining/airline/Tweets.csv')
AL_df = AL_df.loc[AL_df['airline_sentiment'].isin(["positive","negative"])]
AL_df = AL_df[['text', 'airline_sentiment']]
AL_df.rename(columns={'airline_sentiment': 'sentiment'}, inplace=True)
AL_df

Unnamed: 0,text,sentiment
1,@VirginAmerica plus you've added commercials t...,positive
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
5,@VirginAmerica seriously would pay $30 a fligh...,negative
6,"@VirginAmerica yes, nearly every time I fly VX...",positive
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",positive
9,"@VirginAmerica it was amazing, and arrived an ...",positive
11,@VirginAmerica I &lt;3 pretty graphics. so muc...,positive
12,@VirginAmerica This is such a great deal! Alre...,positive
13,@VirginAmerica @virginmedia I'm flying your #f...,positive


In [59]:
df_train = df_train[['text', 'sentiment']]
df_train

Unnamed: 0,text,sentiment
10095,@AmericanAir why am I continually getting put ...,negative
9986,@USAirways not happy!! Trying to get home on C...,negative
6117,"@united so you told me to go, knowing what the...",negative
7727,"@AmericanAir Also, I have to wait more than 2 ...",negative
3044,Thanks @united for writing back. To assist you...,negative
8628,@VirginAmerica Thanks for a great flight from ...,positive
8705,@USAirways Everyone on Flight 669 from LAX to ...,negative
4104,@united ours in July. You have ZERO excuses fo...,negative
8847,@united we needed them here asap. Will they ma...,negative
5522,@VirginAmerica sad to learn you no longer fly ...,negative


In [60]:
df_train = pd.concat([df_train, fb_df, GOP_df, AL_df], axis=0)
df_train

Unnamed: 0,text,sentiment
10095,@AmericanAir why am I continually getting put ...,negative
9986,@USAirways not happy!! Trying to get home on C...,negative
6117,"@united so you told me to go, knowing what the...",negative
7727,"@AmericanAir Also, I have to wait more than 2 ...",negative
3044,Thanks @united for writing back. To assist you...,negative
8628,@VirginAmerica Thanks for a great flight from ...,positive
8705,@USAirways Everyone on Flight 669 from LAX to ...,negative
4104,@united ours in July. You have ZERO excuses fo...,negative
8847,@united we needed them here asap. Will they ma...,negative
5522,@VirginAmerica sad to learn you no longer fly ...,negative


In [62]:
df_train.sentiment = df_train.sentiment.replace({'Positive': 'positive', 'Negative': 'negative'})
df_train.sentiment.value_counts(dropna=False)

negative    21990
positive     5905
Name: sentiment, dtype: int64

In [64]:
df_train = df_train.sample(frac=1, random_state=42)
train_df = df_train[:-2000]
val_df = df_train[-2000:]

In [67]:
train_ds, val_ds, test_ds = DataFrameDataset.splits(text_field=TEXT_FIELD, 
                                                    label_field=LABEL_FIELD, train_df=train_df, val_df=val_df, test_df=df_test)

In [68]:
TEXT_FIELD.build_vocab(train_ds)

In [69]:
LABEL_FIELD.build_vocab(train_ds)

In [70]:
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train_ds, val_ds, test_ds), batch_sizes=(16, 256, 256), sort_key=lambda x: len(x.text))

In [71]:
train_batchtrain_ba  = next(iter(train_iter))
PATH = Path('./data/global-sentiment')
PATH.mkdir(exist_ok=True)
bs = 16   

In [72]:
text_data = fastai.nlp.TextData.from_splits(PATH, (train_ds, val_ds, test_ds), bs=bs, text_name='text', label_name='label')

In [73]:
train_batch, y = next(iter(text_data.trn_dl))

In [74]:
bs = 64
bptt = 70
em_sz = 200
nh = 500
nl = 3

In [75]:
opt_fn = partial(torch.optim.Adam, betas=(0.7, 0.99))
learner = text_data.get_model(opt_fn, 1500, bptt, emb_sz=em_sz,n_hid=nh, n_layers=nl,
                             dropout=0.1, dropouti=0.4,wdrop=0.5, dropoute=0.05, dropouth=0.3)

In [76]:
learner.reg_fn =partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3

In [77]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2, metrics=[accuracy])

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.376525   0.356404   0.872277  
    1      0.308557   0.278529   0.897098                    
    2      0.195429   0.294987   0.890804                    
    3      0.223205   0.266696   0.895134                    
    4      0.141248   0.322787   0.893125                    
    5      0.093998   0.340034   0.899152                     
    6      0.053451   0.317433   0.900402                     
    7      0.127379   0.336674   0.900268                    
    8      0.082241   0.39097    0.907098                     
    9      0.094157   0.306266   0.898884                     
    10     0.059137   0.433147   0.896518                     
    11     0.049313   0.487591   0.894777                     
    12     0.040594   0.493022   0.897589                     
    13     0.024843   0.513776   0.897455                     
    14     0.029232   0.4945     0.900402                     



[array([0.4945]), 0.9004017857142858]

In [78]:
learner.data.test_dl.src.sort = False
learner.data.test_dl.src.sort_within_batch = False
learner.data.test_dl.src.shuffle = False

In [79]:
probs = learner.predict(is_test=True,)

In [80]:
preds = np.argmax(probs, axis=1)

In [81]:
PATH = "data/ass2"
SUBM = f'{PATH}/subm/'
os.makedirs(SUBM, exist_ok=True)

In [82]:
pd.DataFrame({
    'id': df_test['id'],
    'sentiment': [LABEL_FIELD.vocab.itos[p] for p in preds]}).to_csv(f'{SUBM}sub_ass2.csv', index=False)

In [83]:
!kaggle competitions submit -c high-flyers -f data/ass2/subm/sub_ass2.csv -m "Ngo Duy Vu Submission"

Successfully submitted to High Flyers