# Part 2. Twittern Airlines sentiment Analysis 

In [57]:
from __future__ import print_function
import sys
%matplotlib inline
import pandas as pd
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
import re
import random
import pickle
import torch
import copy
from torchtext import data
from torchtext import datasets
from torch.optim import Adam, lr_scheduler
import torch.optim as optim

### Dataloading and preprocessing

In [58]:
## new dataframe with labels, text, sentiment label only
df= pd.read_csv("text_air.csv",index_col=[0]) 

In [76]:
text_token= data.Field(tokenize='spacy')
sentiment_label = data.LabelField(dtype=torch.float)

df = data.TabularDataset(path='text_air.csv',
                        format='csv', 
                        fields=[('Unnamed', None),("text",text_token),\
                                ("airline_sentiment",sentiment_label)],
                        skip_header=True)

########## split the data into train, valid and test sample
train_df, test_data = df.split(random_state=random.seed(19))
train_data, valid_data = train_df.split(random_state=random.seed(10))

###########Building the vocab
from torchtext import vocab
text_token.build_vocab(train_data, valid_data, max_size=100000, vectors="glove.6B.100d")
sentiment_label.build_vocab(train_data)

Implementing the average and max pool 2d. The word is embedded into a 2-dimensional grid, 
where the words are along one axis and the dimensions of the word embeddings are along the other. 

### Model definition

In [77]:
import torch.nn as nn
import torch.nn.functional as F

class Embednet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_out):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim*2, n_out)
        
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(1, 0, 2)
        avg_pooled = F.max_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) 
        max_pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) 
        x=self.fc(torch.cat([avg_pooled,max_pooled],dim=1))    
        return x

In [78]:
vocab_size = len(text_token.vocab)
embedding_dim = 120
n_out = 3
step_size=0.0005
decay=.9
model = Embednet(vocab_size, embedding_dim, n_out)

#pretrained_embeddings = text_token.vocab.vectors
#model.embedding.weight.data.copy_(pretrained_embeddings)


optimizer = optim.Adam(model.parameters(),lr=step_size)
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=decay)
criterion=F.cross_entropy


In [79]:
# BucketIterator returns a batch object
train_batch, valid_batch, test_batch = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=128,sort_within_batch=False,sort=False,
    device=None)

In [80]:
best_model_state_dict = copy.deepcopy(model.state_dict()) ## saves the best model weights
def callback(epoch):
    best_score = -np.inf   ### keeps track of valid accuracy
    model.eval()

    offset = 0; valid_loss = 0
    valid_acc=0
    for entry in valid_batch:
        offset+=1
        pred = model(entry.text).squeeze(1)
        entry.airline_sentiment = entry.airline_sentiment.type(torch.LongTensor)
        loss = criterion(pred, entry.airline_sentiment)
        valid_loss += loss.item()
        pred = pred.data.max(1)[1]
        valid_acc += float(pred.eq(entry.airline_sentiment.data).sum())

    print("valid_loss:", valid_loss / len(valid_batch))
    print("valid_accuracy:", valid_acc / len(valid_batch))
    
    if valid_acc > best_score:
       best_score=valid_acc
       best_model_state_dict[0] = copy.deepcopy(model.state_dict())
       with open("model.pt", 'wb') as f:
            torch.save(best_model_state_dict[0], f)



### Training

In [81]:
epochs=10
iteration =1
import time
train_loss_epoch=[]
for epoch in range(epochs):
    t0 = time.time()
    print("epoch started")
    print("---------------------------")
    print("epoch = %d" % epoch)
    #print("step_size = %.4f" % step_size)
    train_loss=0.0
    
    for batch in train_batch:
        iteration += 1
        model.train()
        optimizer.zero_grad()
        pred= model(batch.text).squeeze(1)
        batch.airline_sentiment = batch.airline_sentiment.type(torch.LongTensor)
        loss = criterion(pred, batch.airline_sentiment)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        t1 = time.time() 
    callback(epoch)
    print("Epoch took {} seconds".format(t1-t0)) 
    print("train_loss:",train_loss / len(train_batch))
    train_loss_epoch.append([epoch,train_loss / len(train_batch)])
    print()
    scheduler.step()
    step_size = step_size * decay

epoch started
---------------------------
epoch = 0
valid_loss: 0.8938344144821166
valid_accuracy: 76.6
Epoch took 1.9358348846435547 seconds
train_loss: 0.9245321708813048

epoch started
---------------------------
epoch = 1
valid_loss: 0.8653124618530273
valid_accuracy: 77.64
Epoch took 1.8969309329986572 seconds
train_loss: 0.837692732350868

epoch started
---------------------------
epoch = 2
valid_loss: 0.8591062521934509
valid_accuracy: 77.88
Epoch took 1.8548369407653809 seconds
train_loss: 0.8227043360994574

epoch started
---------------------------
epoch = 3
valid_loss: 0.8374351716041565
valid_accuracy: 78.96
Epoch took 1.8116416931152344 seconds
train_loss: 0.803349050513485

epoch started
---------------------------
epoch = 4
valid_loss: 0.8305393934249878
valid_accuracy: 80.0
Epoch took 1.8106989860534668 seconds
train_loss: 0.7950444692059567

epoch started
---------------------------
epoch = 5
valid_loss: 0.8165968751907349
valid_accuracy: 80.08
Epoch took 1.83610177040

### Testing model performance:

In [82]:
with open('model.pt', 'rb') as f:
     state_dict = torch.load(f,map_location='cpu')

In [83]:
model = Embednet(vocab_size, embedding_dim, n_out)
model.load_state_dict(state_dict)
model.eval()

Embednet(
  (embedding): Embedding(15475, 120)
  (fc): Linear(in_features=240, out_features=3, bias=True)
)

In [84]:
test_acc=0.0
for entry in test_batch:
    pred = model(entry.text).squeeze(1)
    entry.airline_sentiment = entry.airline_sentiment.type(torch.LongTensor)
    pred = pred.data.max(1)[1]
    test_acc += float(pred.eq(entry.airline_sentiment.data).sum())
print(" We obtain test accuracy:",test_acc / len(test_batch))

 We obtain test accuracy: 84.28571428571429
