# Learn self-attention

In [50]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence
import nltk
from nltk.corpus import stopwords
import re
import string

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [51]:
df = pd.read_csv("/Users/parrt/data/airline-tweets.csv")
df = df[['text','airline_sentiment']]
df['airline_sentiment'] = df['airline_sentiment'].map({'negative':0, 'neutral':1, 'positive':2})
df

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,1
1,@VirginAmerica plus you've added commercials t...,2
2,@VirginAmerica I didn't today... Must mean I n...,1
3,@VirginAmerica it's really aggressive to blast...,0
4,@VirginAmerica and it's a really big bad thing...,0
...,...,...
14635,@AmericanAir thank you we got on a different f...,2
14636,@AmericanAir leaving over 20 minutes Late Flig...,0
14637,@AmericanAir Please bring American Airlines to...,1
14638,"@AmericanAir you have my money, you change my ...",0


In [55]:
stops = set(stopwords.words("english"))     

def clean(text):
    text = text.lower()
    text = re.sub(r'^@[a-z]+', "", text) # delete @VirginAmerica at start
    text = re.sub(r'[^a-z]+', " ", text) # delete non-text
    words = text.split(" ")
    words = [w.strip() for w in words if w not in stops]
    return ' '.join(words)

In [56]:
df['text'] = df['text'].apply(clean)
df

Unnamed: 0,text,airline_sentiment
0,dhepburn said,1
1,plus added commercials experience tacky,2
2,today must mean need take another trip,1
3,really aggressive blast obnoxious entertainme...,0
4,really big bad thing,0
...,...,...
14635,thank got different flight chicago,2
14636,leaving minutes late flight warnings communic...,0
14637,please bring american airlines blackberry,1
14638,money change flight answer phones suggestions...,0


In [72]:
df['words'] = df['text'].apply(str.split)
df

Unnamed: 0,text,airline_sentiment,words
0,dhepburn said,1,"[dhepburn, said]"
1,plus added commercials experience tacky,2,"[plus, added, commercials, experience, tacky]"
2,today must mean need take another trip,1,"[today, must, mean, need, take, another, trip]"
3,really aggressive blast obnoxious entertainme...,0,"[really, aggressive, blast, obnoxious, enterta..."
4,really big bad thing,0,"[really, big, bad, thing]"
...,...,...,...
14635,thank got different flight chicago,2,"[thank, got, different, flight, chicago]"
14636,leaving minutes late flight warnings communic...,0,"[leaving, minutes, late, flight, warnings, com..."
14637,please bring american airlines blackberry,1,"[please, bring, american, airlines, blackberry]"
14638,money change flight answer phones suggestions...,0,"[money, change, flight, answer, phones, sugges..."


In [81]:
max_len = df['words'].apply(len).max() # max num words in a tweet
max_len

22

In [82]:
vocab = sorted(set(' '.join(df['text'].values).split()))
vocab[0:20]
wtoi = {w:i for i,w in enumerate(vocab)}

In [114]:
X = torch.zeros(size=[len(df),max_len])
for i,tweet in enumerate(df['words']):
    word_indexes = [wtoi[w] for w in tweet]
    pad = nn.ConstantPad1d((0,max_len-len(word_indexes)), 0)
    X[i,:] = pad(torch.tensor(word_indexes))

In [115]:
X

tensor([[ 2998., 10106.,     0.,  ...,     0.,     0.,     0.],
        [ 8827.,   140.,  2200.,  ...,     0.,     0.,     0.],
        [11802.,  7616.,  7189.,  ...,     0.,     0.,     0.],
        ...,
        [ 8799.,  1422.,   410.,  ...,     0.,     0.,     0.],
        [ 7485.,  1841.,  4223.,  ...,     0.,     0.,     0.],
        [ 8927.,  7730.,  6397.,  ...,     0.,     0.,     0.]])

In [29]:
X = X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [30]:
n = len(X_train)
char_embed_sz = 10
nhidden = 300
nclasses = len(vocab) # char output vocab

print(f"{n:,d} training records, {char_embed_sz} embedding size, {nclasses} target classes, state is {nhidden}-vector")

693 training records, 10 embedding size, 64 target classes, state is 300-vector


In [61]:
class Embedding:
    def __init__(self, input_size, embed_sz):
        self.E = torch.randn(embed_sz, input_size, device=device, dtype=torch.float64, requires_grad=True) # embedding
        self.input_size = input_size
        self.embed_sz = embed_sz
    def parameters(self): return [self.E]
    def __call__(self, x):
        if isinstance(x, int):
            return self.E[:,x].reshape(self.embed_sz, 1)
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        return self.E[:,x]

In [62]:
class Linear:
    def __init__(self, input_size, output_size):
        self.V = torch.randn(output_size,  input_size, device=device, dtype=torch.float64, requires_grad=True)
        self.by = torch.zeros(output_size, 1,          device=device, dtype=torch.float64, requires_grad=True)
    def parameters(self): return [self.V, self.by]
    def __call__(self, h):
        o = self.V@h + self.by
        o = o.T # make it input_size x output_size
        return o

In [None]:
self.emby = Embedding(output_sz, output_embed_sz)
self.lin = Linear(nhidden, output_sz)
