# FAKE DATA: Sentiment prediction with self-attention

Goal: get the tweet-encoding mechanism working on a tiny, hand-made data set.

In [1]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence
import nltk
from nltk.corpus import stopwords
import re
import string
import time
import csv

# Floating-point dtype alias (not referenced by the cells shown below).
dtype = torch.float
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Bare expression: displays the selected device as the cell output.
device

device(type='cpu')

## Fake GloVe data

In [2]:
# Hand-made stand-in for a GloVe embedding table: each record is a word
# followed by the four components of its word vector. No column names are
# supplied; load_glove reads the columns by position.
df_glove = pd.DataFrame.from_records(
    [
        ['the', 0, 1, 2, 3],
        ['cat', 9, 2, 1, 4],
        ['and', 7, 0, 5, 5],
        ['dog', 6, 4, 7, 8]
    ]
)

In [3]:
def load_glove(df_glove):
    """Build vocabulary lookups and the embedding matrix from a GloVe-style frame.

    Column 0 of ``df_glove`` holds the words; every remaining column holds one
    component of the word vector. Word index 0 is reserved for an unknown /
    missing word, so real words are numbered from 1.

    Returns a 4-tuple ``(itow, wtoi, vocab, glove)``:
      * ``itow``  - list mapping word index -> word (``itow[0]`` is ``None``)
      * ``wtoi``  - dict mapping word -> word index
      * ``vocab`` - set of all entries of ``itow`` (includes ``None``)
      * ``glove`` - array whose row ``i`` is the vector for word index ``i``;
                    row 0 is all zeros
    """
    words = df_glove.iloc[:, 0].values
    vectors = df_glove.iloc[:, 1:].values

    # Slot 0 is the placeholder for "no such word".
    itow = [None, *words]
    wtoi = dict(zip(itow, range(len(itow))))
    vocab = set(itow)

    # Prepend an all-zero row so row numbers line up with word indexes.
    glove = np.insert(vectors, 0, values=0, axis=0)
    return itow, wtoi, vocab, glove

In [4]:
# Build the lookup tables and embedding matrix from the fake GloVe frame.
itow, wtoi, vocab, glove = load_glove(df_glove)
# Bare tuple: displays all four results as the cell output.
itow, wtoi, vocab, glove

([None, 'the', 'cat', 'and', 'dog'],
 {None: 0, 'the': 1, 'cat': 2, 'and': 3, 'dog': 4},
 {None, 'and', 'cat', 'dog', 'the'},
 array([[0, 0, 0, 0],
        [0, 1, 2, 3],
        [9, 2, 1, 4],
        [7, 0, 5, 5],
        [6, 4, 7, 8]]))

In [30]:
# Number of columns of the embedding matrix, i.e. the length of one word vector.
d = glove.shape[1] # word vec len
d

4

## Fake tweet data

In [5]:
# Three toy "tweets", each paired with a 0/1 sentiment label. Every word used
# here also appears in the fake GloVe table.
df_tweets = pd.DataFrame.from_records(
    [
        ['the cat and dog', 0],
        ['cat cat', 1],
        ['the dog and the cat', 1]
    ],
    columns = ['text', 'sentiment']
)
# Bare expression: displays the frame as the cell output.
df_tweets

Unnamed: 0,text,sentiment
0,the cat and dog,0
1,cat cat,1
2,the dog and the cat,1


In [6]:
def clean(s):
    """Lowercase ``s`` and keep only the words found in the global ``vocab``.

    The string is split on whitespace; surviving words are re-joined with
    single spaces in their original order.
    """
    known_words = filter(lambda word: word in vocab, s.lower().split())
    return ' '.join(known_words)

# Normalize every tweet, overwriting the 'text' column with the cleaned text.
df_tweets['text'] = df_tweets['text'].apply(clean)
# Preview the first rows as the cell output.
df_tweets.head()

Unnamed: 0,text,sentiment
0,the cat and dog,0
1,cat cat,1
2,the dog and the cat,1


In [7]:
# Length of the longest tweet, counted in whitespace-separated words.
# encode_words uses this as the width of the padded index matrix.
max_len = df_tweets['text'].apply(str.split).apply(len).max() # max num words in a tweet
max_len

5

## Encode tweets as sequence of word indexes

Create a matrix `X` where `X[i,j]` is the word index of word `j` in tweet `i`. Tweets shorter than the longest tweet are padded on the right with index 0.

In [8]:
def encode_words(df):
    """Encode the 'text' column of ``df`` as a matrix of word indexes.

    Returns an integer tensor with one row per tweet and ``max_len`` columns
    (``max_len`` and ``wtoi`` are notebook-level globals). Entry ``[i, j]`` is
    the ``wtoi`` index of word ``j`` in tweet ``i``; positions past the end of
    a tweet hold 0.
    """
    X = torch.zeros(size=[len(df), max_len], dtype=int)
    tokenized = df['text'].apply(str.split)
    for row, words in enumerate(tokenized):
        ids = torch.tensor([wtoi[word] for word in words])
        n_pad = max_len - len(words)
        # Constant-pad with zeros on the right only.
        X[row, :] = nn.functional.pad(ids, (0, n_pad), mode='constant', value=0)
    return X

In [9]:
# Encode all tweets as rows of word indexes (0 = padding).
X = encode_words(df_tweets)
# Bare expression: displays the index matrix as the cell output.
X

tensor([[1, 2, 3, 4, 0],
        [2, 2, 0, 0, 0],
        [1, 4, 3, 1, 2]])

In [10]:
# Number of distinct sentiment labels present in the data.
nclasses = len(df_tweets['sentiment'].unique())
# Target labels, kept as a pandas Series.
y = df_tweets['sentiment']

## Create centroid word vector per tweet

In [41]:
def centroids(X, vectors=None):
    """Encode each tweet as the mean (centroid) of its word vectors.

    Parameters
    ----------
    X : 2-D tensor of word indexes, one row per tweet. Index 0 marks padding /
        unknown words and is ignored when averaging.
    vectors : optional embedding matrix whose row ``i`` is the vector for word
        index ``i``. Defaults to the notebook-level ``glove`` matrix.

    Returns
    -------
    A float tensor of shape ``(X.shape[0], vectors.shape[1])``. A tweet with no
    valid words gets an all-zero row rather than a NaN row from dividing by
    zero.
    """
    if vectors is None:
        vectors = glove  # fall back to the notebook's embedding matrix

    X_encoded = torch.zeros(size=[X.shape[0], vectors.shape[1]])
    for i, x in enumerate(X):
        # Keep only real words; index 0 is the padding / unknown slot.
        valid = [int(wi) for wi in x if wi > 0]
        if not valid:
            continue  # nothing to average: leave the zero row in place
        # One fancy-indexing call gathers every row for this tweet at once.
        wvecs = torch.as_tensor(vectors[valid, :])
        X_encoded[i, :] = wvecs.sum(dim=0) / len(valid)
    return X_encoded

In [42]:
# Replace each row of word indexes with the centroid of its word vectors.
X_encoded = centroids(X)
# Bare expression: displays the centroid matrix as the cell output.
X_encoded

tensor([[5.5000, 1.7500, 3.7500, 5.0000],
        [9.0000, 2.0000, 1.0000, 4.0000],
        [4.4000, 1.6000, 3.4000, 4.6000]])