### Todos:
### 1. preprocess the data to make it suitable for glove embeding:
### a. remove ' from words( don't -> dont)
### b. convert all letters to lowercase ( Hoping -> hoping)
### 2. find the frequency of each emoji to see if the dataset is balanced and balance the dataset(important)

In [1]:
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import numpy as np
import matplotlib.pyplot as plt
import interesting_labels

In [2]:
import pandas

In [3]:
def get_data():
    header = ['status_id','tweet','label']
    data_set = pandas.read_csv('processed.txt',delimiter='\t',names = header)
    return data_set

def get_data_set():
    return global_data_set

def get_glove_embedding(glove): 
    return nn.Embedding.from_pretrained(glove.vectors)

def get_label_mapping():
    original_list = interesting_labels.wanted_list
    output_list = [i for i in range(len(original_list))]
    return dict(zip(original_list,output_list))
    


In [4]:
mapping = get_label_mapping()

In [5]:
global_data_set = get_data()

In [6]:
glove = torchtext.vocab.GloVe(name="twitter.27B",dim=50)

In [7]:
glove_emb = get_glove_embedding(glove)

In [8]:
data_set = get_data_set()

In [9]:
len(data_set)

34596

In [10]:
type(data_set['tweet'][0])

str

In [11]:
data_set

Unnamed: 0,status_id,tweet,label
0,742407819496919041,Hoping I don't screw up this interview,1381
1,744101567981359105,I feel like a baby kangaroo stuck in it's moth...,1421
2,746426733834944512,girl ppl should be happy i even remembered her...,1381
3,743951261607690240,"Oh, the irony if Misha wins the Choice TV Scen...",1387
4,747298371967193088,"I miss you to , you so fake now",1397
5,743505228759187457,ever since I laid my eyes on Mikayla I knew sh...,1445
6,744146216263499776,i know,1420
7,744526330964762624,Bacolod please?!,1403
8,744327078019891201,ugh six months is a long time.,1447
9,747879438981947393,Things can change so quickly,1424


In [12]:
data_set['label']=data_set['label'].map(mapping)

In [13]:
data_set

Unnamed: 0,status_id,tweet,label
0,742407819496919041,Hoping I don't screw up this interview,4
1,744101567981359105,I feel like a baby kangaroo stuck in it's moth...,44
2,746426733834944512,girl ppl should be happy i even remembered her...,4
3,743951261607690240,"Oh, the irony if Misha wins the Choice TV Scen...",10
4,747298371967193088,"I miss you to , you so fake now",20
5,743505228759187457,ever since I laid my eyes on Mikayla I knew sh...,68
6,744146216263499776,i know,43
7,744526330964762624,Bacolod please?!,26
8,744327078019891201,ugh six months is a long time.,70
9,747879438981947393,Things can change so quickly,47


In [14]:
def split_tweet(tweet):
    tweet = tweet.replace(".", " . ") \
                 .replace(",", " , ") \
                 .replace(";", " ; ") \
                 .replace("?", " ? ")
    return tweet.split()

In [15]:
def get_tweet_words(glove_vector):
    train, valid, test = [],[],[]
    data_set = get_data_set()
    for i in range(len(data_set)):
        tweet = data_set['tweet'][i]
        if(type(tweet) != str):
            continue
        idxs = [glove_vector.stoi[w]        # lookup the index of word
                for w in split_tweet(tweet)
                if w in glove_vector.stoi] # keep words that has an embedding
        if not idxs: # ignore tweets without any word with an embedding
            continue
        idxs = torch.tensor(idxs) # convert list to pytorch tensor
        label = torch.tensor(data_set['label'][i]).long()
        if i % 5 < 3:
            train.append((idxs, label))
        elif i % 5 == 4:
            valid.append((idxs, label))
        else:
            test.append((idxs, label))
    return train, valid, test

In [38]:
train, valid, test = get_tweet_words(glove)

In [39]:
train[0]

(tensor([6456,   85,   53, 2706]), tensor(4))

In [40]:
valid[0]

(tensor([ 292,   15,   16,    4,   15,   55, 1271,  110]), tensor(20))

In [41]:
len(train)

16097

In [42]:
tweet_emb = glove_emb(train[0][0])
tweet_emb.shape

torch.Size([4, 50])

### Recurrent Neural Network Module

In [43]:
rnn_layer = nn.RNN(input_size=50,    # dimension of the input repr
                   hidden_size=50,   # dimension of the hidden units
                   batch_first=True) # input format is [batch_size, seq_len, repr_dim]

In [44]:
tweet_input = tweet_emb.unsqueeze(0) # add the batch_size dimension
h0 = torch.zeros(1, 1, 50)     # initial hidden layer
out, last_hidden = rnn_layer(tweet_input, h0)

In [45]:
print(out.shape)
print(last_hidden.shape)

torch.Size([1, 4, 50])
torch.Size([1, 1, 50])


In [46]:
out[:,-1,:]

tensor([[-0.5889,  0.2290, -0.0355, -0.0720,  0.2809, -0.7359,  0.5537,  0.4149,
         -0.4108,  0.4489,  0.3068, -0.1907, -0.0307, -0.1738, -0.7307,  0.3846,
          0.3623,  0.5471,  0.3193,  0.1727, -0.2341, -0.1899, -0.6437, -0.3374,
         -0.5755,  0.2356, -0.0686, -0.0420, -0.0940, -0.0116,  0.3505, -0.3592,
          0.6256,  0.8290, -0.1017,  0.3457,  0.3558,  0.4042,  0.0972, -0.4353,
         -0.5523,  0.1998, -0.2601, -0.6284, -0.3841,  0.5604, -0.4865, -0.6599,
         -0.3195, -0.0674]], grad_fn=<SliceBackward>)

### Building the model

In [47]:
class TweetRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(TweetRNN, self).__init__()
        self.emb = nn.Embedding.from_pretrained(glove.vectors)
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        # Look up the embedding
        x = self.emb(x)
        # Set an initial hidden state
        h0 = torch.zeros(1, x.size(0), self.hidden_size)
        # Forward propagate the RNN
        out, _ = self.rnn(x, h0)
        # Pass the output of the last time step to the classifier
        out = self.fc(out[:, -1, :])
        return out

model = TweetRNN(input_size=50, hidden_size=50, num_classes=len(interesting_labels.wanted_list))

In [48]:
data_set.nunique()

status_id    25977
tweet        25454
label           78
dtype: int64

In [49]:
train[0:16097][1]

(tensor([  264,    63,    11,   347, 46998,  2457,    35, 50306]), tensor(44))

In [50]:
len(set([element[1].item() for element in train]))

78

In [51]:
import random

class TweetBatcher:
    def __init__(self, tweets, batch_size=32, drop_last=False):
        # store tweets by length
        self.tweets_by_length = {}
        for words, label in tweets:
            # compute the length of the tweet
            wlen = words.shape[0]
            # put the tweet in the correct key inside self.tweet_by_length
            if wlen not in self.tweets_by_length:
                self.tweets_by_length[wlen] = []
            self.tweets_by_length[wlen].append((words, label),)
         
        #  create a DataLoader for each set of tweets of the same length
        self.loaders = {wlen : torch.utils.data.DataLoader(
                                    tweets,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    drop_last=drop_last) # omit last batch if smaller than batch_size
            for wlen, tweets in self.tweets_by_length.items()}
        
    def __iter__(self): # called by Python to create an iterator
        # make an iterator for every tweet length
        iters = [iter(loader) for loader in self.loaders.values()]
        while iters:
            # pick an iterator (a length)
            im = random.choice(iters)
            try:
                yield next(im)
            except StopIteration:
                # no more elements in the iterator, remove it
                iters.remove(im)

In [52]:
for i, (tweets, labels) in enumerate(TweetBatcher(train, drop_last=True)):
    print(tweets.shape, labels.shape)
print(i)

torch.Size([32, 5]) torch.Size([32])
torch.Size([32, 21]) torch.Size([32])
torch.Size([32, 23]) torch.Size([32])
torch.Size([32, 10]) torch.Size([32])
torch.Size([32, 7]) torch.Size([32])
torch.Size([32, 17]) torch.Size([32])
torch.Size([32, 18]) torch.Size([32])
torch.Size([32, 15]) torch.Size([32])
torch.Size([32, 9]) torch.Size([32])
torch.Size([32, 3]) torch.Size([32])
torch.Size([32, 23]) torch.Size([32])
torch.Size([32, 5]) torch.Size([32])
torch.Size([32, 9]) torch.Size([32])
torch.Size([32, 3]) torch.Size([32])
torch.Size([32, 25]) torch.Size([32])
torch.Size([32, 14]) torch.Size([32])
torch.Size([32, 11]) torch.Size([32])
torch.Size([32, 23]) torch.Size([32])
torch.Size([32, 24]) torch.Size([32])
torch.Size([32, 16]) torch.Size([32])
torch.Size([32, 24]) torch.Size([32])
torch.Size([32, 6]) torch.Size([32])
torch.Size([32, 17]) torch.Size([32])
torch.Size([32, 8]) torch.Size([32])
torch.Size([32, 21]) torch.Size([32])
torch.Size([32, 11]) torch.Size([32])
torch.Size([32, 18]) 

In [55]:
def get_accuracy(model, data_loader):
    correct, total = 0, 0
    for tweets, labels in data_loader:
        output = model(tweets)
        pred = output.max(1, keepdim=True)[1]
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += labels.shape[0]
    return float(correct) / float(total)

test_loader = TweetBatcher(test, batch_size=32, drop_last=False)
get_accuracy(model, test_loader)


0.01437908496732026

In [106]:
test_loader = TweetBatcher(test, batch_size=32, drop_last=True)

In [107]:
for tweets, labels in test_loader:
    print(tweets.shape)
    print(labels.shape)
    print(tweets[32])
    break

torch.Size([32, 10])
torch.Size([32])


IndexError: index 32 is out of bounds for dimension 0 with size 32

In [114]:
for i in range(100):
    test_loader = TweetBatcher(test, batch_size=32, drop_last=False)
    (get_accuracy(model, test_loader))


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
