###### importing important libraries including numpy, pandas and pyTorch

In [2]:
import numpy as np
import pandas as pd
import math

# for evaluating the model
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# PyTorch libraries and modules
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam, SGD
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import itertools

###### Load the data, convert language labels into int form

In [5]:
df_train= pd.read_csv('/Users/nehakardam/Documents/UWclasses /EE511- Intro to Statistics/Neha_Assignment/HW5/train2.csv', names=['language', 'document'])
df_val= pd.read_csv('/Users/nehakardam/Documents/UWclasses /EE511- Intro to Statistics/Neha_Assignment/HW5/val2.csv', names=['language', 'document'])
df_test= pd.read_csv('/Users/nehakardam/Documents/UWclasses /EE511- Intro to Statistics/Neha_Assignment/HW5/test2.csv', names=['language', 'document'])

In [6]:
language_map = {}
i = 0
for lan in df_train.language.unique():
    language_map[lan] = i
    i = i+1
df_train.language.replace(language_map, inplace=True)
df_val.language.replace(language_map, inplace=True)
df_test.language.replace(language_map, inplace=True)
language_map

{'es': 0,
 'en': 1,
 'pt': 2,
 'fr': 3,
 'ca': 4,
 'de': 5,
 'eu': 6,
 'it': 7,
 'gl': 8}

##### Creating a vocabulury and computing the perplexity for this distribution using the validation data.

cross_entropy: 5.0599491674909345 

perplexity: 33.35772898629834

In [7]:
def text_to_vocab(text_arr):
    v = {}
    all_freq = {} 
    for text in text_arr:
        for i in text: 
            if i in all_freq: 
                all_freq[i] += 1
            else: 
                all_freq[i] = 1
    v[spl_char] = 0
    for i in all_freq:
        if(all_freq[i] >= 10):
            v[i] = all_freq[i]
        else:
            v[spl_char] = v[spl_char] + all_freq[i]
    return v

spl_char = 'out-of-vocabulary'
vocab = {}

train_tweets = []
for i in range(df_train.shape[0]):
    text = df_train.document[i]
    train_tweets.append(text)

vocab = text_to_vocab(train_tweets)

relative_freq = {}
s = sum(vocab.values())
for x in vocab:
    relative_freq[x] = vocab[x]/s
    
val_tweets = []
for i in range(df_val.shape[0]):
    text = df_val.document[i]
    val_tweets.append(text)
    
logpx = []
for t in val_tweets:
    for x in t:
        if x in relative_freq:
            logpx.append(math.log2(relative_freq[x]))
        else:
            logpx.append(math.log2(relative_freq[spl_char]))

cross_entropy = (-1)*sum(logpx)/len(logpx)
perplexity = 2**cross_entropy

print('cross_entropy:', cross_entropy, '\nperplexity:', perplexity)

cross_entropy: 5.0599491674909345 
perplexity: 33.35772898629834


###### Converting text data into tensor sequenes

In [8]:
def index_of(tok):
    vocab_list = list(vocab.keys())
    if tok in vocab_list:
        return vocab_list.index(tok)
    else:
        return 0

train_tweets = df_train.document.str.slice(0, 280)
val_tweets = df_val.document.str.slice(0, 280)
train_vectorized_seqs = [[index_of(tok) for tok in seq]for seq in train_tweets]
val_vectorized_seqs = [[index_of(tok) for tok in seq]for seq in val_tweets]

train_seq_lengths = torch.LongTensor(list(map(len, train_vectorized_seqs)))
val_seq_lengths = torch.LongTensor(list(map(len, val_vectorized_seqs)))

train_seq_tensor = Variable(torch.zeros((len(train_vectorized_seqs), 280))).long()
for idx, (seq, seqlen) in enumerate(zip(train_vectorized_seqs, train_seq_lengths)):
    train_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
    
val_seq_tensor = Variable(torch.zeros((len(val_vectorized_seqs), 280))).long()
for idx, (seq, seqlen) in enumerate(zip(val_vectorized_seqs, val_seq_lengths)):
    val_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

###### Perform embedding

In [15]:
embedding = nn.Embedding(len(vocab), 16)

train_x = embedding(train_seq_tensor)
val_x = embedding(val_seq_tensor)

train_y = torch.tensor(df_train.language.values)
val_y = torch.tensor(df_val.language.values)

In [18]:
train_x.shape

torch.Size([76875, 280, 16])

In [17]:
# prepping the training and validation set for CNN model
x_train, x_val = train_x.reshape(76875, 1, 280, 16), val_x.reshape(11128, 1, 280, 16)
y_train, y_val = Variable(train_y), Variable(val_y)

###### Define CNN model

In [8]:
class Net(Module):   
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 50, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(50, 100, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(100 * 280, 1000)
        self.fc2 = nn.Linear(1000, 9)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.drop_out(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

# defining the model
model = Net()
# defining the optimizer
optimizer = Adam(model.parameters(), lr=0.07)
# defining the loss function
criterion = CrossEntropyLoss()
    
print(model)


def train(x_train, y_train):
    model.train()
    tr_loss = 0

    # clearing the Gradients of the model parameters
    optimizer.zero_grad()
    
    # prediction for training set
    output = model(x_train)

    # computing the training and validation loss
    loss = criterion(output, y_train)

    # computing the updated weights of all the model parameters
    loss.backward(retain_graph=True)
    optimizer.step()
    tr_loss = loss.item()

    # Track the accuracy
    total = y_train.shape[0]
    _, predicted = torch.max(output.data, 1)
    correct = (predicted == y_train).sum().item()

    print('Loss: {:.4f}, Accuracy: {:.2f}%'.format(tr_loss, (correct / total) * 100))
    
    return loss

Net(
  (layer1): Sequential(
    (0): Conv2d(1, 50, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(50, 100, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (drop_out): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=28000, out_features=1000, bias=True)
  (fc2): Linear(in_features=1000, out_features=9, bias=True)
)


###### Train the model

In [9]:
def batch(iterable, n=100):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

i = 0
# training the model    
for x_t, y_t in zip(batch(x_train), batch(y_train)):
    loss = train(x_t, y_t)
    loss.detach()

Loss: 2.1870, Accuracy: 14.00%
Loss: 37552.3438, Accuracy: 28.00%
Loss: 329501.4062, Accuracy: 1.00%
Loss: 27108.4883, Accuracy: 32.00%
Loss: 2259.7744, Accuracy: 21.00%
Loss: 4.6153, Accuracy: 24.00%
Loss: 4.4603, Accuracy: 34.00%
Loss: 5.1571, Accuracy: 32.00%
Loss: 2.3933, Accuracy: 22.00%
Loss: 4.6724, Accuracy: 16.00%
Loss: 4.0023, Accuracy: 32.00%
Loss: 3.2943, Accuracy: 38.00%
Loss: 2.5801, Accuracy: 30.00%
Loss: 3.6233, Accuracy: 36.00%
Loss: 4.2683, Accuracy: 28.00%
Loss: 2.1008, Accuracy: 40.00%
Loss: 4.4762, Accuracy: 33.00%
Loss: 2.5481, Accuracy: 36.00%
Loss: 4.0940, Accuracy: 27.00%
Loss: 4.9378, Accuracy: 34.00%
Loss: 5.2338, Accuracy: 32.00%
Loss: 3.8575, Accuracy: 27.00%
Loss: 3.7253, Accuracy: 26.00%
Loss: 2.1043, Accuracy: 41.00%
Loss: 3.1715, Accuracy: 28.00%
Loss: 4.5826, Accuracy: 32.00%
Loss: 2.7347, Accuracy: 33.00%
Loss: 4.3988, Accuracy: 35.00%
Loss: 3.1269, Accuracy: 40.00%
Loss: 2.7172, Accuracy: 31.00%
Loss: 3.8647, Accuracy: 36.00%
Loss: 5.0039, Accuracy: 

KeyboardInterrupt: 