In [1]:
import numpy as np
import pandas as pd
import math

# for evaluating the model
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# PyTorch libraries and modules
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam, SGD
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import itertools

# empty list to store training losses
train_losses = []
# empty list to store validation losses
val_losses = []
    
def text_to_vocab(text_arr):
    v = {}
    all_freq = {} 
    for text in text_arr:
        for i in text: 
            if i in all_freq: 
                all_freq[i] += 1
            else: 
                all_freq[i] = 1
    v[spl_char] = 0
    for i in all_freq:
        if(all_freq[i] >= 10):
            v[i] = all_freq[i]
        else:
            v[spl_char] = v[spl_char] + all_freq[i]
    return v

class Net(Module):   
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 50, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(50, 100, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(280 * 50, 1000)
        self.fc2 = nn.Linear(1000, 9)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.drop_out(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

# defining the model
model = Net()
# defining the optimizer
optimizer = Adam(model.parameters(), lr=0.07)
# defining the loss function
criterion = CrossEntropyLoss()
    
print(model)

def train(x_train, y_train):
    model.train()
    tr_loss = 0

    # clearing the Gradients of the model parameters
    optimizer.zero_grad()
    
    # prediction for training and validation set
    output_train = model(x_train)

    # computing the training and validation loss
    loss_train = criterion(output_train, y_train)
    train_losses.append(loss_train)

    # computing the updated weights of all the model parameters
    loss_train.backward(retain_graph=True)
    optimizer.step()
    tr_loss = loss_train.item()
    return tr_loss
    
    
def index_of(tok):
    vocab_list = list(vocab.keys())
    if tok in vocab_list:
        return vocab_list.index(tok)
    else:
        return 0

Net(
  (layer1): Sequential(
    (0): Conv2d(1, 50, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(50, 100, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (drop_out): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=14000, out_features=1000, bias=True)
  (fc2): Linear(in_features=1000, out_features=9, bias=True)
)


In [2]:
df_train= pd.read_csv('/Users/mukulj/Downloads/Neha Project/Assignment5/train.csv', names=['language', 'document'])
df_val= pd.read_csv('/Users/mukulj/Downloads/Neha Project/Assignment5/val.csv', names=['language', 'document'])
df_test= pd.read_csv('/Users/mukulj/Downloads/Neha Project/Assignment5/test.csv', names=['language', 'document'])

In [3]:
language_map = {}
i = 0
for lan in df_train.language.unique():
    language_map[lan] = i
    i = i+1
df_train.language.replace(language_map, inplace=True)
df_val.language.replace(language_map, inplace=True)
df_test.language.replace(language_map, inplace=True)
language_map

{'es': 0,
 'en': 1,
 'pt': 2,
 'fr': 3,
 'ca': 4,
 'de': 5,
 'eu': 6,
 'it': 7,
 'gl': 8}

In [4]:
spl_char = 'out-of-vocabulary'
vocab = {}

train_tweets = []
for i in range(df_train.shape[0]):
    text = df_train.document[i]
    train_tweets.append(text)

vocab = text_to_vocab(train_tweets)

relative_freq = {}
s = sum(vocab.values())
for x in vocab:
    relative_freq[x] = vocab[x]/s
    
val_tweets = []
for i in range(df_val.shape[0]):
    text = df_val.document[i]
    val_tweets.append(text)
    
logpx = []
for t in val_tweets:
    for x in t:
        if x in relative_freq:
            logpx.append(math.log2(relative_freq[x]))
        else:
            logpx.append(math.log2(relative_freq[spl_char]))

cross_entropy = (-1)*sum(logpx)/len(logpx)
perplexity = 2**cross_entropy

print('cross_entropy:', cross_entropy, '\nperplexity:', perplexity)

cross_entropy: 5.0599491674909345 
perplexity: 33.35772898629834


In [5]:
train_tweets = df_train.document.str.slice(0, 280)
val_tweets = df_val.document.str.slice(0, 280)
train_vectorized_seqs = [[index_of(tok) for tok in seq]for seq in train_tweets]
val_vectorized_seqs = [[index_of(tok) for tok in seq]for seq in val_tweets]

# get the length of each seq in your batch
train_seq_lengths = torch.LongTensor(list(map(len, train_vectorized_seqs)))
val_seq_lengths = torch.LongTensor(list(map(len, val_vectorized_seqs)))

# dump padding everywhere, and place seqs on the left.
# NOTE: you only need a tensor as big as your longest sequence
train_seq_tensor = Variable(torch.zeros((len(train_vectorized_seqs), 280))).long()
for idx, (seq, seqlen) in enumerate(zip(train_vectorized_seqs, train_seq_lengths)):
    train_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
    
val_seq_tensor = Variable(torch.zeros((len(val_vectorized_seqs), 280))).long()
for idx, (seq, seqlen) in enumerate(zip(val_vectorized_seqs, val_seq_lengths)):
    val_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

In [6]:
embedding = nn.Embedding(len(vocab), 10)

train_x = embedding(train_seq_tensor)
val_x = embedding(val_seq_tensor)

train_y = torch.tensor(df_train.language.values)
val_y = torch.tensor(df_val.language.values)

In [7]:
# getting the training set
x_train, x_val = train_x.reshape(76875, 1, 280, 10), val_x.reshape(11128, 1, 280, 10)
# getting the validation set
y_train, y_val = Variable(train_y), Variable(val_y)

In [None]:
def batch(iterable, n=100):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]
i = 0
# training the model    
for x_t, y_t in zip(batch(x_train), batch(y_train)):
    print('batch', i, train(x_t, y_t))
    i = i+1

batch 0 2.2527060508728027
batch 1 5570.65869140625
batch 2 128577.1875
batch 3 489.87872314453125
batch 4 6.632012367248535
