In [1]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn 
import torch.nn.functional as F
from sklearn import metrics
import tensorflow as tf
from tqdm import tqdm
from torch.utils.data import Dataset, TensorDataset

In [2]:
# Number of token positions every name is padded/truncated to (see pad_sequences in run()).
MAX_LEN = 8
# Mini-batch sizes for the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 128
VALID_BATCH_SIZE = 128
# Upper bound on training epochs.
EPOCHS = 10
# Width of each embedding vector; used by create_embedding_matrix().
EMBEDDING_DIM = 100
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# NOTE(review): train_path and eval_path point at the same file, so the model
# would be validated on its own training data. Confirm whether a separate
# training CSV should be used here.
train_path = "data/cleaned_data/testing/random_names_eval.csv"
eval_path = "data/cleaned_data/testing/random_names_eval.csv"


def _load_name_frame(path):
    """Read a name/label CSV and return a cleaned DataFrame.

    Only the columns at positions 1 and 2 are read; they are expected to be
    called ``name`` and ``label``. Rows with missing values are dropped,
    ``name`` is cast to ``str`` and ``label`` to ``int``, and the index is
    reset so it is contiguous again after the drop.

    pandas' ``dropna`` / ``astype`` return new objects, so their results have
    to be kept -- calling them without assignment leaves the frame unchanged.

    :param path: location of the CSV file
    :return: cleaned DataFrame with ``name`` and ``label`` columns
    """
    frame = pd.read_csv(path, encoding='utf-8', usecols=[1, 2])
    return (
        frame.dropna()
        .astype({'name': 'str', 'label': 'int'})
        .reset_index(drop=True)
    )


df_train = _load_name_frame(train_path)
df_eval = _load_name_frame(eval_path)

# Feature / target views used by the tokenizer and the datasets.
X_train = df_train['name']
y_train = df_train['label']
X_test = df_eval['name']
y_test = df_eval['label']

In [4]:
class NameDataset(Dataset):
    """Map-style dataset pairing an encoded name with its label.

    ``names`` and ``labels`` are stored as given and indexed position by
    position; both must support ``len()``/``[]`` with the same indices.
    """

    def __init__(self, names, labels):
        self.names, self.labels = names, labels

    def __len__(self):
        """Number of samples, taken from the length of ``names``."""
        return len(self.names)

    @staticmethod
    def _as_long(value):
        """Convert ``value`` to a CPU tensor of 64-bit integers."""
        return torch.tensor(value).type(torch.LongTensor)

    def __getitem__(self, idx):
        """Return the ``(name, label)`` pair at ``idx`` as two long tensors."""
        name = self._as_long(self.names[idx])
        label = self._as_long(self.labels[idx])
        return name, label


In [5]:
class LSTM(nn.Module):
    """Binary sequence classifier: frozen embeddings -> 3 stacked BiLSTMs -> MLP head.

    Layer widths (each LSTM is bidirectional, so it emits 2 * hidden features):
        embedding_dim -> lstm1 (2*32=64) -> lstm2 (2*16=32) -> lstm3 (2*8=16)
        -> fc1 (16) -> fc2 (8) -> fc3 (1) -> sigmoid

    :param embedding_matrix: 2-D array of shape (num_embeddings, embedding_dim)
        holding the pre-trained vectors; it is copied into a non-trainable
        embedding layer.
    """

    def __init__(self,embedding_matrix):
        super(LSTM,self).__init__()

        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        # Load the pre-trained vectors and freeze them so the optimizer never updates them.
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            requires_grad=False,
        )

        self.lstm1 = nn.LSTM(embedding_dim, 32, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(64, 16, bidirectional=True, batch_first=True)  # 64 because it's bidirectional
        self.lstm3 = nn.LSTM(32, 8, bidirectional=True, batch_first=True)  # 32 because it's bidirectional

        self.dropout1 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(16, 16)  # 16 because it's bidirectional

        self.dropout2 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(16, 8)

        self.dropout3 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(8, 1)


    def forward(self,x):
        """Return one probability per input sequence.

        :param x: integer token ids of shape (batch, seq_len)
        :return: float tensor of shape (batch, 1) with values in [0, 1]
        """
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        # Collapse the time dimension: keep only the final hidden state of each
        # direction of the last LSTM instead of its per-timestep outputs, so the
        # head produces a single prediction per sequence (matching the
        # (batch, 1) targets used for the loss).
        _, (h_n, _) = self.lstm3(x)
        # h_n is (2, batch, 8): index 0 = forward direction, 1 = backward direction.
        x = torch.cat((h_n[0], h_n[1]), dim=1)
        x = self.dropout1(x)
        x = torch.relu(self.fc1(x))
        x = self.dropout2(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout3(x)
        x = torch.sigmoid(self.fc3(x))
        return x
        

In [6]:
def train(data_loader,model,optimizer,device):
    """
    Train ``model`` for one epoch over ``data_loader``.

    Each batch may be either a ``(names, labels)`` pair (what NameDataset
    produces with the default collate function) or a dict with ``"names"``
    and ``"labels"`` keys.

    The model is expected to output probabilities (its forward pass ends in a
    sigmoid), so plain binary cross-entropy is used; ``BCEWithLogitsLoss``
    would apply a second sigmoid on top of them.

    :param data_loader: this is the torchdataloader
    :param model: model(lstm model) returning probabilities of shape (batch, 1)
    :param optimizer: optimizer Adam, SGD etc
    :param device: this can be "cuda" or "cpu"
    """
    # set the model to training mode
    model.train()
    # build the loss module once instead of once per batch
    loss_fn = nn.BCELoss()

    # go through the batches of data in data_loader:
    for data in tqdm(data_loader):
        if isinstance(data, dict):
            names, labels = data["names"], data["labels"]
        else:
            names, labels = data
        # move the data to the device that we want to use
        names = names.to(device,dtype=torch.long)
        labels = labels.to(device,dtype=torch.float)
        # clear the gradients
        optimizer.zero_grad()
        # make predictions from the models
        predictions = model(names)
        # loss: targets are reshaped to (batch, 1) to line up with the predictions
        loss = loss_fn(predictions,labels.view(-1,1))
        loss.backward()
        optimizer.step()
        
def evaluate(data_loader,model,device):
    """
    Run ``model`` over ``data_loader`` in eval mode and collect its outputs.

    Each batch may be either a ``(names, labels)`` pair (what NameDataset
    produces with the default collate function) or a dict with ``"names"``
    and ``"labels"`` keys.

    :param data_loader: this is the torchdataloader
    :param model: model(lstm model)
    :param device: this can be "cuda" or "cpu"
    :return: ``(final_predictions, final_targets)`` -- two Python lists with
        one entry per sample; each prediction entry is the model's output row
        converted with ``tolist()``.
    """
    final_predictions = []
    final_targets = []

    model.eval()
    # no gradients are needed for inference; skipping them saves memory and time
    with torch.no_grad():
        for data in data_loader:
            if isinstance(data, dict):
                names, labels = data["names"], data["labels"]
            else:
                names, labels = data
            # only the inputs have to live on the model's device; the targets
            # are returned as plain Python values
            names = names.to(device,dtype=torch.long)
            predictions = model(names)
            final_predictions.extend(predictions.detach().cpu().numpy().tolist())
            final_targets.extend(labels.detach().cpu().numpy().tolist())

    return  final_predictions,final_targets


In [7]:
def create_embedding_matrix(word_index,embedding_dict,embedding_dim=None):
    """
    This function creates the embedding matrix
    :param word_index: a dictionary of word: index_value
    :param embedding_dict: a dictionary of word: embedding vector
    :param embedding_dim: length of each embedding vector; when omitted the
        module-level EMBEDDING_DIM constant is used
    :return a numpy array with embedding vectors for all known words; it has
        len(word_index) + 1 rows, and the rows of words missing from
        embedding_dict (as well as any index not present in word_index) stay
        all-zeros
    """
    if embedding_dim is None:
        # resolved at call time so the function follows the notebook's config cell
        embedding_dim = EMBEDDING_DIM
    # initialize the embedding matrix
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embedding_dict.get(word)
        # words not found in embedding_dict keep their all-zeros row
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [8]:
def run(num_workers=0):
    """
    Fit a tokenizer on the training names and build the train/validation loaders.

    The names are converted to integer sequences with a Keras ``Tokenizer``
    fitted on ``X_train`` and padded/truncated to ``MAX_LEN``. Only the
    validation loader is returned; the training pipeline after the ``return``
    is currently disabled.

    :param num_workers: number of DataLoader worker processes. Defaults to 0
        (load in the calling process): NameDataset is defined inside the
        notebook, and worker processes that are spawned rather than forked
        may be unable to import it -- NOTE(review): confirm on the target
        platform before raising this.
    :return: the validation DataLoader
    """

    print('Fitting tokenizer')
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(X_train)

    # oov_token = "<OOV>"
    # tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = MAX_LEN, oov_token=oov_token)
    # tokenizer.fit_on_texts(X_train)

    xtrain = tokenizer.texts_to_sequences(X_train)
    xtest = tokenizer.texts_to_sequences(X_test)

    xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain,maxlen = MAX_LEN)
    xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest,maxlen = MAX_LEN)

    train_dataset = NameDataset(names=xtrain,labels=df_train.label.values)
    # shuffle the training samples each epoch so batches are not tied to file order
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=num_workers,
    )

    valid_dataset = NameDataset(names=xtest,labels=df_eval.label.values)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=num_workers,
    )

    return valid_loader
    # --- training pipeline, currently disabled ---
    # NOTE(review): load_vectors is not defined in the cells shown here.
    # print("Load embeddings")
    # embedding_dict = load_vectors('data/glove.6B.100d.txt')
    # embedding_matrix = create_embedding_matrix(tokenizer.word_index,embedding_dict)
    # # create a torch device since we are using cuda
    #
    # print(device)

    # model = LSTM(embedding_matrix)
    # model.to(device)
    # optimizer = torch.optim.Adam(model.parameters(),lr = 1e-3)

    # print("Training model")
    # best_accuracy = 0
    # early_stopping_counter = 0
    # for epoch in range(1,EPOCHS+1):
    #     train(train_loader,model,optimizer,device)
    #     outputs,targets = evaluate(valid_loader,model,device)
    #     outputs = np.array(outputs)>0.5
    #     accuracy = metrics.accuracy_score(targets,outputs)
    #     print(f"EPOCH:{epoch}, Accuracy Score: {accuracy}")
    #     if accuracy>best_accuracy:
    #         best_accuracy = accuracy
    #     else:
    #         early_stopping_counter +=1
    #     if early_stopping_counter>2:
    #         break

In [9]:
# Build the loaders; run() returns only the validation DataLoader, kept here for inspection.
val =run() 


Fitting tokenizer


In [10]:
# Show the object returned by run() (a torch DataLoader, per the output below).
val

<torch.utils.data.dataloader.DataLoader at 0x1c83ac53590>

In [11]:
# Pull the first batch from the validation loader as a smoke test.
# NameDataset.__getitem__ returns a (name, label) tuple, so with the default
# collate function this is presumably a [names, labels] pair of stacked
# tensors -- TODO confirm.
batch = next(iter(val))