# Imports

In [1]:
import os
import sys
import time
import datetime 
import json
import numpy as np
import pandas as pd
import gc
import random
from matplotlib import pyplot as plt

import argparse
from collections import Counter


In [2]:
import torch

from torch import nn, optim
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR

# from model import Model
# from dataset import Dataset


# if torch.cuda.is_available():
#   device = torch.device('cuda:0') 
# #   torch.set_default_tensor_type('torch.cuda.FloatTensor')
#   torch.backends.cudnn.benchmark = True
# else:
#    device = torch.device('cpu')
# #    torch.set_default_tensor_type('torch.FloatTensor')

# print('Using device:', device)
print(torch.cuda.get_device_name(0))

GeForce RTX 2070 SUPER


# Input-Output

In [3]:
# Root folder holding the input CSVs and the StereoSet evaluation data.
INPUT_FOLDER = "data/"

# Root folder for everything this notebook writes (models, figures, scores).
OUTPUT_FOLDER = "out/"
# makedirs(exist_ok=True) is idempotent, so re-running the cell is safe and
# there is no window between an existence check and the directory creation.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    

In [4]:
# Full corpus CSV; the tokenizer cell builds its vocabulary from this file.
INPUT_CSV_PATH = INPUT_FOLDER+"gender_encrypted_385500.csv"
# Train / validation CSVs read by the dataset objects in the "Start Training" cell.
TRAIN_CSV_PATH = INPUT_FOLDER+"gender_encrypted_385500_train.csv"
VALID_CSV_PATH = INPUT_FOLDER+"gender_encrypted_385500_valid.csv"
# Name of the CSV text column to model. It is also substituted into the
# StereoSet sentence file name and the results file name further down.
COL_NAME = "encrypted" # "tokenized", "encrypted"

In [5]:
# MODEL_FOLDER = COL_NAME
MODEL_FOLDER = "encrypted_Sentence_Single"

MODEL_OUTPUT_FOLDER = OUTPUT_FOLDER+MODEL_FOLDER+"/"
if not os.path.exists(MODEL_OUTPUT_FOLDER):
    os.mkdir(MODEL_OUTPUT_FOLDER)

In [6]:
# Folder with the StereoSet gender evaluation files.
STEREOSET_FOLDER = INPUT_FOLDER+"Stereoset_Gender_Data/"
# Evaluation sentences, one per line; the file name is selected by COL_NAME.
EVALUATION_SENTENCES_PATH = STEREOSET_FOLDER+"sentences_{}.txt".format(COL_NAME)
# Sample ids, one per line; calc_stereoset() pairs line i of this file with
# line i of the sentences file and asserts both files have the same length.
EVALUATION_IDS_PATH = STEREOSET_FOLDER+"ids.txt"


In [7]:
# Gold file handed to the StereoSet evaluation script (--gold-file).
DEV_DATA_PATH = STEREOSET_FOLDER+'stereoset-dev-gender-intersentence.json'
# JSON written by calc_stereoset() and handed to the evaluation script
# (--predictions-file).
PREDICTION_SCORES_PATH = MODEL_OUTPUT_FOLDER+'stereoset_prediction_scores.json'
# Destination that "results.json" is copied to after the evaluation cell runs.
RESULTS_PATH = MODEL_OUTPUT_FOLDER+"results_{}.json".format(COL_NAME)

# Split Train-Valid

In [8]:
# VALID_LEN = 10000
# data_df = pd.read_csv(INPUT_CSV_PATH)
# values = data_df.values
# print(len(values))

In [9]:
# random.seed(datetime.datetime.now())
# val_idx = random.randrange(0, len(values)-VALID_LEN)

# valid_values = values[val_idx:val_idx+VALID_LEN]
# valid_df = pd.DataFrame(valid_values, columns = data_df.columns)
# valid_df.to_csv(VALID_CSV_PATH, index=False)
# print(len(valid_df))

# train_values = np.delete(values, range(val_idx, val_idx+VALID_LEN), axis=0)
# train_df = pd.DataFrame(train_values, columns = data_df.columns)
# train_df.to_csv(TRAIN_CSV_PATH, index=False)
# print(len(train_df))


# Tokenizer

In [10]:
class Tokenizer_VocabBuilder:
    """Whitespace tokenizer plus a frequency-ranked vocabulary for one CSV column.

    Intended call order: construct, ``tokenize_dataset()``, ``build_vocab()``.
    Only after ``build_vocab()`` are ``encode_sentence`` and
    ``decode_sentence`` usable, because they rely on the lookup tables.

    Attributes created by ``__init__``:
        input_csv_path, data_col_name, vocab_size, input_df,
        input_sentences (raw column values), num_examples.
    Attributes created by ``tokenize_dataset``:
        tokenized_sentences (list of normalized strings).
    Attributes created by ``build_vocab``:
        unique_tokens_df (columns 'token', 'count', most frequent first),
        num_unique_tokens, vocab_list, token2idx, idx2token.
    """

    def __init__(self, input_csv_path: str, data_col_name: str, vocab_size: int = 10000):
        self.input_csv_path = input_csv_path
        self.data_col_name = data_col_name
        self.vocab_size = vocab_size

        # Read the CSV once and keep the chosen column as a plain Python list.
        self.input_df = pd.read_csv(self.input_csv_path)
        self.input_sentences = self.input_df[data_col_name].values.tolist()
        self.num_examples = len(self.input_sentences)

    def tokenize_sentence(self, sentence: str):
        """Return ``sentence`` as a string with whitespace runs collapsed to one space.

        The value is passed through ``str()`` first, so non-string cells are
        converted to their string form rather than rejected.
        """
        return ' '.join(str(sentence).strip().split())

    def tokenize_dataset(self):
        """Normalize every input sentence and store the result on the instance."""
        self.tokenized_sentences = [
            self.tokenize_sentence(raw) for raw in self.input_sentences
        ]

        assert self.num_examples == len(self.tokenized_sentences)

    def build_vocab(self):
        """Count tokens, rank them by frequency and build both lookup tables.

        The vocabulary starts with the four special tokens and is then filled
        with the most frequent corpus tokens up to ``vocab_size`` entries.
        """
        # Counter keeps first-seen order, so the frame below lists tokens in
        # the order they first appear in the corpus before it is sorted.
        token_counts = Counter()
        for sentence in self.tokenized_sentences:
            token_counts.update(sentence.split())

        frequency_table = pd.DataFrame(
            {
                'token': list(token_counts.keys()),
                'count': list(token_counts.values()),
            },
            columns=['token', 'count'],
        )
        self.unique_tokens_df = frequency_table.sort_values(
            by=['count'], axis=0, ascending=False, ignore_index=True
        )
        self.num_unique_tokens = len(self.unique_tokens_df)

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        ranked_tokens = self.unique_tokens_df['token'].values.tolist()
        # Four slots are taken by the special tokens above.
        self.vocab_list = special_tokens + ranked_tokens[0:self.vocab_size-4]

        self.token2idx = {token: idx for idx, token in enumerate(self.vocab_list)}
        self.idx2token = dict(enumerate(self.vocab_list))

    def encode_sentence(self, sentence: str):
        """Map a sentence to a list of vocabulary indices.

        Tokens that are not in the vocabulary are replaced by the index of
        '<unk>'.
        """
        lookup = self.token2idx
        return [
            lookup[token] if token in lookup else lookup['<unk>']
            for token in self.tokenize_sentence(sentence).split()
        ]

    def decode_sentence(self, token_idx_list: list):
        """Map indices back to tokens and join them with single spaces.

        Indices that are not in the vocabulary are rendered as '<unk>'.
        """
        return ' '.join(
            self.idx2token.get(token_idx, '<unk>') for token_idx in token_idx_list
        )
    
# Module-level tokenizer shared by the datasets and by calc_stereoset() /
# predict(), which read it as a global. The vocabulary is capped at 8000
# entries, four of which are the special tokens added by build_vocab().
tokenizer = Tokenizer_VocabBuilder(INPUT_CSV_PATH, COL_NAME, 8000)
tokenizer.tokenize_dataset()
tokenizer.build_vocab()

# Dataset for NN

In [11]:
def pad_sequences(x, max_len, pad_value=1):
    """Right-pad or truncate a sequence of token ids to exactly ``max_len``.

    Args:
        x: sequence of integer token ids.
        max_len: length of the returned tensor.
        pad_value: id written into the unused trailing positions. It defaults
            to 1 so existing callers get the same result as before. Note that
            in the vocabulary built by ``Tokenizer_VocabBuilder`` index 1 is
            '<unk>' and index 0 is '<pad>', so callers that want real padding
            ids should pass ``pad_value=0``.

    Returns:
        A 1-D ``torch.long`` tensor of length ``max_len``.
    """
    padded = torch.full((max_len,), pad_value, dtype=torch.long)
    # Slicing handles both cases: a longer input is truncated, a shorter one
    # is copied whole and the rest of ``padded`` keeps the fill value.
    kept = x[:max_len]
    padded[:len(kept)] = torch.tensor(kept, dtype=torch.long)
    return padded


class Dataset_Sentence_Concat(torch.utils.data.Dataset):
    """Dataset whose item ``i`` starts at sentence ``i`` and keeps appending the
    following sentences (wrapping to the first one) until more than
    ``args["sequence_length"]`` tokens are collected, then truncates to that
    length.

    Each item is an ``(input, target)`` pair of index tensors where the input
    is the target shifted right by one with id 2 placed in front (id 2 is
    '<sos>' in the vocabulary built by ``Tokenizer_VocabBuilder``).
    """

    def __init__(self, args, tokenizer, csv_path):
        self.args = args
        self.tokenizer = tokenizer
        self.csv_path = csv_path

        self.sentences = self.load_sentences()
        self.num_sentences = len(self.sentences)

        # Aliases of the tokenizer's vocabulary; Model reads ``uniq_words``.
        self.uniq_words = self.tokenizer.vocab_list
        self.word_to_index = self.tokenizer.token2idx

    def load_sentences(self):
        """Read the CSV and return the text column as a list of sentence strings.

        The column is joined with ' <eos> ' and split on '<eos>' again, so the
        marker only acts as a delimiter and the returned pieces do not contain
        it. The column name comes from the module-level ``COL_NAME``.
        """
        frame = pd.read_csv(self.csv_path)
        joined_text = frame[COL_NAME].str.cat(sep=' <eos> ')
        return joined_text.split('<eos>')

    def __len__(self):
        """Number of sentences, which is also the number of items."""
        return self.num_sentences

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensors for the window starting at ``index``."""
        seq_len = self.args["sequence_length"]

        collected = []
        cursor = index
        # NOTE(review): this loop only ends once enough tokens were gathered,
        # so it would spin forever if every sentence encoded to zero tokens.
        while len(collected) <= seq_len:
            collected += self.tokenizer.encode_sentence(self.sentences[cursor])
            cursor = cursor + 1

            # Wrap around to the first sentence after the last one.
            if cursor >= len(self):
                cursor = 0

        targets = collected[:seq_len]
        inputs = [2] + targets[:-1]

        return torch.tensor(inputs), torch.tensor(targets)
    

    
    
    
class Dataset_Sentence_Single(torch.utils.data.Dataset):
    """Dataset with one item per sentence, padded or truncated to a fixed length.

    Each item is an ``(input, target)`` pair of index tensors of length
    ``args["sequence_length"]``. The target is the encoded sentence followed
    by '<pad>' ids; the input is the target shifted right by one with id 2 in
    front (id 2 is '<sos>' in the vocabulary built by
    ``Tokenizer_VocabBuilder``).
    """

    def __init__(self, args, tokenizer, csv_path):
        self.args = args
        self.tokenizer = tokenizer
        self.csv_path = csv_path

        self.sentences = self.load_sentences()
        self.num_sentences = len(self.sentences)

        # Aliases of the tokenizer's vocabulary; Model reads ``uniq_words``.
        self.uniq_words = self.tokenizer.vocab_list
        self.word_to_index = self.tokenizer.token2idx

    def load_sentences(self):
        """Read the CSV and return the text column as a list of sentence strings.

        The column is joined with ' <eos> ' and split on '<eos>' again, so the
        marker only acts as a delimiter and the returned pieces do not contain
        it. The column name comes from the module-level ``COL_NAME``.
        """
        frame = pd.read_csv(self.csv_path)
        joined_text = frame[COL_NAME].str.cat(sep=' <eos> ')
        return joined_text.split('<eos>')

    def __len__(self):
        """Number of sentences, which is also the number of items."""
        return self.num_sentences

    def __getitem__(self, index):
        """Return the padded/truncated ``(input, target)`` tensors for one sentence."""
        seq_len = self.args["sequence_length"]
        encoded = list(self.tokenizer.encode_sentence(self.sentences[index]))

        pad_id = self.word_to_index['<pad>']
        missing = seq_len - len(encoded)
        if missing > 0:
            encoded = encoded + [pad_id] * missing

        # Sentences longer than the window are cut off on the right.
        targets = encoded[0:seq_len]
        inputs = [2] + targets[:-1]

        return torch.tensor(inputs), torch.tensor(targets)
    

    

    

# Neural Network

In [12]:
import torch
from torch import nn

class Model(nn.Module):
    """Language model: embedding -> 3-layer LSTM -> linear layer over the vocabulary.

    The vocabulary size is taken from ``dataset.uniq_words``.

    NOTE(review): ``nn.LSTM`` is created without ``batch_first=True``, so it
    treats dimension 0 of its input as time and dimension 1 as batch. The
    callers in this notebook appear to pass ``(batch, sequence)`` index
    tensors and size the state with the sequence length, which would make the
    recurrence run across the examples of a batch instead of along each
    sentence. Confirm whether that is intended; changing it requires updating
    ``init_state`` and every caller together.
    """

    def __init__(self, dataset):
        super().__init__()
        self.lstm_size = 512
        self.embedding_dim = 300
        self.num_layers = 3

        vocab_size = len(dataset.uniq_words)

        # Layers are created in this order: embedding, lstm, fc.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embedding_dim)
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.1,
        )
        self.fc = nn.Linear(self.lstm_size, vocab_size)

    def forward(self, x, prev_state):
        """Return ``(logits, state)`` for index tensor ``x`` and LSTM state ``prev_state``.

        ``logits`` has the shape of ``x`` with a trailing vocabulary dimension.
        """
        lstm_out, state = self.lstm(self.embedding(x), prev_state)
        return self.fc(lstm_out), state

    def init_state(self, sequence_length):
        """Return a zero ``(h, c)`` pair of shape ``(num_layers, sequence_length, lstm_size)``."""
        state_shape = (self.num_layers, sequence_length, self.lstm_size)
        return torch.zeros(state_shape), torch.zeros(state_shape)

    
    



# train, calc and predict functions

In [13]:
def calc_stereoset(model):
    """Score every StereoSet evaluation sentence and write the scores as JSON.

    Sentences are read line by line from ``EVALUATION_SENTENCES_PATH`` and
    paired with the id on the same line of ``EVALUATION_IDS_PATH``. Each
    sentence is encoded with the module-level ``tokenizer`` and prefixed with
    id 2 ('<sos>'). Its score is the geometric mean of the probabilities the
    model assigns to each token given the output at the previous position,
    i.e. ``2 ** mean(log2 p)``.

    The result is written to ``PREDICTION_SCORES_PATH`` as
    ``{'intersentence': [{'id': ..., 'score': ...}, ...], 'intrasentence': []}``.

    Compared with the earlier version, exactly one entry is written per sample
    id (the full-sentence score). Previously the score was recomputed and
    appended once per token position, so every id appeared several times with
    partial-prefix scores before its final score.
    NOTE(review): consumers that relied on the last entry per id see the same
    value; confirm nothing depended on the intermediate entries.

    Args:
        model: a trained ``Model``; it is switched to eval mode while scoring
            and back to train mode afterwards. Inputs are moved to the device
            of the model's parameters.
    """
    with open(EVALUATION_SENTENCES_PATH, 'r') as sentence_file:
        sentences = sentence_file.readlines()

    with open(EVALUATION_IDS_PATH, 'r') as id_file:
        ids = id_file.readlines()

    assert len(sentences) == len(ids)

    device = next(model.parameters()).device
    pred_start_pos = 1  # position 0 holds <sos>, which is never predicted
    out_list = []

    model.eval()
    # Inference only: without no_grad every forward pass keeps its autograd graph.
    with torch.no_grad():
        for line, raw_id in zip(sentences, ids):
            tokens = [2] + tokenizer.encode_sentence(line.strip())
            if len(tokens) <= pred_start_pos:
                # Nothing to score (empty line); as before, no entry is written.
                continue

            state_h, state_c = model.init_state(len(tokens))
            x = torch.tensor([tokens]).to(device)
            y_pred, _ = model(x, (state_h.to(device), state_c.to(device)))

            # Row i-1 of the output is the distribution used to predict token i.
            probs = torch.nn.functional.softmax(y_pred[0], dim=1).cpu().numpy()
            token_probs = [probs[i - 1][tokens[i]] for i in range(pred_start_pos, len(tokens))]

            score = np.sum([np.log2(p) for p in token_probs])
            score /= len(token_probs)
            score = np.power(2, score)

            out_list.append({
                'id': raw_id.strip(),
                # json cannot serialize every NumPy scalar type; store a plain float.
                'score': float(score),
            })

    model.train()

    out_dict = {
        'intersentence': out_list,
        'intrasentence': [],
    }

    with open(PREDICTION_SCORES_PATH, 'w') as outfile:
        json.dump(out_dict, outfile, indent = 2)

      
    
def calc_confidence(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and return ``(confidence_score, perplexity)``.

    ``confidence_score`` is always 0: the per-token confidence computation is
    disabled and only the placeholder is returned so callers keep receiving a
    pair. ``perplexity`` is the mean over batches of ``exp(cross-entropy of
    the batch)``. The loss is computed over every target position, padding
    included, because no ``ignore_index`` is set.

    The LSTM state is created from the width of the first batch (which equals
    the configured sequence length for the datasets in this notebook) and is
    carried from one batch to the next. Tensors are moved to the device of the
    model's parameters.

    Args:
        model: module providing ``init_state(n)`` and
            ``forward(x, state) -> (logits, state)``.
        dataloader: sized iterable of ``(x, y)`` index-tensor batches.

    Returns:
        ``(0, perplexity)``; ``perplexity`` is ``nan`` when the dataloader
        yields no batches.
    """
    print("Total number of batches = ", len(dataloader))

    device = next(model.parameters()).device
    criterion = nn.CrossEntropyLoss()
    batch_perplexities = []
    state = None

    model.eval()
    with torch.no_grad():
        for batch, (x, y) in enumerate(dataloader):
            print(datetime.datetime.now(), ":", 'batch :', batch)
            x = x.to(device)
            y = y.to(device)

            if state is None:
                state = tuple(s.to(device) for s in model.init_state(x.shape[1]))

            y_pred, state = model(x, state)

            # CrossEntropyLoss expects the class dimension second: (batch, vocab, seq).
            loss = criterion(y_pred.transpose(1, 2), y)
            batch_perplexities.append(torch.exp(loss).cpu())

    model.train()

    confidence_score = 0
    print("Confidence Score = ", confidence_score)

    if batch_perplexities:
        perplexity = float(sum(batch_perplexities) / len(batch_perplexities))
    else:
        perplexity = float('nan')
    print("Perplexity = ",  perplexity)
    return confidence_score, perplexity


def predict(model, text, next_words=100):
    """Sample ``next_words`` token ids after a prompt and print the decoded text.

    The prompt is encoded with the module-level ``tokenizer`` and prefixed
    with id 2 ('<sos>'). At every step the model is run on the prompt, the
    logits of the last position are turned into a distribution and one id is
    sampled from it with ``np.random.choice``. The model is switched to eval
    mode for sampling and back to train mode afterwards. Nothing is returned.

    NOTE(review): the sampled ids are appended to the printed output only.
    The tensor fed to the model is always the unchanged prompt; just the LSTM
    state carries over between steps. Confirm whether the samples were meant
    to be fed back into the model.
    """
    prompt_ids = [2] + tokenizer.encode_sentence(text)
    model.eval()
    generated = prompt_ids.copy()

    with torch.no_grad():
        state = tuple(part.to('cuda') for part in model.init_state(len(prompt_ids)))

        for _ in range(next_words):
            x = torch.tensor([prompt_ids]).to('cuda')
            y_pred, state = model(x, state)

            last_word_logits = y_pred[0][-1]
            probabilities = (
                torch.nn.functional.softmax(last_word_logits, dim=0)
                .detach()
                .cpu()
                .numpy()
            )
            sampled_id = np.random.choice(len(last_word_logits), p=probabilities)
            generated.append(sampled_id)

    model.train()
    print("generated sentence = ")
    print(tokenizer.decode_sentence(generated))

def _save_curve_figure(epochs, train_values, valid_values, metric_label, legend_suffix, file_name):
    """Plot train (dashed red) and validation (solid blue) values per epoch and save a PNG."""
    plt.clf()
    plt.plot(epochs, train_values, color='red', linestyle='dashed')
    plt.plot(epochs, valid_values, color='blue', linestyle='solid')

    plt.xlabel('Num Epoch')
    plt.ylabel(metric_label)
    plt.title(metric_label + ' VS Epoch')
    plt.legend(['Train ' + legend_suffix, 'Valid ' + legend_suffix], loc='upper right')
    plt.savefig(MODEL_OUTPUT_FOLDER + file_name, format='png', dpi=600)


def train(model, args, train_dataloader, valid_dataloader):
    """Train ``model`` and, after every epoch, evaluate, plot, log and checkpoint it.

    Epochs run from ``LOADED_EPOCH + 1`` up to ``args['max_epochs'] - 1``.
    After each epoch the function prints a generated sample, evaluates on both
    dataloaders with ``calc_confidence``, and writes into
    ``MODEL_OUTPUT_FOLDER``: ``Conf_Scores.png``, ``Perp_Scores.png``,
    ``Scores.csv`` and the pickled model as ``model_<epoch>``.

    Module-level state used: ``LOADED_EPOCH`` is read; ``epoch_list``,
    ``train_conf_list``, ``train_perp_list``, ``valid_conf_list`` and
    ``valid_perp_list`` are appended to in place.

    Args:
        model: the ``Model`` to optimise; batches and the LSTM state are moved
            to the device its parameters live on.
        args: dict providing 'sequence_length' and 'max_epochs'.
        train_dataloader: batches of ``(x, y)`` index tensors used for training.
        valid_dataloader: batches of ``(x, y)`` index tensors used for validation.
    """
    model.train()

    print("total number of steps needed for a single epoch = ", len(train_dataloader))

    device = next(model.parameters()).device
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    # scheduler.step() is called once per batch below, so the learning rate is
    # multiplied by 0.99 every 20 batches (not every 20 epochs).
    scheduler = StepLR(optimizer, step_size=20, gamma=0.99)

    for epoch in range(LOADED_EPOCH+1, args['max_epochs']):
        state_h, state_c = model.init_state(args['sequence_length'])
        state_h = state_h.to(device)
        state_c = state_c.to(device)

        for batch, (x, y) in enumerate(train_dataloader):
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))

            # The state is carried into the next batch; detaching stops
            # backpropagation from reaching back into earlier batches.
            state_h = state_h.detach()
            state_c = state_c.detach()

            # CrossEntropyLoss expects the class dimension second: (batch, vocab, seq).
            loss = criterion(y_pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()
            scheduler.step()

            # get_last_lr() reports the rate the scheduler actually set;
            # get_lr() is not meant to be called outside of step().
            print(datetime.datetime.now(), ":", { 'epoch': epoch, 'batch': batch, 'loss': loss.item(), 'lr':scheduler.get_last_lr()[0]})

        # end of epoch. generate figure and save model
        predict(model, text = "he knows that ")
        print(datetime.datetime.now(), ":", "Generating Confidence Scores: " + str(epoch))
        epoch_list.append(epoch)

        train_c, train_p = calc_confidence(model, train_dataloader) # takes 6x time than training
        train_conf_list.append(train_c)
        train_perp_list.append(train_p)

        valid_c, valid_p = calc_confidence(model, valid_dataloader)
        valid_conf_list.append(valid_c)
        valid_perp_list.append(valid_p)

        _save_curve_figure(epoch_list, train_conf_list, valid_conf_list, 'Conf_Score', 'Conf', "Conf_Scores.png")
        _save_curve_figure(epoch_list, train_perp_list, valid_perp_list, 'Perp', 'Perp', "Perp_Scores.png")

        print("FIGURES SAVED: " + str(epoch))

        # Scores.csv is rewritten in full each epoch from the accumulated lists.
        score_df = pd.DataFrame(columns = ['epoch', 'train_conf', 'train_perp', 'valid_conf', 'valid_perp'])
        score_df['epoch'] = epoch_list
        score_df['train_conf'] = train_conf_list
        score_df['train_perp'] = train_perp_list
        score_df['valid_conf'] = valid_conf_list
        score_df['valid_perp'] = valid_perp_list
        score_df.to_csv(MODEL_OUTPUT_FOLDER+"Scores.csv", index = False)

        # The whole module object is pickled (not just a state_dict), which is
        # what the checkpoint-loading cells of this notebook expect.
        torch.save(model, MODEL_OUTPUT_FOLDER + "model_"+str(epoch))
        print("MODEL SAVED: " + str(epoch))
        

        



In [14]:
# class Dataset_Merged_Shift(torch.utils.data.Dataset):
#     def __init__(self, args, tokenizer, csv_path):
        
#         self.args = args
#         self.tokenizer = tokenizer
#         self.csv_path = csv_path
        
#         self.sentences = self.load_sentences()
#         self.words = len(self.sentences)-self.args["sequence_length"]
        
#         self.uniq_words = self.tokenizer.vocab_list
#         #self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
#         self.word_to_index = self.tokenizer.token2idx

#         #self.words_indexes = [self.word_to_index[w] for w in self.words]
#         #self.words_indexes = self.tokenizer.encode_sentence(" ".join(self.words))
        
#     def load_sentences(self):
#         data_df = pd.read_csv(self.csv_path) # need to input this csv from init to create the test data set
#         text = data_df[COL_NAME].str.cat(sep=' <eos> ')
#         return text.split(' ') # this may be changed for niloys complex split function

    
#     def __len__(self):
# #         return len(self.words_indexes)//self.args["sequence_length"] - self.args["sequence_length"]
#         return int(self.words)


#     def __getitem__(self, index):
#         tokens = []
#         text = ' '.join(self.sentences[index : index+self.args["sequence_length"]])
#         tokens.extend(self.tokenizer.encode_sentence(text))

#         return (
#             torch.tensor([2]+tokens[:-1]),
#             torch.tensor(tokens),
#         )
    
      
        
# def calc_merged(model, dataloader, batch_start, num_batches):
#     print("Total number of batches = ", len(dataloader))
#     model.eval()
#     sm = 0
#     cnt = 0
        
#     state_h, state_c = model.init_state(args['sequence_length'])
#     state_h = state_h.to('cuda')
#     state_c = state_c.to('cuda')
    
#     perp = []
    
#     with torch.no_grad():
#         for batch, (x, y) in enumerate(dataloader):
#             if batch<batch_start :
#                 continue
#             if batch>batch_start+num_batches:
#                 break
            
#             print(datetime.datetime.now(), ":", 'batch :', batch)
#             x = x.to('cuda')
#             y = y.to('cuda')

#             y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            
#             #print(y_pred.size())
#             #print(y_pred.transpose(1, 2).size())
#             #print(y.size())
#             perp_cal = []
#             perp_cal.append(nn.CrossEntropyLoss()(y_pred.transpose(1, 2), y).detach().cpu())
#             perp.append(torch.exp(torch.stack(perp_cal).sum()/len(perp_cal)))
            
#             #y_pred = y_pred.to('cuda')
#             #state_h = state_h.to('cuda')
#             #state_c = state_c.to('cuda')

#             #state_h = state_h.detach()
#             #state_c = state_c.detach()

#         #         print(y_pred.shape)

# #             for idx in range(len(y_pred)):
# #                 joint_sentence_probability = []
# #                 for i in range(1, len(x[idx])):
# #                     p = torch.nn.functional.softmax(y_pred[idx][i-1], dim=0).detach().cpu().numpy()
# #                     joint_sentence_probability.append(p[x[idx][i]])
# #                     score = np.sum([np.log2(i) for i in joint_sentence_probability]) 
# #                     score /= len(joint_sentence_probability)
# #                     score = np.power(2, score)
# #                     sm+=score
# #                     cnt+=1
                
#     model.train()
    
# #     confidence_score = sm/cnt
#     confidence_score = 0
#     print("Confidence Score = ", confidence_score)
    
#     perplexity = float(sum(perp) / len(perp)) 
#     print("Perplexity = ",  perplexity)
#     #print("perplexity = ", torch.stack(perp).sum()/cnt)
#     #print(" actual perplexity = ", torch.exp(torch.stack(perp).sum()/cnt))
#     return confidence_score, perplexity        

# def train_merged(model, args, train_dataloader, valid_dataloader):
#     model.train()

#     print("total number of steps needed for a single epoch = ", len(train_dataloader))
    
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=0.0001)
# #     optimizer = optim.Adam(model.parameters(), lr=8.88262772170544e-07)
#     scheduler = StepLR(optimizer, step_size=20, gamma=0.99)
    
#     NUM_BATCHES = 446
#     MAX_EP = 30
    
#     for epoch in range(LOADED_EPOCH+1, MAX_EP):
#         state_h, state_c = model.init_state(args['sequence_length'])
#         state_h = state_h.to('cuda')
#         state_c = state_c.to('cuda')

#         for batch, (x, y) in enumerate(train_dataloader):
#             if batch < epoch*NUM_BATCHES:
#                 continue
#             if batch > (epoch+1)*NUM_BATCHES:
#                 break
            
#             x = x.to('cuda')
#             y = y.to('cuda')

#             optimizer.zero_grad()

#             y_pred, (state_h, state_c) = model(x, (state_h, state_c))
#             '''
#             y_pred = y_pred.to('cuda')
#             state_h = state_h.to('cuda')
#             state_c = state_c.to('cuda')
#             '''
#             state_h = state_h.detach()
#             state_c = state_c.detach()
            
#             loss = criterion(y_pred.transpose(1, 2), y).cuda()

#             loss.backward()
#             optimizer.step()
#             scheduler.step()
            
# #             del x
# #             del y
# #             del y_pred
# #             gc.collect()
            
#             #predict(model, text = " i know ")
            
#             #valid_score = calc_confidence(model, valid_dataloader)
            
#             print(datetime.datetime.now(), ":", { 'epoch': epoch, 'batch': batch, 'loss': loss.item(), 'lr':scheduler.get_lr()[0]})
          
#         # end of epoch. generate figure and save model 
#         predict(model, text = "he knows that ")
#         print(datetime.datetime.now(), ":", "Generating Confidence Scores: " + str(epoch))
#         epoch_list.append(epoch)
        
#         train_c, train_p = calc_merged(model, train_dataloader, epoch*NUM_BATCHES, NUM_BATCHES) # takes 6x time than training
#         train_conf_list.append(train_c)
#         train_perp_list.append(train_p)
        
#         valid_c, valid_p = calc_confidence(model, valid_dataloader) 
#         valid_conf_list.append(valid_c)
#         valid_perp_list.append(valid_p)
        
        
#         plt.clf()
#         plt.plot(epoch_list, train_conf_list, color='red', linestyle='dashed')
#         plt.plot(epoch_list, valid_conf_list, color='blue', linestyle='solid')

#         plt.xlabel('Num Epoch')
#         plt.ylabel('Conf_Score')
#         plt.title('Conf_Score VS Epoch')
#         plt.legend(['Train Conf', 'Valid Conf'], loc='upper right')
#         plt.savefig(MODEL_OUTPUT_FOLDER+"Conf_Scores.png" , format='png', dpi=600)
        
#         plt.clf()
#         plt.plot(epoch_list, train_perp_list, color='red', linestyle='dashed')
#         plt.plot(epoch_list, valid_perp_list, color='blue', linestyle='solid')

#         plt.xlabel('Num Epoch')
#         plt.ylabel('Perp')
#         plt.title('Perp VS Epoch')
#         plt.legend(['Train Perp', 'Valid Perp'], loc='upper right')
#         plt.savefig(MODEL_OUTPUT_FOLDER+"Perp_Scores.png" , format='png', dpi=600)
        
#         print("FIGURES SAVED: " + str(epoch))
        
        
        
#         score_df = pd.DataFrame(columns = ['epoch', 'train_conf', 'train_perp', 'valid_conf', 'valid_perp'])
#         score_df['epoch'] = epoch_list
#         score_df['train_conf'] = train_conf_list
#         score_df['train_perp'] = train_perp_list
#         score_df['valid_conf'] = valid_conf_list
#         score_df['valid_perp'] = valid_perp_list
#         score_df.to_csv(MODEL_OUTPUT_FOLDER+"Scores.csv", index = False)
        
        
#         torch.save(model, MODEL_OUTPUT_FOLDER + "model_"+str(epoch))
#         print("MODEL SAVED: " + str(epoch))
        
   


# Start Training

In [15]:
# Hyper-parameters shared by the datasets and the training loop.
args = {}
args["sequence_length"] = 45 # 40->tokenized, 45->encrypted
args["max_epochs"] = 50
args["batch_size"] = 700

# Leftover argparse sketch; it is a bare string literal, not executed code.
'''
parser.add_argument('--max-epochs', type=int, default=10)
parser.add_argument('--batch-size', type=int, default=256)
parser.add_argument('--sequence-length', type=int, default=4)
'''

# Training batches are shuffled (shuffle=True); validation batches are not.
train_dataset = Dataset_Sentence_Single(args, tokenizer, TRAIN_CSV_PATH)
train_dataloader = DataLoader(train_dataset, batch_size=args['batch_size'], num_workers = 6, shuffle=True)

valid_dataset = Dataset_Sentence_Single(args, tokenizer, VALID_CSV_PATH)
valid_dataloader = DataLoader(valid_dataset, batch_size=args['batch_size'], num_workers = 6, shuffle=False)


In [None]:
# New Model

# Fresh model; its vocabulary size comes from train_dataset.uniq_words.
model = Model(train_dataset)
model = model.to('cuda')

# Bookkeeping that train() uses as module-level globals: it starts at epoch
# LOADED_EPOCH+1 (so -1 means "start from epoch 0") and appends one value per
# epoch to each of the lists below.
LOADED_EPOCH = -1
epoch_list = []
train_conf_list = []
train_perp_list = []
valid_conf_list = []
valid_perp_list = []

train(model, args, train_dataloader, valid_dataloader) 
# train_merged(model, args, train_dataloader, valid_dataloader) 
#print(predict(dataset, model, text='Knock knock. Whos there?'))

'''
Confidence Score =  0.0071846294500849
perplexity =  tensor(268.4095)
FIGURE SAVED: 24
MODEL SAVED: 24
'''

total number of steps needed for a single epoch =  512




2020-09-03 22:04:41.062864 : {'epoch': 0, 'batch': 0, 'loss': 8.989461898803711, 'lr': 0.0001}
2020-09-03 22:04:41.533777 : {'epoch': 0, 'batch': 1, 'loss': 8.97198486328125, 'lr': 0.0001}
2020-09-03 22:04:41.981863 : {'epoch': 0, 'batch': 2, 'loss': 8.953956604003906, 'lr': 0.0001}
2020-09-03 22:04:42.498417 : {'epoch': 0, 'batch': 3, 'loss': 8.932896614074707, 'lr': 0.0001}
2020-09-03 22:04:42.936253 : {'epoch': 0, 'batch': 4, 'loss': 8.910138130187988, 'lr': 0.0001}
2020-09-03 22:04:43.405115 : {'epoch': 0, 'batch': 5, 'loss': 8.88172435760498, 'lr': 0.0001}
2020-09-03 22:04:43.837695 : {'epoch': 0, 'batch': 6, 'loss': 8.846268653869629, 'lr': 0.0001}
2020-09-03 22:04:44.274451 : {'epoch': 0, 'batch': 7, 'loss': 8.812833786010742, 'lr': 0.0001}
2020-09-03 22:04:44.708502 : {'epoch': 0, 'batch': 8, 'loss': 8.76875114440918, 'lr': 0.0001}
2020-09-03 22:04:45.143747 : {'epoch': 0, 'batch': 9, 'loss': 8.692233085632324, 'lr': 0.0001}
2020-09-03 22:04:45.579467 : {'epoch': 0, 'batch': 10

2020-09-03 22:05:17.137859 : {'epoch': 0, 'batch': 81, 'loss': 4.445968151092529, 'lr': 9.605960100000001e-05}
2020-09-03 22:05:17.606832 : {'epoch': 0, 'batch': 82, 'loss': 4.42917013168335, 'lr': 9.605960100000001e-05}
2020-09-03 22:05:18.081242 : {'epoch': 0, 'batch': 83, 'loss': 4.352742671966553, 'lr': 9.605960100000001e-05}
2020-09-03 22:05:18.570479 : {'epoch': 0, 'batch': 84, 'loss': 4.418341636657715, 'lr': 9.605960100000001e-05}
2020-09-03 22:05:19.040831 : {'epoch': 0, 'batch': 85, 'loss': 4.332901954650879, 'lr': 9.605960100000001e-05}
2020-09-03 22:05:19.491669 : {'epoch': 0, 'batch': 86, 'loss': 4.310606479644775, 'lr': 9.605960100000001e-05}
2020-09-03 22:05:19.929578 : {'epoch': 0, 'batch': 87, 'loss': 4.366903305053711, 'lr': 9.605960100000001e-05}
2020-09-03 22:05:20.367393 : {'epoch': 0, 'batch': 88, 'loss': 4.34684419631958, 'lr': 9.605960100000001e-05}
2020-09-03 22:05:20.802130 : {'epoch': 0, 'batch': 89, 'loss': 4.336323261260986, 'lr': 9.605960100000001e-05}
202

In [None]:
# # #Preload Model
# LOADED_EPOCH = 26

# MODEL_NAME = MODEL_OUTPUT_FOLDER + "model_"+str(LOADED_EPOCH)
# model = torch.load(MODEL_NAME)
# model = model.to('cuda')

# score_df = pd.read_csv(MODEL_OUTPUT_FOLDER+"Scores.csv")
# epoch_list = score_df['epoch'].values.tolist()
# train_conf_list = score_df['train_conf'].values.tolist()
# train_perp_list = score_df['train_perp'].values.tolist()
# valid_conf_list = score_df['valid_conf'].values.tolist()
# valid_perp_list = score_df['valid_perp'].values.tolist()

# train(model, args, train_dataloader, valid_dataloader)
# # train_merged(model, args, train_dataloader, valid_dataloader) 

# Delete Model from Memory

In [None]:
# Drop the notebook's reference to the model and run a garbage-collection pass.
# NOTE(review): torch.cuda.empty_cache() is not called here, so cached GPU
# memory may stay reserved by PyTorch - confirm whether that matters.
del model
gc.collect()

# Calculate prediction scores for Stereoset

In [None]:
# Load the checkpoint that train() saved after epoch 0. train() pickles the
# whole module with torch.save(model, ...), so torch.load returns a ready
# Model instance; the Model class must already be defined in this kernel.
# NOTE(review): newer PyTorch releases may default torch.load to
# weights_only=True, which would reject a fully pickled module - confirm
# against the installed version.
MODEL_NAME = MODEL_OUTPUT_FOLDER + "model_0"
model = torch.load(MODEL_NAME)
model = model.to('cuda')

In [None]:
calc_stereoset(model)

# Evaluate on StereoSet

In [None]:
# !git clone https://github.com/moinnadeem/StereoSet.git
# f = open('Stereoset_gender_intersentece_dev_data/sentences.txt', 'r')
# for line in f.readlines():
#   print(line)

In [None]:
!python StereoSet/code/evaluation.py --gold-file $DEV_DATA_PATH --predictions-file $PREDICTION_SCORES_PATH
!cp "results.json" $RESULTS_PATH

# Misc.

In [None]:
prompt = "She was confident in herself, but afraid to face the boys club in the industry. She started"

In [None]:
predict(model, prompt)

In [None]:
#??StepLR

In [None]:
# sent = "<sos> i know you. You are a lair. i know you. You are a lair. i know you. You are a lair"
# tokens = tokenizer.encode_sentence(sent)
# hypothesis = [0]*len(tokens)
# hypothesis[-1] = 2 # <sos> token
# '''
# while len(tokens)<args["sequence_length"]:
#   tokens = [0]+tokens # add initial pads
# tokens = [-args["sequence_length"]:]
# '''

# def calc():
#     model.eval()
#     joint_sentence_probability = []
#     state_h, state_c = model.init_state(len(tokens))
#     state_h = state_h.to('cuda')
#     state_c = state_c.to('cuda')
    
#     x = torch.tensor([tokens]).to('cuda')
    
#     y_pred, (state_h, state_c) = model(x, (state_h, state_c))
#     y_pred = y_pred.to('cuda')
#     state_h = state_h.to('cuda')
#     state_c = state_c.to('cuda')
#     #print(y_pred.size())
#     #return
    
#     for i in range(1, len(tokens)):
#       p = torch.nn.functional.softmax(y_pred[0][i-1], dim=0).cpu().detach().numpy()
#       joint_sentence_probability.append(p[tokens[i]])
#     score = np.sum([np.log2(i) for i in joint_sentence_probability]) 
#     score /= len(joint_sentence_probability)
#     score = np.power(2, score)
#     print(score)
# calc()


In [None]:
# sent = "<sos> i know you. You are a lair. i know you. You are a lair. i know you. You are a lair"
# tokens = tokenizer.encode_sentence(sent)
# print(tokens)

In [None]:
# !ls Stereoset_gender_intersentece_dev_data

In [None]:
# f = open('Stereoset_gender_intersentece_dev_data/sentences.txt', 'r')
# for line in f.readlines():
#   print(line)

In [None]:
# tokenizer.tokenized_sentences[43536] 

In [None]:
# count = 0
# for line in tokenizer.tokenized_sentences:
#     c = 0
#     line  = line.split()
#     for token in line:
#         if token.strip() == ".":
#             c += 1
#     if c > 1:
#         count += 1

        
# print(count)

In [None]:
# a = torch.tensor(124.341)
# print(a)
# print(float(a))