In [1]:
# initialization
import warnings
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

import sys, os
# NOTE(review): hard-coded absolute project directory; every relative path used
# later (./data, ./model, ./result) is resolved against it. os.path.join with a
# single argument returns that argument unchanged.
cur_path = os.path.join('/research/jujun/text_change')
os.chdir(cur_path)

import random, pickle
import numpy as np
from torch.nn import BCEWithLogitsLoss, BCELoss, MSELoss
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, \
                                f1_score, accuracy_score, precision_recall_fscore_support
# import tensorflow as tf
import torch
import pandas as pd
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
import torch.nn as nn
from torchinfo import summary
import torch.nn.functional as F
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils import shuffle
from itertools import cycle
from tqdm import tqdm
import time
import copy
import datetime
from sklearn.metrics import roc_auc_score
# from numba import cuda 

# from pynvml import *
def get_free_gpu():
    """Print a blank spacer; the GPU-memory report itself is switched off.

    The pynvml-based report (total / free / used memory of device 0, in MiB)
    is kept below as commented-out code, so the function currently only
    writes two newlines to stdout and returns ``None``.
    """
    print(end='\n\n')
    # Disabled NVML report:
    # nvmlInit()
    # h = nvmlDeviceGetHandleByIndex(0)
    # info = nvmlDeviceGetMemoryInfo(h)
    # print(f'total    : {info.total // 1024 ** 2}')
    # print(f'free     : {info.free// 1024 ** 2}')
    # print(f'used     : {info.used// 1024 ** 2}')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_pretrained_wordvector(sentences, tokenizer, bert_model, max_len=100):
    """Embed sentences as the mean of the model's last four hidden layers.

    Args:
        sentences: Iterable of texts; each one is encoded separately with
            ``tokenizer.encode_plus``.
        tokenizer: Tokenizer exposing the Hugging Face ``encode_plus`` API.
        bert_model: Model whose output holds the tuple of per-layer hidden
            states at index 2 (i.e. it was loaded with
            ``output_hidden_states=True``).
        max_len: Every sentence is padded / truncated to this many tokens.

    Returns:
        ``(token_embeddings, attention_masks)`` where ``token_embeddings`` is
        the element-wise mean of the last four hidden-state tensors (it stays
        on the model's device) and ``attention_masks`` is the concatenation of
        the tokenizer's masks, one row per sentence (left wherever the
        tokenizer created it; it is not moved).

    Raises:
        ValueError: If ``sentences`` is empty.
    """
    input_ids = []
    attention_masks = []

    for sent in sentences:
        # encode_plus tokenizes, adds [CLS]/[SEP], maps tokens to ids, and
        # pads/truncates to max_len. `padding`/`truncation` replace the
        # deprecated `pad_to_max_length` flag and make the truncation that was
        # previously only implicit (and warned about) explicit.
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    if not input_ids:
        raise ValueError("get_pretrained_wordvector received no sentences to embed")

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Run on whatever device the model lives on instead of relying on a
    # module-level `device` variable being defined by another cell.
    try:
        model_device = next(bert_model.parameters()).device
    except (StopIteration, AttributeError):
        model_device = torch.device('cpu')

    bert_model.eval()
    with torch.no_grad():
        outputs = bert_model(input_ids.to(model_device),
                             attention_mask=attention_masks.to(model_device))
        hidden_states = outputs[2]

    # Stack the last four layers on a new leading axis and average them away.
    token_embeddings = torch.stack(hidden_states[-4:], dim=0).mean(dim=0)

    return token_embeddings, attention_masks

In [3]:
def _fit_paragraph_count(embeddings, para_len):
    """Zero-pad or truncate ``embeddings`` along dim 0 to exactly ``para_len`` rows."""
    missing = para_len - embeddings.size(0)
    if missing > 0:
        # F.pad lists pad widths from the last dimension backwards, so the
        # final (0, missing) pair appends `missing` all-zero rows on dim 0.
        return F.pad(input=embeddings, pad=(0, 0, 0, 0, 0, missing))
    if missing < 0:
        return embeddings[0:para_len]
    return embeddings


def _embed_year_paragraphs(paragraphs, tokenizer, bert_model, para_len, wrd_len):
    """Embed one year's paragraphs, zero out padded tokens, fix the paragraph count."""
    token_embeddings, masks = get_pretrained_wordvector(
        paragraphs, tokenizer, bert_model, max_len=wrd_len)
    # Multiply by the attention mask so positions the tokenizer marked as
    # padding contribute all-zero vectors; the mask follows the embeddings'
    # device rather than a module-level `device` variable.
    masks = masks.unsqueeze(-1).to(token_embeddings.device)
    return _fit_paragraph_count(token_embeddings * masks, para_len)


def get_text_embedding(cik, fyear, fyear_bf, tokenizer, bert_model, para_map, para_len, wrd_len=100):
    """Build the paragraph-embedding tensors for a firm's current and prior year.

    Args:
        cik: Key into ``para_map`` selecting the firm.
        fyear: Year whose paragraphs form the first output.
        fyear_bf: Year whose paragraphs form the second output.
        tokenizer, bert_model: Forwarded to ``get_pretrained_wordvector``.
        para_map: Mapping ``cik -> {year: paragraphs}``; each ``paragraphs``
            value must be something ``pd.Series`` accepts
            (presumably a list of strings -- verify against the pickle).
        para_len: Number of paragraphs in each output (zero-padded or
            truncated along dim 0).
        wrd_len: Token length passed to the embedder as ``max_len``.

    Returns:
        ``(token_embeddings, token_embeddings_bf)``, each with ``para_len``
        rows along dim 0.
    """
    # Flatten {year: paragraphs} into one long frame of (fyear, pid, text).
    df = pd.concat({k: pd.Series(v) for k, v in para_map[cik].items()})
    df = df.reset_index()
    df.columns = ['fyear', 'pid', 'text']

    paragraphs = df[df.fyear == fyear].text.values
    paragraphs_bf = df[df.fyear == fyear_bf].text.values

    token_embeddings = _embed_year_paragraphs(
        paragraphs, tokenizer, bert_model, para_len, wrd_len)
    token_embeddings_bf = _embed_year_paragraphs(
        paragraphs_bf, tokenizer, bert_model, para_len, wrd_len)

    return token_embeddings, token_embeddings_bf


In [10]:
# define model
class simple_siamese(nn.Module):
    """Siamese CNN that compares two paragraph-embedding tensors.

    Both inputs go through the same ``conv1`` stack; the pairwise Euclidean
    distance (``torch.cdist``) between the two branch outputs is fed to
    ``conv2`` and then to a two-layer linear head.

    ``config`` must provide ``emb_dim``, ``wrd_len``, ``num_filters``,
    ``kernel_sizes``, ``kernel_sizes2``, ``kernel_sizes3``, ``dropout_rate``,
    ``num_classes`` and ``test_mode``. If it also provides ``para_len`` the
    input width of ``fc1`` is derived from it; otherwise the historical
    value 2112 is used.
    """

    # Flattened conv2 output (16 * 11 * 12) for para_len=32 with the default kernels.
    _LEGACY_FLAT_FEATURES = 2112

    def __init__(self, config):
        super().__init__()

        self.emb_dim = config.emb_dim
        self.wrd_len = config.wrd_len
        self.num_filters = config.num_filters
        self.kernel_sizes = config.kernel_sizes
        self.kernel_sizes2 = config.kernel_sizes2
        self.kernel_sizes3 = config.kernel_sizes3
        self.dropout_rate = config.dropout_rate
        self.num_classes = config.num_classes
        self.test_mode = config.test_mode

        # Shared branch: input is (batch, emb_dim, paragraphs, words).
        self.conv1 = nn.Sequential(
            nn.Conv2d(self.emb_dim, 128, kernel_size=self.kernel_sizes),
            nn.Conv2d(128, 64, kernel_size=self.kernel_sizes2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d((1, 3), padding=0),
        )
        # Comparison stack applied to the distance maps of the two branches.
        self.conv2 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=self.kernel_sizes2),
            nn.Conv2d(32, 16, kernel_size=self.kernel_sizes3),
            nn.ReLU(inplace=True),
            nn.MaxPool2d((2, 2), padding=0),
        )

        linear_size = 64
        self.fc1 = nn.Linear(self._flat_features(config), linear_size)
        # num_classes may be stored as a float in the config, hence int().
        self.fc2 = nn.Linear(linear_size, int(self.num_classes))
        # Not used in forward(); kept so existing checkpoints still load.
        self.norl = nn.BatchNorm1d(linear_size)
        self.dropout = nn.Dropout(p=self.dropout_rate)

    def _flat_features(self, config):
        """Return the number of features conv2 produces per sample.

        Pushes one all-zero sample through conv1 -> cdist -> conv2 so the
        size follows ``para_len``, ``wrd_len`` and the kernel sizes instead of
        being hard-coded. Falls back to 2112 when the config has no
        ``para_len``.
        """
        para_len = getattr(config, 'para_len', None)
        if para_len is None:
            return self._LEGACY_FLAT_FEATURES
        with torch.no_grad():
            probe = torch.zeros(1, self.emb_dim, int(para_len), int(self.wrd_len))
            branch = self.conv1(probe)
            fused = self.conv2(torch.cdist(branch, branch, p=2))
        return fused[0].numel()

    def forward(self, input1, input2):
        """Score a pair of inputs.

        Args:
            input1, input2: Tensors laid out as
                (batch, paragraphs, words, emb_dim); they are permuted to
                channels-first before the convolutions.

        Returns:
            ``(logit, x1, x2)``: the classifier output and the flattened
            ``conv1`` features of each branch.
        """
        # Move the embedding axis to the channel position expected by Conv2d.
        x1 = torch.permute(input1, (0, 3, 1, 2))
        x2 = torch.permute(input2, (0, 3, 1, 2))

        x1 = self.conv1(x1)
        x2 = self.conv1(x2)
        if self.test_mode:
            print('---conv1 output---')
            print(x1.size())
            print(x2.size())

        # Euclidean distance between every pair of rows of the two branches.
        x = torch.cdist(x1, x2, p=2)

        x = self.conv2(x)
        if self.test_mode:
            print('---conv2 output---')
            print(x.size())

        x = torch.reshape(x, (x.size()[0], -1))
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        logit = self.fc2(x)

        x1 = torch.reshape(x1, (x1.size()[0], -1))
        x2 = torch.reshape(x2, (x2.size()[0], -1))

        return logit, x1, x2


In [11]:
# NOTE(review): `global` at module scope has no effect, and neither name is
# assigned or read in the cells visible here -- this cell looks like a leftover.
global __pred_probs
global __labels_bools

In [14]:
class config:
    """Hyper-parameters and shared data handle for the siamese model."""

    def __init__(self):
        self.emb_dim = 768          # size of each token embedding
        self.wrd_len = 64  # 100    # tokens kept per paragraph
        self.para_len = 32  # 60    # paragraphs kept per document
        self.num_filters = 128
        self.kernel_sizes = (1, 10)
        self.kernel_sizes2 = (5, 3)  # (2,2)
        self.kernel_sizes3 = (3, 3)
        self.dropout_rate = 0.2
        self.num_classes = 2.0      # stored as a float; the model casts it with int()
        self.num_labels = 2
        self.batch_size = 64
        self.para_map = None        # filled in later through set_parm_map()
        self.class_weight = 1
        self.test_mode = False      # True makes the model print intermediate shapes

    def set_parm_map(self, para_map):
        """Attach the paragraph lookup used to build text embeddings."""
        self.para_map = para_map

    @staticmethod
    def test_model(model, model_config):
        """Print the available device and display a torchinfo summary of ``model``.

        The summary is built for two inputs of shape
        ``(batch_size, para_len, wrd_len, emb_dim)``. Relies on ``summary``
        (torchinfo) and the notebook's ``display`` being in scope.
        """
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            # Device 1 was hard-coded before and crashed on single-GPU hosts;
            # keep preferring it, but fall back to device 0 when it is absent.
            gpu_id = 1 if gpu_count > 1 else 0
            torch.cuda.set_device(gpu_id)
            print('There are %d GPU(s) available.' % gpu_count)
            print('We will use the GPU:', torch.cuda.get_device_name(gpu_id))
            print(torch.cuda.current_device())
        else:
            print('No GPU available, using the CPU instead.')

        input_shape = (model_config.batch_size, model_config.para_len,
                       model_config.wrd_len, model_config.emb_dim)
        display(summary(model, [input_shape, input_shape]))


# Smoke test: build the default configuration and an untrained model, then show
# its layer summary. `siamese_config` is reused by the training cell below.
siamese_config = config()
model = simple_siamese(siamese_config)
config.test_model(model=model, model_config=siamese_config)


There are 2 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1080 Ti
1


Layer (type:depth-idx)                   Output Shape              Param #
simple_siamese                           [64, 2]                   128
├─Sequential: 1-1                        [64, 64, 28, 17]          --
│    └─Conv2d: 2-1                       [64, 128, 32, 55]         983,168
│    └─Conv2d: 2-2                       [64, 64, 28, 53]          122,944
│    └─ReLU: 2-3                         [64, 64, 28, 53]          --
│    └─MaxPool2d: 2-4                    [64, 64, 28, 17]          --
├─Sequential: 1-2                        [64, 64, 28, 17]          (recursive)
│    └─Conv2d: 2-5                       [64, 128, 32, 55]         (recursive)
│    └─Conv2d: 2-6                       [64, 64, 28, 53]          (recursive)
│    └─ReLU: 2-7                         [64, 64, 28, 53]          --
│    └─MaxPool2d: 2-8                    [64, 64, 28, 17]          --
├─Sequential: 1-3                        [64, 16, 11, 12]          --
│    └─Conv2d: 2-9                       [64, 3

In [15]:
def model_eval(model, validation_dataloader, num_labels, class_weight=None):
    """Run ``model`` over a validation loader and collect predictions and loss.

    Relies on module-level ``device``, ``tokenizer``, ``bert_model``,
    ``para_map``, ``para_len``, ``wrd_len`` and ``set_ct_loss`` being defined
    by the driver cell. The caller is responsible for ``model.eval()``.

    Args:
        model: Callable returning ``(logits, x1, x2)`` for a pair of batches.
        validation_dataloader: Iterable of ``(keys, labels)`` batches, where
            each key row is ``(cik, fyear, fyear_bf)`` and labels are one-hot.
        num_labels: Unused; kept for interface compatibility.
        class_weight: Optional ``pos_weight`` for ``BCEWithLogitsLoss``.

    Returns:
        ``(pred_labels, true_labels, avg_val_loss)``: row-wise softmax of the
        logits, the labels as given, and the loss averaged over batches.

    Raises:
        ValueError: If an embedding has the wrong paragraph count or the
            loader yields no batches.
    """
    true_labels = []
    pred_labels = []
    total_eval_loss = 0

    # The loss modules do not depend on the batch, so build them once.
    if class_weight is not None:
        pos_weight = torch.as_tensor(class_weight, dtype=torch.float, device=device)
        loss_func = BCEWithLogitsLoss(pos_weight=pos_weight)
    else:
        loss_func = BCEWithLogitsLoss()
    ct_loss = nn.CrossEntropyLoss()
    cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)

    for batch in validation_dataloader:
        b_input_key = batch[0]
        b_labels = batch[1].to(device)

        # Convert every (cik, fyear, fyear_bf) key into its pair of embeddings.
        tk_batch = []
        tk_batch_bf = []
        for t in b_input_key.detach().to('cpu').numpy():
            tk, tk_bf = get_text_embedding(t[0], t[1], t[2], tokenizer, bert_model,
                                           para_map, para_len, wrd_len=wrd_len)
            if tk.size()[0] != para_len:
                # Skipping the sample would misalign inputs and labels, so fail loudly.
                raise ValueError('token size error')
            tk_batch.append(tk)
            tk_batch_bf.append(tk_bf)

        tk_batch = torch.stack(tk_batch).to(device)
        tk_batch_bf = torch.stack(tk_batch_bf).to(device)

        with torch.no_grad():
            logits, x1, x2 = model(tk_batch, tk_batch_bf)
            # Drop the references to the large input tensors early.
            del tk_batch, tk_batch_bf

            target = b_labels.type_as(logits)  # float labels for the BCE loss
            val_loss = loss_func(logits, target)
            if set_ct_loss:
                # NOTE(review): `sim` has a single column, so this cross-entropy
                # term appears to be constant -- TODO confirm the intended loss.
                sim = cos_sim(x1, x2).reshape(-1, 1)
                val_loss = val_loss - ct_loss(
                    sim, torch.argmax(b_labels, axis=1).type_as(sim).reshape(-1, 1))

            total_eval_loss += val_loss.item()

            pred_labels.append(torch.softmax(logits, dim=1).to('cpu').numpy())
            true_labels.append(b_labels.to('cpu').numpy())

    if not pred_labels:
        raise ValueError("validation_dataloader yielded no batches")

    # Flatten the per-batch outputs into single arrays.
    pred_labels = np.vstack(pred_labels)
    true_labels = np.vstack(true_labels)

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    return pred_labels, true_labels, avg_val_loss

In [16]:
def train_model(model, config,  train_dataloader, validation_dataloader, model_path,\
                             optimizer=None, scheduler=None, epochs = 10, \
                             class_weight = None, patience = 5, verbose = True):
    """Train ``model`` with early stopping on the validation F1 of class 1.

    Relies on module-level ``device``, ``tokenizer``, ``bert_model``,
    ``set_ct_loss``, ``val_label_save`` and ``val_true_label_save`` being
    defined by the driver cell, and publishes the last epoch's training
    labels / predictions through the globals ``lista`` / ``listb``.

    Args:
        model: Network returning ``(logits, x1, x2)`` for a pair of batches.
        config: Object providing ``para_len``, ``wrd_len``, ``para_map``,
            ``num_labels`` and ``class_weight``.
        train_dataloader, validation_dataloader: Iterables of
            ``(keys, one_hot_labels)`` batches; each key row is
            ``(cik, fyear, fyear_bf)``.
        model_path: Where the best ``state_dict`` is saved.
        optimizer: Defaults to Adam with ``lr=1e-5`` over ``model.parameters()``.
        scheduler: Optional; stepped after every batch.
        epochs: Maximum number of epochs.
        class_weight: ``pos_weight`` for the BCE loss; when ``None`` the value
            from ``config.class_weight`` is used.
        patience: Stop after this many consecutive epochs without a new best F1.
        verbose: Print logits, labels and predictions for every batch.

    Returns:
        ``(model, training_stats)`` where ``training_stats`` holds one dict
        per completed epoch. ``model`` carries the weights of the last epoch;
        the best weights are the ones saved at ``model_path``.
    """
    global lista, listb

    seed_val = 1234

    para_len = config.para_len
    wrd_len = config.wrd_len
    para_map = config.para_map
    num_labels = config.num_labels
    # An explicitly passed weight wins; otherwise use the configured one
    # (previously the argument was silently overwritten).
    if class_weight is None:
        class_weight = config.class_weight

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    training_stats = []

    best_score = -0.5
    best_epoch = 0
    cnt = 0  # consecutive epochs without improvement

    total_t0 = time.time()

    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    # The loss modules do not depend on the batch, so build them once.
    if class_weight is not None:
        pos_weight = torch.as_tensor(class_weight, dtype=torch.float, device=device)
        loss_func = BCEWithLogitsLoss(pos_weight=pos_weight)
    else:
        loss_func = BCEWithLogitsLoss()
    ct_loss = nn.CrossEntropyLoss()
    cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)

    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0
        model.train()

        train_true_labels = []
        train_pred_labels = []
        for step, batch in enumerate(train_dataloader):
            # batch[0]: rows of (cik, fyear, fyear_bf); batch[1]: one-hot labels
            b_input_key = batch[0]
            b_labels = batch[1].to(device)

            # Convert every key into its pair of embeddings.
            tk_batch = []
            tk_batch_bf = []
            for t in b_input_key.detach().to('cpu').numpy():
                tk, tk_bf = get_text_embedding(t[0], t[1], t[2], tokenizer, bert_model,
                                               para_map, para_len, wrd_len=wrd_len)
                if tk.size()[0] != para_len:
                    # Skipping the sample would misalign inputs and labels.
                    raise ValueError('token size error')
                tk_batch.append(tk)
                tk_batch_bf.append(tk_bf)

            tk_batch = torch.stack(tk_batch).to(device)
            tk_batch_bf = torch.stack(tk_batch_bf).to(device)

            logits, x1, x2 = model(tk_batch, tk_batch_bf)
            # Drop the references to the large input tensors early.
            del tk_batch, tk_batch_bf

            target = b_labels.type_as(logits)  # float labels for the BCE loss

            # Always track predictions so the epoch accuracy does not depend
            # on whether per-batch printing is enabled.
            train_pred_bools = torch.argmax(logits, dim=1).to('cpu').numpy()
            train_true_bools = torch.argmax(target, dim=1).to('cpu').numpy()
            train_true_labels += train_true_bools.tolist()
            train_pred_labels += train_pred_bools.tolist()

            if verbose:
                print("logits: ", logits)
                print("b_labels.type_as(logits): ", target)
                print("train_pred_bools", train_pred_bools)
                print("train_true_bools", train_true_bools)

            loss = loss_func(logits, target)
            if set_ct_loss:
                # NOTE(review): `sim` has a single column, so this cross-entropy
                # term appears to be constant -- TODO confirm the intended loss.
                sim = cos_sim(x1, x2).reshape(-1, 1)
                loss = loss - ct_loss(
                    sim, torch.argmax(b_labels, axis=1).type_as(sim).reshape(-1, 1))

            total_train_loss += loss.item()

            model.zero_grad()
            loss.backward()
            optimizer.step()

            # Update the learning rate.
            if scheduler is not None:
                scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)

        training_time = time.time() - t0
        print("Total training_time took {0:.2f} minutes ".format(training_time/60))

        # Expose the raw lists for inspection from other cells.
        lista = train_true_labels
        listb = train_pred_labels
        train_true_labels = np.array(train_true_labels)
        train_pred_labels = np.array(train_pred_labels)
        print('training acc', (train_true_labels == train_pred_labels).sum(), len(train_true_labels))
        train_acc = (train_true_labels == train_pred_labels).sum() / len(train_true_labels)

        # ========================================
        #               Validation
        # ========================================
        print("")
        print("Running Validation...")

        t1 = time.time()

        # Evaluation mode: dropout layers behave differently.
        model.eval()

        pred_labels, true_labels, avg_val_loss = model_eval(
            model, validation_dataloader, num_labels, class_weight=class_weight)

        val_label_save.append(pred_labels)
        val_true_label_save.append(true_labels)

        pred_bools = np.argmax(pred_labels, axis=1)
        true_bools = np.argmax(true_labels, axis=1)

        # labels=[0, 1] keeps index 1 valid even when a class is missing.
        val_f1 = f1_score(true_bools, pred_bools, labels=[0, 1], average=None)[1] * 100
        val_acc = (pred_bools == true_bools).astype(int).sum() / len(pred_bools)
        val_auc = roc_auc_score(true_bools, pred_labels[:, 1])

        print("Epoch {0}\t Train Loss: {1:.4f}\t  Train ACC: {2:.4f}\t Val Loss {3:.4f}\t Val Acc: {4:.4f}\t Val F1: {5:.4f}\t Val AUC: {6:.4f}".\
            format(epoch_i + 1, avg_train_loss, train_acc, avg_val_loss, val_acc, val_f1, val_auc))

        validation_time = time.time() - t1
        print("Total val_time took {0:.2f} minutes ".format(validation_time/60))

        # Update the best score BEFORE recording the epoch, so the stats (and
        # training_stats[-1]["Best epoch"]) include the current epoch.
        improved = val_f1 > best_score
        if improved:
            best_score = val_f1
            best_epoch = epoch_i + 1
            torch.save(model.state_dict(), model_path)
            print("model saved")
            cnt = 0
        else:
            cnt += 1

        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': val_acc,   # previously held the F1 score by mistake
            'Valid. F1': val_f1,
            'Valid. AUC': val_auc,
            'Best F1': best_score,
            'Best epoch': best_epoch
        })

        # early stopping
        if not improved and cnt == patience:
            print("\n")
            print("early stopping at epoch {0}".format(epoch_i + 1))
            break

        print("")
        print("Total training took {0:.2f} minutes".format((time.time()-total_t0)/60))

    return model, training_stats

In [18]:
if __name__ == "__main__":
    # Driver: load the paragraph data, embed it with BERT on the fly, and run
    # a 3-fold cross-validation of the siamese model for every label column.
    # The names assigned here (device, tokenizer, bert_model, para_map,
    # para_len, wrd_len, set_ct_loss, val_label_save, ...) are module globals
    # that model_eval / train_model read.

    # Prefer GPU 1 (as in the original runs) when it exists, otherwise the
    # only GPU, otherwise the CPU.
    if torch.cuda.is_available():
        gpu_id = 1 if torch.cuda.device_count() > 1 else 0
        torch.cuda.set_device(gpu_id)
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(gpu_id))
        print(torch.cuda.current_device())
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # load data
    # NOTE(review): pickle.load can execute arbitrary code -- only open files
    # from a trusted source.
    with open("/research/rliu/fraud/data/mda/paragraphs_1994_2016.pkl", "rb") as para_file:
        para_map = pickle.load(para_file)
    pos_neg_pair = pd.read_csv('./data/pos_neg_pair.csv')
    pos_neg_pair = pos_neg_pair.dropna()

    # Fixed seed so the sampled negatives are the same on every run.
    sample_seed = 1234

    imbalance = False
    if not imbalance:
        # First 100 fraud rows plus an equally sized random sample of non-fraud rows.
        pos_index = pos_neg_pair[pos_neg_pair.fraud == 1].index[0:100]
        neg_index = pos_neg_pair[pos_neg_pair.fraud == 0].sample(
            len(pos_index), random_state=sample_seed).index
        df = pos_neg_pair.loc[neg_index.append(pos_index), :]
        print(df.shape)
    else:
        pos_cik = list(set(pos_neg_pair[pos_neg_pair.fraud == 1].cik))
        pos_cik_lookup = set(pos_cik)
        neg_cik = list(set(pos_neg_pair[pos_neg_pair.fraud == 0].cik))
        neg_cik = [c for c in neg_cik if c not in pos_cik_lookup]
        neg_cik = random.Random(sample_seed).sample(neg_cik, len(pos_cik))
        df = pos_neg_pair[pos_neg_pair.cik.isin(pos_cik[0:10] + neg_cik[0:10])]
        print(df.shape)
    print('successfully load data ...')

    emb_dim = siamese_config.emb_dim
    wrd_len = siamese_config.wrd_len
    para_len = siamese_config.para_len
    num_filters = siamese_config.num_filters
    kernel_sizes = siamese_config.kernel_sizes
    kernel_sizes2 = siamese_config.kernel_sizes2
    kernel_sizes3 = siamese_config.kernel_sizes3
    dropout_rate = siamese_config.dropout_rate
    num_classes = siamese_config.num_classes
    batch_size = siamese_config.batch_size

    siamese_config.set_parm_map(para_map)
    para_map = siamese_config.para_map
    class_weight = siamese_config.class_weight

    set_ct_loss = False
    result = []
    val_label_save = []
    val_true_label_save = []
    label_cols = ['fraud']

    # torch.save / pickle.dump below fail if these folders are missing.
    os.makedirs("./model", exist_ok=True)
    os.makedirs("./result", exist_ok=True)

    #embedding
    print('Loading BERT tokenizer...')
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    bert_model = AutoModel.from_pretrained(
        # 'ProsusAI/finbert',
        'bert-base-uncased',
        # 'yiyanghkust/finbert-pretrain',
        num_labels = 2,
        output_attentions = False, # Whether the model returns attentions weights.
        output_hidden_states = True, # Whether the model returns all hidden-states.
        )
    # Follow the selected device so the CPU fallback above actually works.
    bert_model.to(device)

    for col in label_cols:
        print("\n------------")
        print(col)
        print("------------")

        y = df[col].astype(int).values
        x_key = df[['cik', 'fyear', 'fyear_bf']].values

        fold = 0

        skf = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)

        for train_index, test_index in skf.split(x_key, y):

            print("\nfold {} \n".format(fold))

            fold += 1
            X_train, X_test = x_key[train_index], x_key[test_index]
            X_train = torch.tensor(X_train)
            X_test = torch.tensor(X_test)

            Y_train, Y_test = y[train_index], y[test_index]
            print('train fraud', sum(Y_train),'test fraud', sum(Y_test))

            # One-hot encode the labels (column 1 is the fraud class).
            Y_train = pd.get_dummies(Y_train).values
            Y_train = torch.tensor(Y_train)

            Y_test = pd.get_dummies(Y_test).values
            Y_test = torch.tensor(Y_test)

            train_dataset = TensorDataset(X_train, Y_train)
            val_dataset = TensorDataset(X_test, Y_test)

            train_dataloader = DataLoader(
                train_dataset,  # The training samples.
                sampler=RandomSampler(train_dataset),  # Select batches randomly
                batch_size=batch_size  # Trains with this batch size.
            )

            validation_dataloader = DataLoader(
                val_dataset,  # The validation samples.
                # Evaluation order does not matter, so iterate deterministically
                # (a random sampler here also perturbed the global RNG).
                sampler=SequentialSampler(val_dataset),
                batch_size=batch_size  # Evaluate with this batch size.
            )

            if class_weight is not None:
                # Per-sample weights; not consumed below, kept for later cells.
                train_sample_weight = np.array(
                    [class_weight if i[1] == 1 else 1 for i in Y_train])
                test_sample_weight = np.array(
                    [class_weight if i[1] == 1 else 1 for i in Y_test])

            model_name = "./model/simple_siamese_" + str(fold)
            model = simple_siamese(siamese_config)
            model.to(device)

            model, training_stats = train_model(model, siamese_config, train_dataloader, validation_dataloader, \
                                                            model_path = model_name, class_weight = class_weight,\
                                                            optimizer=None, scheduler=None, epochs = 20)

            print("load the best model ... ")

            # map_location keeps the checkpoint loadable on the current device.
            model.load_state_dict(torch.load(model_name, map_location=device))

            # show performance of best model
            model.eval()
            pred_labels, true_labels, avg_val_loss = model_eval(model, \
                                                    validation_dataloader, num_classes, class_weight = class_weight)

            pred_bools = np.argmax(pred_labels, axis = 1)
            print("np.argmax for pred_labels", pred_labels)
            true_bools = np.argmax(true_labels, axis = 1)
            print("np.argmax for true_labels", true_labels)

            # labels=[0, 1] keeps index 1 valid even if a class is missing.
            p, r, f, _ = precision_recall_fscore_support(true_bools, pred_bools, labels = [0, 1], pos_label = 1)
            val_acc = (pred_bools == true_bools).astype(int).sum()/len(pred_bools)
            val_auc = roc_auc_score(true_bools, pred_labels[:,1])

            print('Precision: {0:.4f}, Recall: {1:.4f}, F1: {2:.4f}, Loss: {3:.4f}, AUC: {4:.4f}'.format(p[1], r[1], f[1], avg_val_loss, val_auc))
            print(classification_report(true_bools, pred_bools) )

            result.append([col, fold, p[1], r[1], f[1], val_acc, val_auc,training_stats[-1]["Best epoch"]])
            # Re-written after every fold so partial results survive a crash.
            with open("./result/simple_siamese.pkl", "wb") as fp:   #Pickling
                pickle.dump(result, fp)

            torch.cuda.empty_cache()
            get_free_gpu()
    print('=== finish  === ')

There are 2 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1080 Ti
1
(200, 5)
successfully load data ...
Loading BERT tokenizer...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 


------------
fraud
------------

fold 0 

train fraud 67 test fraud 33

Training...
logits:  tensor([[ 0.0990, -0.2323],
        [ 0.0474, -0.2948],
        [ 0.1105, -0.1732],
        [ 0.0398, -0.1067],
        [ 0.0486, -0.1090],
        [ 0.0487, -0.0928],
        [ 0.0345, -0.1114],
        [ 0.0971, -0.1416],
        [ 0.1183, -0.2894],
        [ 0.0715, -0.2682],
        [ 0.0533, -0.0965],
        [ 0.0224, -0.2900],
        [ 0.0426, -0.0986],
        [ 0.0508, -0.0896],
        [ 0.0831, -0.1854],
        [ 0.0946, -0.1689],
        [ 0.1213, -0.2546],
        [ 0.1003, -0.2841],
        [ 0.0862, -0.1819],
        [ 0.0678, -0.2281],
        [ 0.0308, -0.1988],
        [ 0.2231, -0.1647],
        [ 0.0631, -0.1540],
        [ 0.0459, -0.1330],
        [ 0.1070, -0.1203],
        [ 0.0598, -0.2135],
        [ 0.0478, -0.1036],
        [ 0.0489, -0.0906],
        [ 0.0471, -0.0930],
        [ 0.0723, -0.1969],
        [ 0.0334, -0.1891],
        [-0.0058, -0.3112],
        [ 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


logits:  tensor([[-0.0556, -0.1258],
        [-0.0933, -0.0981],
        [-0.0806, -0.0910],
        [ 0.0089, -0.0902],
        [ 0.0120, -0.0590],
        [-0.0836, -0.0691],
        [-0.0801, -0.1523],
        [-0.0796, -0.1188],
        [-0.0381, -0.1481],
        [-0.0392, -0.0800],
        [-0.0224, -0.0622],
        [-0.0740, -0.0422],
        [-0.0737, -0.1280],
        [-0.0914, -0.0640],
        [-0.0859, -0.0727],
        [-0.1215, -0.1582],
        [-0.0495, -0.1215],
        [-0.1221, -0.0829],
        [-0.0243, -0.1587],
        [-0.1162, -0.0831],
        [-0.0756, -0.1310],
        [-0.0705, -0.0942],
        [-0.0485, -0.0994],
        [-0.0630, -0.0862],
        [-0.0898, -0.0640],
        [-0.0396, -0.0949],
        [-0.0950, -0.0690],
        [-0.0614, -0.1988],
        [-0.1021, -0.0639],
        [ 0.0068, -0.1377],
        [-0.0987, -0.0710],
        [-0.0506, -0.0989],
        [-0.0923, -0.0753],
        [-0.0739, -0.0785],
        [-0.0772, -0.0602],
        [-0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# NOTE(review): the saved output of this cell is a NameError (`lista` was not
# defined when it ran), yet the cells below use `lista` successfully. The
# notebook therefore depends on out-of-order execution / leftover kernel state.
lista

NameError: name 'lista' is not defined

In [None]:
len(lista)

67

In [None]:
len(listb)

67

In [None]:
# Fraction of positions at which the two lists hold equal values.
agreement = np.asarray(lista) == np.asarray(listb)
agreement.sum() / len(listb)

0.5074626865671642

In [None]:
len(temp)

128

In [None]:
# Scratch check of torch.softmax on a hand-written 2x2 tensor: with dim=1 the
# normalisation runs across each row, so every output row sums to 1.
torch.softmax(torch.tensor([[1.2,1.8],[0.4,0.7]]),dim=1)

tensor([[0.3543, 0.6457],
        [0.4256, 0.5744]])

In [None]:
len(val_true_label_save)

18

In [None]:
val_label_save[0].shape

(200, 2)

In [None]:
# Row-wise sum over the columns of the first saved validation array
# (shape (200, 2) per the cell above). The displayed sums are all ~1.0,
# presumably because the rows are softmax probabilities -- TODO confirm.
b = val_label_save[0].sum(axis=1)
b.shape

(200,)

In [None]:
b

array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.99999994, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.99999994, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.99999994,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.99999994, 0.99999994,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.     

In [None]:
# Column 1 of the sixth-from-last entry in val_label_save
# (presumably the positive-class score -- TODO confirm against the training loop).
a = val_label_save[-6][:,1]

In [None]:
a[a>0.5]

array([0.5232506 , 0.52386016, 0.5344516 , 0.53550726, 0.5339036 ,
       0.5286611 , 0.54368126, 0.52959067, 0.53475684, 0.5256853 ,
       0.5267108 , 0.5245599 , 0.5256226 , 0.53362614, 0.5177412 ,
       0.5290738 , 0.529948  , 0.52800107, 0.535685  , 0.5296777 ,
       0.5218666 , 0.5270513 , 0.5201257 , 0.53249747, 0.52939105,
       0.5444209 , 0.52398705, 0.52389795, 0.52814114, 0.5319519 ,
       0.542003  , 0.52385616, 0.5255244 , 0.54056376, 0.53032464,
       0.5200995 , 0.5266097 , 0.5445139 , 0.5436612 , 0.53760165,
       0.52690154, 0.5336137 , 0.52661014, 0.5175091 , 0.5360116 ,
       0.5146469 , 0.5444209 , 0.52182204, 0.52531785, 0.53059375,
       0.52379906, 0.5237532 , 0.52738565, 0.537102  , 0.5298776 ,
       0.5313965 , 0.5444209 , 0.5305427 , 0.53344834, 0.5347967 ,
       0.5347793 , 0.53182554, 0.52058893, 0.5248482 , 0.5445116 ,
       0.53187764, 0.52167225, 0.5279042 , 0.5444209 , 0.52309513,
       0.5233589 , 0.526377  , 0.52767414, 0.53171754, 0.52782

In [None]:
# load data
# Read the pickled paragraph map through a context manager so the file handle
# is closed deterministically instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open("/research/rliu/fraud/data/mda/paragraphs_1994_2016.pkl", "rb") as para_file:
    para_map = pickle.load(para_file)

# Read the pair table and drop every row that contains a missing value.
pos_neg_pair = pd.read_csv('./data/pos_neg_pair.csv').dropna()


In [None]:
# Number of fraud-labelled rows per cik, most frequent first.
pos_neg_pair.loc[pos_neg_pair["fraud"] == 1, "cik"].value_counts()

75208      12
803014     10
849547     10
6284       10
859475     10
           ..
928395      1
932112      1
18498       1
947431      1
1604028     1
Name: cik, Length: 328, dtype: int64

In [None]:
# Show every row belonging to one example cik.
pos_neg_pair.loc[pos_neg_pair["cik"].eq(1604028)]

Unnamed: 0,cik,fyear,count_para,fraud,fyear_bf
150242,1604028,2015,183,1.0,2014.0
150243,1604028,2016,152,0.0,2015.0


In [None]:
# Distinct cik values that appear with fraud == 1 and with fraud == 0.
# A cik can land in both lists here; the overlap is removed in a later cell.
pos_cik = list(set(pos_neg_pair.loc[pos_neg_pair["fraud"] == 1, "cik"]))
neg_cik = list(set(pos_neg_pair.loc[pos_neg_pair["fraud"] == 0, "cik"]))

In [None]:
# Keep only ciks that never appear with a fraud label. Membership is tested
# against a set so each lookup is O(1) instead of a scan over the pos_cik list;
# the surviving elements and their order are unchanged.
pos_cik_lookup = set(pos_cik)
neg_cik = [c for c in neg_cik if c not in pos_cik_lookup]

# Down-sample the negatives to the same number of ciks as the positives.
# random.sample raises ValueError if fewer candidates remain than len(pos_cik).
# NOTE(review): no seed is set in this cell, so the drawn sample differs from
# run to run unless `random` is seeded elsewhere in the notebook -- confirm.
neg_cik = random.sample(neg_cik, len(pos_cik))

In [None]:
print(len(neg_cik), len(pos_cik))

328 328


In [None]:
len(pos_cik + neg_cik)

656

In [None]:

print('successfully load data ...')
# Restrict the pair table to rows whose cik is in the balanced positive/negative selection.
selected_cik = pos_cik + neg_cik
df = pos_neg_pair.loc[pos_neg_pair["cik"].isin(selected_cik)]
print(df.shape)

successfully load data ...
(6293, 5)


In [None]:
len(set(df.cik))

656

In [None]:
# NOTE(review): the saved output of this cell is a NameError -- `my_label_1`
# was not defined in the kernel when it ran. Leftover debugging cell.
my_label_1

NameError: name 'my_label_1' is not defined

In [None]:
# Tabulate the per-fold metrics collected in `result`; the column order follows
# the order in which the training loop appends values to each row.
result_columns = ['label', 'fold', 'precison', 'recall', 'f1', 'acc', 'auc', 'best_epoch']
result_tb = pd.DataFrame(result, columns=result_columns)

In [None]:
torch.tensor([2,3])

tensor([2, 3])

In [None]:
# Relative weight of the positive class (index 1) versus the negative class (index 0).
pos_weight = 10
# Build the per-class weight vector as floating point: torch.tensor([1, 10])
# would infer int64, and nn.CrossEntropyLoss requires its `weight` tensor to be
# a floating-point tensor matching the logits' dtype.
weights = torch.tensor([1, pos_weight], dtype=torch.float).to(device)

In [None]:
# Cross-entropy loss that rescales each class's contribution by the matching
# entry of `weights` (built in the preceding cell as [1, pos_weight]).
ct_loss = nn.CrossEntropyLoss(weight = weights)

In [None]:
# NOTE(review): exact duplicate of the preceding cell; running it only rebuilds
# the same weighted cross-entropy loss object.
ct_loss = nn.CrossEntropyLoss(weight = weights)

In [None]:
my_label

tensor([[1, 0],
        [0, 1],
        [1, 0],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [1, 0],
        [1, 0],
        [0, 1],
        [1, 0],
        [1, 0]], device='cuda:1', dtype=torch.uint8)