In [1]:
from transformers import BertTokenizer, BertModel

import numpy as np
import pandas as pd
#from transformers import BertTokenizer, BertModel
import torch


from tqdm.notebook import tqdm


from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import f1_score
import random
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from pycm import *

import math
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

import time
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
# transform the dataset
from imblearn.over_sampling import SMOTE
from sklearn.metrics import matthews_corrcoef

In [2]:
# Load the CSV that all later cells work from into MAIN_DF.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# cannot be re-run on another machine without editing this line.
# 'utf-8-sig' decodes UTF-8 and strips a leading byte-order mark if one is present.
MAIN_DF = pd.read_csv("D:\\Mohammad Dehghan\\NAICS\\Data\\Matched CAN Results from _XZ\\Can Matched Large Bert Embeddings 221115\\ca_matched_bert_embed_df_221114.csv",
                                encoding='utf-8-sig')

In [3]:
################################################################################
# 
# Preprocessing function
# Cleans the NAICS column, extracts the year and the month from
# the date column, and filters out the applications that were filed before 2017.

# Args:
#   DF (dataframe): The main dataset with variables such as file number, firm size, file dating,
#                     and NAICS code

# Returns: 
#  MAIN_DF (dataframe): A dataframe in which the NAICS column has been cleaned and date column 
#                       has been splitted into two columns YEAR and MONTH.
################################################################################

def PREPROCESSING(DF, COL=None, DATE=None):
    """Clean the NAICS column, derive YEAR/MONTH and keep rows dated after 2016.

    Args:
        DF (dataframe): The main dataset; it must contain the NAICS column and
            the date column named by COL and DATE.
        COL (str, optional): Name of the NAICS column to clean. When omitted the
            name is asked for interactively, as before.
        DATE (str, optional): Name of the date column. When omitted the name is
            asked for interactively, as before.

    Returns:
        dataframe: A new frame (DF itself is not modified) in which the NAICS
        column holds non-zero codes as strings of at most 6 characters, YEAR and
        MONTH columns have been added, only rows with YEAR > 2016 remain, and
        the index has been reset.
    """
    if COL is None:
        COL = input("Insert the name of the NAICS column to be cleaned: ")

    # Remove the placeholder codes (all zeros / all blanks) and missing codes.
    CLEANED_DF = DF[DF[COL] != "000000"]
    CLEANED_DF = CLEANED_DF[CLEANED_DF[COL] != "      "]
    # Explicit copies: the column assignments below must write to a frame we own,
    # not to a filtered view of the caller's data (avoids SettingWithCopyWarning).
    CLEANED_DF = CLEANED_DF.dropna(subset=[COL]).copy()

    # Going through int normalises the representation and lets us drop codes
    # that are numerically zero; afterwards keep at most the first 6 characters.
    CLEANED_DF[COL] = CLEANED_DF[COL].astype(int)
    CLEANED_DF = CLEANED_DF[CLEANED_DF[COL] != 0].copy()
    CLEANED_DF[COL] = CLEANED_DF[COL].astype(str).str[:6]

    if DATE is None:
        DATE = input("Insert the name of the date column: ")

    # Parse the dates once and reuse the result for both derived columns.
    PARSED_DATES = pd.DatetimeIndex(CLEANED_DF[DATE])
    CLEANED_DF["YEAR"] = PARSED_DATES.year
    CLEANED_DF["MONTH"] = PARSED_DATES.month

    CLEANED_DF = CLEANED_DF[CLEANED_DF['YEAR'] > 2016]
    return CLEANED_DF.reset_index(drop=True)

In [19]:
################################################################################
# 
# Aggregating function
# Sums the BERT embedding vectors that come from the observations in each application.
# Here in our case, the large based BERT embedding has 1024 elements (X1,...,X1024).
# Args:
#   DF (dataframe): A cleaned dataset which is the output of the preprocessing function.
# Returns: 
#  DF_ (dataframe): An aggregated dataframe based on embedding vectors.
################################################################################

def AGGREGATE(DF, FILE_NUMBER=None, N_DIMS=1024):
    """Sum the embedding columns X1..X<N_DIMS> over all rows sharing a file number.

    Args:
        DF (dataframe): A cleaned dataset containing the file-number column and
            the embedding columns "X1" ... "X<N_DIMS>".
        FILE_NUMBER (str, optional): Name of the file-number column to group by.
            When omitted the name is asked for interactively, as before.
        N_DIMS (int, optional): Number of embedding columns. Defaults to 1024.

    Returns:
        dataframe: One row per file number. The embedding columns hold the
        per-file sums; every other column holds the value from the last row of
        that file number in DF. The index is NOT reset.
    """
    if FILE_NUMBER is None:
        FILE_NUMBER = input("Insert the name of the unique file number column: ")

    col_x = ["X" + str(i) for i in range(1, N_DIMS + 1)]

    # Per-file sums of every embedding component.
    DF_GROUP = DF.groupby(FILE_NUMBER)[col_x].sum().reset_index()

    # Attach the sums to the non-embedding columns, then keep a single row per
    # file number (the last one encountered).
    DF_NOT_X = DF.drop(col_x, axis=1)
    DF_GROUP_MERGE = DF_NOT_X.merge(DF_GROUP, on=[FILE_NUMBER], how='left')
    DF_ = DF_GROUP_MERGE.drop_duplicates([FILE_NUMBER], keep='last')

    return DF_

In [14]:
################################################################################
# 
# Extract function
# Extracts the applications associated with NAICS codes that have more than 10 applications.
# This is because the SMOTE() oversampler does not work
# for low-frequency NAICS classes. It also ranks the NAICS codes by descending number of
# applications and labels them as NAICS_RANK (0 = most frequent).

# Args:
#   DF (dataframe): An aggregated dataframe, per NAICS per application

# Returns: 
#  CLEANED_DF_UPD_JOIN (dataframe): A dataframe in which low-frequent NAICS codes are excluded
#  and some main variables have been put on first columns of the dataframe.

################################################################################


def EXTRACT(DF, COL=None, MIN_FILINGS=10):
    """Keep only frequent NAICS codes and attach their frequency rank.

    Args:
        DF (dataframe): An aggregated dataframe containing the NAICS column as
            well as YEAR and MONTH columns.
        COL (str, optional): Name of the NAICS column to count. When omitted the
            name is asked for interactively, as before.
        MIN_FILINGS (int, optional): A code is kept only when it occurs in
            strictly MORE than this many rows. Defaults to 10.

    Returns:
        dataframe: The rows of DF whose code is frequent enough, with two added
        columns: NUM_FILINGS (rows per code) and NAICS_RANK (0 for the most
        frequent code, increasing as the frequency decreases). NAICS_RANK,
        NUM_FILINGS, YEAR and MONTH are moved to column positions 1-4.
    """
    if COL is None:
        COL = input("Insert the name of the NAICS column to be counted for ranking and filtering: ")

    # One row per code with its number of filings, most frequent first.
    NAICS_CODE_SUMMARY = (
        DF.assign(NUM_FILINGS=DF.groupby([COL])[COL].transform('count'))
        .loc[:, [COL, 'NUM_FILINGS']]
        .reset_index(drop=True)
        .sort_values(by=['NUM_FILINGS'], ascending=False)
        .drop_duplicates(subset=[COL])
    )
    NAICS_CODE_SUMMARY = NAICS_CODE_SUMMARY[NAICS_CODE_SUMMARY['NUM_FILINGS'] > MIN_FILINGS]
    NAICS_CODE_SUMMARY = NAICS_CODE_SUMMARY.reset_index(drop=True)
    # After the reset the positional index IS the frequency rank.
    NAICS_CODE_SUMMARY['NAICS_RANK'] = NAICS_CODE_SUMMARY.index

    # Drop the rows of rare codes, then attach count and rank to what is left.
    CLEANED_DF_UPD = DF[DF[COL].isin(NAICS_CODE_SUMMARY[COL])].reset_index(drop=True)
    CLEANED_DF_UPD_JOIN = CLEANED_DF_UPD.merge(NAICS_CODE_SUMMARY, on=[COL], how='left')

    # Move the main variables to the front, one at a time and in this order
    # (each column is removed and re-inserted at its target position).
    for LOC, NAME in enumerate(['NAICS_RANK', 'NUM_FILINGS', 'YEAR', 'MONTH'], start=1):
        MOVED_COLUMN = CLEANED_DF_UPD_JOIN.pop(NAME)
        CLEANED_DF_UPD_JOIN.insert(loc=LOC, column=NAME, value=MOVED_COLUMN)

    return CLEANED_DF_UPD_JOIN
    
    

In [26]:
################################################################################
# 
# Making input variable function for Multi Layer Perceptron (MLP)
# Makes the input variable for MLP, based on the aggregated BERT embedding vector and some 
# categorical variables concatenated to the embedding vector.

# Args:
#   DF (dataframe): A dataframe with aggregated embedding vector and some 
# categorical variables such as te firm size, year, .... 

# Returns: 
#  X (A numpy array): A vector of the size of embedding vector plus some dummy variables
# concatenated to that. 
######################################################################################


def MLP_INPUT(DF, DUMMY_LIST=None, N_DIMS=1024):
    """Build the MLP input matrix: dummy-encoded variables followed by the embedding.

    Args:
        DF (dataframe): A dataframe with the aggregated embedding columns
            "X1" ... "X<N_DIMS>" and the categorical variables to encode.
        DUMMY_LIST (list of str, optional): Names of the columns to dummy-encode.
            When omitted the names are asked for interactively until 'no' is
            entered, as before. Pass [] for "no dummy variables".
        N_DIMS (int, optional): Number of embedding columns. Defaults to 1024.

    Returns:
        X (A numpy array): float32 array with one row per row of DF. Columns are
        the dummies of the LAST listed variable first, ..., the dummies of the
        first listed variable, and finally the N_DIMS embedding values.
    """
    col_x = ["X" + str(i) for i in range(1, N_DIMS + 1)]
    # Straight conversion; no detour through Python lists and torch tensors.
    EMBEDDINGS = DF[col_x].to_numpy(dtype=np.float32)

    if DUMMY_LIST is None:
        DUMMY_LIST = []
        while True:
            var = input("Enter a variable to be dummied, If not press 'no': ")
            if var == "no":
                break
            DUMMY_LIST.append(var)

    # Rows are combined by POSITION. The previous merge on an 'index' column
    # matched DF's index labels against 0..n-1 and produced NaN / misaligned
    # rows whenever DF did not carry a fresh RangeIndex.
    # reversed(): each new variable used to be merged in FRONT of what had been
    # built so far, so the last variable ends up left-most.
    BLOCKS = [
        pd.get_dummies(DF[var]).to_numpy(dtype=np.float32)
        for var in reversed(DUMMY_LIST)
    ]
    BLOCKS.append(EMBEDDINGS)

    X = np.hstack(BLOCKS)
    return X

In [7]:
################################################################################
# 
# Making output variable function for Multi Layer Perceptron (MLP)
# Makes the output variable for MLP, based on the target variable (NAICS_RANK)in our dataframe. 

# Args:
#   DF (dataframe): A dataframe with a target variable (NAICS_RANK in our case).  

# Returns: 
#  Y5 (A numpy array): A vector of the size of the number of applications in our dataframe.
######################################################################################

def MLP_OUTPUT(DF, TARGET=None):
    """Extract the target variable as a 1-D integer numpy array.

    Args:
        DF (dataframe): A dataframe with a target variable (NAICS_RANK in our case).
        TARGET (str, optional): Name of the target column. When omitted the name
            is asked for interactively, as before.

    Returns:
        Y5 (A numpy array): One integer label per row of DF. Values are first
        cast to int32 in the dataframe.
    """
    if TARGET is None:
        TARGET = input("Enter the name of target variable: ")

    # Going through a plain Python list lets numpy choose its default integer
    # dtype for the result, exactly as the previous implementation did.
    Y5 = np.array(DF[TARGET].astype('int32').tolist())

    return Y5



In [8]:
#################################################################

# Creates the structure of a Multi Layer Perceptron with 2 hidden layers,
# each followed by a ReLU activation function.
# The input layer size is the length of the input vector created by the MLP_INPUT function.
# The output layer size is the number of classes in the target created by the MLP_OUTPUT function.
# The size of the first hidden layer is set when the model is built, and the size of the
# second one is 128.
###################################################################

class Net(nn.Module):
    """Feed-forward classifier: input -> hidden_size -> 128 -> output5_size.

    Both hidden layers are followed by a ReLU; the final layer returns raw,
    un-normalised class scores.
    """

    def __init__(self, input_size, hidden_size,  output5_size):
        super().__init__()
        second_hidden_size = 128

        # Sub-modules are created in the same order as before so that
        # parameter initialisation draws from the RNG in the same sequence.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, second_hidden_size)
        self.fc7 = nn.Linear(second_hidden_size, output5_size)

    def forward(self, x):
        """Return the class scores for the batch x."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc7(hidden)
    

In [9]:
###################################################################

# Builds a model based on the NET class (MLP).

# Args:
#      X (A numpy array): The input vector of the MLP which is the BERT embedding vector 
#                         concatenated with some categorical variables.

#      Y5 (A numpy array): The output vector of the MLP which is the target variable (NAICS_RANK)
#                          for our classification task.
# return: 
#         model (an MLP): A deep neural network
#######################################################################

def BUILD_MODEL(X, Y5, nb_features=256):
    """Instantiate a Net sized for the given inputs and labels.

    Args:
        X (A numpy array): 2-D input matrix; its number of columns becomes the
            input layer size.
        Y5 (A numpy array): Integer class labels used to size the output layer.
        nb_features (int, optional): Size of the first hidden layer.
            Defaults to 256.

    Returns:
        model (an MLP): A freshly initialised (untrained) Net.
    """
    # Read the width from the shape rather than from row 1, which does not
    # exist when X holds a single row.
    Q = np.shape(X)[1]

    # One output per distinct label; if the labels are not the contiguous range
    # 0..K-1, widen the layer so that the largest label is still a valid class
    # index for the cross-entropy loss.
    LABELS = np.unique(Y5)
    D5 = len(LABELS)
    if D5 and int(LABELS.max()) + 1 > D5:
        D5 = int(LABELS.max()) + 1

    model = Net(Q, nb_features, D5)

    return model

In [10]:
#################################################################

# Evaluate the loss function in the last layer of MLP which is the cross entropy loss function.
# 
# Args: 
#   y_pred (A numpy array): The predicted target variable.
#   y_true (A numpy Array): The actual target variable.
#   log_vars (A tensor): A learnable log-variance term; the cross entropy is scaled by
#                        exp(-log_vars) and log_vars itself is added to the result.

# Return:
#        loss (A number): The value of a loss function 
###################################################################

def criterion(y_pred, y_true, log_vars):
    """Cross-entropy loss weighted by a log-variance term.

    Computes exp(-log_vars) * CE(y_pred, y_true) + log_vars, where CE is the
    mean cross entropy over the batch and y_true is cast to int64 first.

    Args:
        y_pred: Raw class scores, one row per sample.
        y_true: Class indices, one per sample.
        log_vars: Tensor holding the log-variance term.

    Returns:
        The loss as a tensor shaped like log_vars.
    """
    cross_entropy = nn.functional.cross_entropy(y_pred, y_true.long())
    return torch.exp(-log_vars) * cross_entropy + log_vars
#############################################################

# Shuffles the dataset
# Args:
#     X (a numpy array): A vector
#     Y5 (a numpy array): A vector
# Returns:
#     x[s] (a numpy array): A vector
#     Y5[s] (a numpy array): A vector

# Note that the vector X and Y5 should have the same length.
###############################################################

def shuffle_data(X,Y5):
    """Return X and Y5 reordered by one shared random permutation.

    The permutation is drawn from numpy's global random state, so rows of X
    stay paired with their entries in Y5.
    """
    order = np.arange(X.shape[0])
    np.random.shuffle(order)  # in-place shuffle of the row positions
    shuffled_inputs = X[order]
    shuffled_targets = Y5[order]
    return shuffled_inputs, shuffled_targets




###################################################################

# Evaluates the loss function as well as the predicted target variable.

# Args:
#      X (A numpy array): The input vector of the MLP which is the BERT embedding vector 
#                         concatenated with some categorical variables.

#      Y5 (A numpy array): The output vector of the MLP which is the target variable (NAICS_RANK)
#                          for our classification task.
# return: 
#         loss_val_avg (a number): The value of the loss function
#         predictions5 (a numpy array): the predicted target variable
#         true_vals5 (a numpy array): The true target variable
#######################################################################

def evaluate(X, Y5):
    """Score the global `model` on (X, Y5) without updating it.

    Uses the notebook-level globals `model`, `batch_size` and `log_var_e`.
    The model is switched to eval mode and left in that mode.

    Args:
        X (A numpy array): Input matrix, one row per sample.
        Y5 (A numpy array): Integer class labels, aligned with X.

    Returns:
        loss_val_avg (a number): Mean per-sample loss over ALL samples.
        predictions5 (a numpy array): Raw class scores, one row per sample.
        true_vals5 (a numpy array): The labels, in the same order.

    Raises:
        ValueError: If X is empty.
    """
    model.eval()

    n_samples = len(X)
    if n_samples == 0:
        raise ValueError("evaluate() needs at least one sample")

    loss_val_total = 0.0
    predictions5, true_vals5 = [], []

    # Step through X in chunks of batch_size. The final chunk may be shorter;
    # it is scored too (it used to be silently dropped).
    for start in range(0, n_samples, batch_size):
        inp = torch.from_numpy(X[start:start + batch_size])
        target5 = torch.from_numpy(Y5[start:start + batch_size])

        with torch.no_grad():
            out = model(inp)
            loss = criterion(out, target5, log_var_e)

        # criterion averages over the batch, so weight by the batch length to
        # obtain a true per-sample mean once divided by n_samples.
        loss_val_total += loss.item() * len(inp)

        predictions5.append(out.detach().numpy())
        true_vals5.append(target5.numpy())

    loss_val_avg = loss_val_total / n_samples
    predictions5 = np.concatenate(predictions5, axis=0)
    true_vals5 = np.concatenate(true_vals5, axis=0)

    return loss_val_avg, predictions5,  true_vals5




##################################################################

# Evaluates the f1 score between two arrays.

# Args:
#      preds (A numpy array): the predicted target variable in which each element shows
#                           a vector that has as the same length as the number of classes
#                           and each element of the vector shows the probabilty of 
#                           an applicatio belonging to a class. 
#                           

#      labels (A numpy array): The true target variable which shows the groundtruth label
#                           (true class) of each application                     

# return: f1 score between two numpy arrays. 
#         
#######################################################################

def f1_score_func(preds, labels):
    """Weighted F1 between the arg-max class of each row of preds and labels."""
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    score = f1_score(true_classes, predicted_classes, average='weighted')
    return score







##################################################################

# Evaluates the Balance accuracy score between two arrays.

# Args:
#      preds (A numpy array): the predicted target variable in which each element shows
#                           a vector that has as the same length as the number of classes
#                           and each element of the vector shows the probabilty of 
#                           an applicatio belonging to a class. 
#                           

#      labels (A numpy array): The true target variable which shows the groundtruth label
#                           (true class) of each application                     

# return: Balance accuracy score between two numpy arrays. 
#         
#######################################################################

def BALANCED_ACCURACY(preds, labels):
    """Balanced accuracy between the arg-max class of each row of preds and labels."""
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    score = balanced_accuracy_score(true_classes, predicted_classes)
    return score



In [53]:
# Calculates the f1 score for each epoch for which the model is being trained.

# Args:
#   X_TRAIN (a numpy array): The training part of the input vector
#   Y5_TRAIN (a numpy array): The training part of the output vector
#   X_TEST (a numpy array): The testing part of the input vector
#   Y5_TEST (a numpy array): The testing part of the output vector
#   X_VAL (a numpy array): The validation part of the input vector
#   Y5_VAL (a numpy array): The validation part of the output vector

# Returns:
#         (a number): The weighted f1 score on the test set after the last epoch

##############################################################################

def inner_cv_test(X_TRAIN,Y5_TRAIN,X_TEST,Y5_TEST,X_VAL,Y5_VAL):
    """Train the global `model` for `nb_epoch` epochs, reporting F1 after each one.

    Uses the notebook-level globals `model`, `optimizer`, `nb_epoch`,
    `batch_size` and `log_var_e`. After every epoch the weighted F1 on the
    validation and test sets is printed.

    Args:
        X_TRAIN (a numpy array): The training part of the input vector
        Y5_TRAIN (a numpy array): The training part of the output vector
        X_TEST (a numpy array): The testing part of the input vector
        Y5_TEST (a numpy array): The testing part of the output vector
        X_VAL (a numpy array): The validation part of the input vector
        Y5_VAL (a numpy array): The validation part of the output vector

    Returns:
        (a number): The weighted F1 score on the test set after the LAST epoch,
        or NaN when nb_epoch is 0.
    """
    N = len(X_TRAIN)
    test_f1_5_list = np.zeros(nb_epoch)

    for i in range(nb_epoch):
        # evaluate() leaves the model in eval mode, so switch back to training
        # mode before every epoch.
        model.train()

        X,  Y5 = shuffle_data(X_TRAIN, Y5_TRAIN)

        # Full batches only: the N % batch_size trailing samples of this
        # epoch's shuffle are skipped.
        for j in range(N//batch_size):
            optimizer.zero_grad()

            inp = torch.from_numpy(X[(j*batch_size):((j+1)*batch_size)])
            target5 = torch.from_numpy(Y5[(j*batch_size):((j+1)*batch_size)])

            out = model(inp)
            loss = criterion(out, target5, log_var_e)
            loss.backward()
            optimizer.step()

        # Validation metrics for this epoch.
        _, predictions5, true_vals5 = evaluate(X_VAL, Y5_VAL)
        val_f1_5 = f1_score_func(predictions5, true_vals5)

        # Test metrics for this epoch.
        _, predictions5_test, true_test5 = evaluate(X_TEST, Y5_TEST)
        test_f1_5 = f1_score_func(predictions5_test, true_test5)
        test_f1_5_list[i] = test_f1_5

        print("Epoch: ", i+1)

        print("F1 Score (Weighted) for NAICS in validation set: ", val_f1_5)
        print("F1 Score (Weighted) for NAICS in test set: ", test_f1_5)

    # Index from the end instead of through the loop variable, which is
    # unbound when the loop body never runs.
    if len(test_f1_5_list) == 0:
        return float("nan")
    return test_f1_5_list[-1]






######################################################################

# Splits the data into train, validation and test parts, oversamples the train+validation
# part with SMOTE() in order to obtain a balanced dataset, and then calls the inner_cv_test()
# function to train the model

# Args:
#      X (A numpy array): The input vector of the MLP which is the BERT embedding vector 
#                         concatenated with some categorical variables.

#      Y5 (A numpy array): The output vector of the MLP which is the target variable (NAICS_RANK)
#                          for our classification task.
# Returns: 
#         return_vec (a number): The weighted f1 score on the test set after the last epoch.

############################################################################


def iteration_cv_run(X,Y5, random_state=None):
    """Split the data, oversample with SMOTE and run one round of training.

    Steps: (1) hold out 10% of (X, Y5) as the test set, (2) oversample the
    remaining 90% with SMOTE, (3) split the oversampled data 80/20 into
    training and validation sets, (4) call inner_cv_test().

    NOTE(review): SMOTE runs BEFORE the train/validation split, so synthetic
    samples in the validation set are interpolated from points that can also
    sit in the training set. Validation scores are therefore optimistic; the
    held-out test set is not oversampled.

    Args:
        X (A numpy array): The input matrix of the MLP.
        Y5 (A numpy array): The integer class labels, aligned with X.
        random_state (int, optional): Seed passed to both splits and to SMOTE
            so that a run can be reproduced. The default None keeps the
            previous behaviour (a different random split on every call).

    Returns:
        return_vec (a number): Whatever inner_cv_test() returns, i.e. the
        weighted F1 on the test set after the last epoch.
    """
    X_TRAIN_VAL, X_TEST, Y5_TRAIN_VAL, Y5_TEST = train_test_split(
        X, Y5, test_size=0.1, shuffle=True, random_state=random_state)

    # Balance the classes of the train+validation portion only.
    oversample = SMOTE(random_state=random_state)
    X_TRAIN_VAL, Y5_TRAIN_VAL = oversample.fit_resample(X_TRAIN_VAL, Y5_TRAIN_VAL)

    X_TRAIN, X_VAL, Y5_TRAIN, Y5_VAL = train_test_split(
        X_TRAIN_VAL, Y5_TRAIN_VAL, test_size=0.2, random_state=random_state)

    return_vec = inner_cv_test(X_TRAIN, Y5_TRAIN, X_TEST, Y5_TEST, X_VAL, Y5_VAL)
    return return_vec


In [31]:
# Data preparation pipeline. Each step asks for its column name(s) through input(),
# so this cell blocks until the prompts are answered (see the transcript below).
# 1) clean the NAICS codes, add YEAR / MONTH, keep filings dated after 2016
CLEANED_DF = PREPROCESSING(MAIN_DF)
# 2) sum the embedding columns per file number, one row per file
DF_ = AGGREGATE(CLEANED_DF)
# 3) keep NAICS codes with more than 10 filings and attach NAICS_RANK / NUM_FILINGS
CLEANED_DF_UPD_JOIN = EXTRACT(DF_)


Insert the name of the NAICS column to be cleaned: NAICS_CODE


  res_values = method(rvalues)


Insert the name of the date column: FILING_DATE
Insert the name of the unique file number column: UNIQUE_FILE_NUMBER
Insert the name of the NAICS column to be counted for ranking and filtering: NAICS_CODE


In [32]:
# Build the model inputs and targets. Both calls prompt through input() for the
# columns to use (see the transcript below).
# X: float32 matrix of dummy variables followed by the embedding values
X = MLP_INPUT(CLEANED_DF_UPD_JOIN)
# Y5: integer class label per row
Y5 = MLP_OUTPUT(CLEANED_DF_UPD_JOIN)

Enter a variable to be dummied, If not press 'no': EMPLOYMENT_CODE_12
Enter a variable to be dummied, If not press 'no': YEAR
Enter a variable to be dummied, If not press 'no': no
Enter the name of target variable: NAICS_RANK


In [46]:
# fits the model on the dataset.
# Args:
#      X (A numpy array): The input vector of the MLP which is the BERT embedding vector 
#                         concatenated with some categorical variables.

#      Y5 (A numpy array): The output vector of the MLP which is the target variable (NAICS_RANK)
#                          for our classification task.
# Returns: 
#         model (a deep neural network): A fitted model based on our dataset.

############################################################################



def FIT_MODEL(X, Y5, n_epochs=30, batch=64, lr=0.001, momentum=0.9, n_runs=11):
    """Build a model and train it over several randomly re-split runs.

    Publishes the training state as notebook-level globals (`model`,
    `nb_epoch`, `batch_size`, `log_var_e`, `params`, `optimizer`) because the
    training and evaluation helpers read them from there.

    NOTE(review): every run calls iteration_cv_run(X, Y5), which draws a NEW
    random train/test split while the SAME model keeps training. Samples held
    out as "test" in one run can therefore be training samples in another, so
    the test scores printed in later runs are not estimates on unseen data.

    Args:
        X (A numpy array): The input matrix of the MLP.
        Y5 (A numpy array): The integer class labels, aligned with X.
        n_epochs (int, optional): Epochs per run. Defaults to 30.
        batch (int, optional): Mini-batch size. Defaults to 64.
        lr (float, optional): SGD learning rate. Defaults to 0.001.
        momentum (float, optional): SGD momentum. Defaults to 0.9.
        n_runs (int, optional): Number of runs. The last run always trains for
            a single epoch. Defaults to 11.

    Returns:
        model (a deep neural network): The trained model (also available as the
        global `model`).
    """
    global model, nb_epoch, batch_size, log_var_e, params, optimizer

    model = BUILD_MODEL(X,Y5)
    nb_epoch = n_epochs
    batch_size = batch

    # Log-variance term of the loss; it is optimised together with the weights.
    log_var_e = torch.zeros((1,), requires_grad=True)
    params = list(model.parameters()) + [log_var_e]
    optimizer = optim.SGD(params, lr=lr, momentum=momentum)

    for k in range(n_runs):
        print(k)
        if k == n_runs - 1:
            # The final run is shortened to one epoch.
            nb_epoch = 1
        iteration_cv_run(X,Y5)

    return model

In [47]:
# Train the classifier. FIT_MODEL prints the run number followed by the per-epoch
# validation / test F1 scores of that run.
model_fitted = FIT_MODEL(X, Y5)

0
1043
1043
Epoch:  1
F1 Score (Weighted) for NAICS in validation set:  0.007152676605913454
F1 Score (Weighted) for NAICS in test set:  0.0027007095899665005
Epoch:  2
F1 Score (Weighted) for NAICS in validation set:  0.08091815725728542
F1 Score (Weighted) for NAICS in test set:  0.01666598907177884
Epoch:  3
F1 Score (Weighted) for NAICS in validation set:  0.20851943028551662
F1 Score (Weighted) for NAICS in test set:  0.04173665537181054
Epoch:  4
F1 Score (Weighted) for NAICS in validation set:  0.319711881476673
F1 Score (Weighted) for NAICS in test set:  0.07933928715408299
Epoch:  5
F1 Score (Weighted) for NAICS in validation set:  0.4093881509225944
F1 Score (Weighted) for NAICS in test set:  0.10273811193477371
Epoch:  6
F1 Score (Weighted) for NAICS in validation set:  0.48717857910190787
F1 Score (Weighted) for NAICS in test set:  0.12494850681224513
Epoch:  7
F1 Score (Weighted) for NAICS in validation set:  0.5587378146566173
F1 Score (Weighted) for NAICS in test set:  0

Epoch:  29
F1 Score (Weighted) for NAICS in validation set:  0.9123595001464282
F1 Score (Weighted) for NAICS in test set:  0.31948743428858967
Epoch:  30
F1 Score (Weighted) for NAICS in validation set:  0.9171564821071955
F1 Score (Weighted) for NAICS in test set:  0.3163187400416933
2
1043
1043
Epoch:  1
F1 Score (Weighted) for NAICS in validation set:  0.9069677722372816
F1 Score (Weighted) for NAICS in test set:  0.46812858538878427
Epoch:  2
F1 Score (Weighted) for NAICS in validation set:  0.9137886774772201
F1 Score (Weighted) for NAICS in test set:  0.4481046183081877
Epoch:  3
F1 Score (Weighted) for NAICS in validation set:  0.8965853131710932
F1 Score (Weighted) for NAICS in test set:  0.4181988568738318
Epoch:  4
F1 Score (Weighted) for NAICS in validation set:  0.8928340650616954
F1 Score (Weighted) for NAICS in test set:  0.4037642533944893
Epoch:  5
F1 Score (Weighted) for NAICS in validation set:  0.9053282720389669
F1 Score (Weighted) for NAICS in test set:  0.4022114

Epoch:  27
F1 Score (Weighted) for NAICS in validation set:  0.9317521721790181
F1 Score (Weighted) for NAICS in test set:  0.38659960204587407
Epoch:  28
F1 Score (Weighted) for NAICS in validation set:  0.9266970719779851
F1 Score (Weighted) for NAICS in test set:  0.38680452339565824
Epoch:  29
F1 Score (Weighted) for NAICS in validation set:  0.933113787708623
F1 Score (Weighted) for NAICS in test set:  0.37632505381796894
Epoch:  30
F1 Score (Weighted) for NAICS in validation set:  0.9216134724389119
F1 Score (Weighted) for NAICS in test set:  0.36749051770436236
4
1043
1043
Epoch:  1
F1 Score (Weighted) for NAICS in validation set:  0.9341155415778113
F1 Score (Weighted) for NAICS in test set:  0.5515716391314316
Epoch:  2
F1 Score (Weighted) for NAICS in validation set:  0.9338656407208485
F1 Score (Weighted) for NAICS in test set:  0.5286434758156192
Epoch:  3
F1 Score (Weighted) for NAICS in validation set:  0.9354743335041057
F1 Score (Weighted) for NAICS in test set:  0.5097

Epoch:  25
F1 Score (Weighted) for NAICS in validation set:  0.9353449349083293
F1 Score (Weighted) for NAICS in test set:  0.4337039438417284
Epoch:  26
F1 Score (Weighted) for NAICS in validation set:  0.9387246679072857
F1 Score (Weighted) for NAICS in test set:  0.43204488980131006
Epoch:  27
F1 Score (Weighted) for NAICS in validation set:  0.9418394899756602
F1 Score (Weighted) for NAICS in test set:  0.4313327436734532
Epoch:  28
F1 Score (Weighted) for NAICS in validation set:  0.9341380959837691
F1 Score (Weighted) for NAICS in test set:  0.42177384364382575
Epoch:  29
F1 Score (Weighted) for NAICS in validation set:  0.930799439515769
F1 Score (Weighted) for NAICS in test set:  0.41443256896368613
Epoch:  30
F1 Score (Weighted) for NAICS in validation set:  0.9319073684188444
F1 Score (Weighted) for NAICS in test set:  0.4173956712758713
6
1043
1043
Epoch:  1
F1 Score (Weighted) for NAICS in validation set:  0.9324721644483607
F1 Score (Weighted) for NAICS in test set:  0.567

Epoch:  23
F1 Score (Weighted) for NAICS in validation set:  0.9170432444501736
F1 Score (Weighted) for NAICS in test set:  0.4363671958754768
Epoch:  24
F1 Score (Weighted) for NAICS in validation set:  0.9353067499155446
F1 Score (Weighted) for NAICS in test set:  0.45088446616739636
Epoch:  25
F1 Score (Weighted) for NAICS in validation set:  0.9377890850885605
F1 Score (Weighted) for NAICS in test set:  0.4556510747256539
Epoch:  26
F1 Score (Weighted) for NAICS in validation set:  0.9393050929857064
F1 Score (Weighted) for NAICS in test set:  0.4475508656820148
Epoch:  27
F1 Score (Weighted) for NAICS in validation set:  0.9379256770261091
F1 Score (Weighted) for NAICS in test set:  0.4365435640886884
Epoch:  28
F1 Score (Weighted) for NAICS in validation set:  0.9370757087098327
F1 Score (Weighted) for NAICS in test set:  0.443195739516284
Epoch:  29
F1 Score (Weighted) for NAICS in validation set:  0.9376323304055055
F1 Score (Weighted) for NAICS in test set:  0.4484803990263534

Epoch:  21
F1 Score (Weighted) for NAICS in validation set:  0.9467936925155629
F1 Score (Weighted) for NAICS in test set:  0.49199680503413135
Epoch:  22
F1 Score (Weighted) for NAICS in validation set:  0.9466102569485664
F1 Score (Weighted) for NAICS in test set:  0.4958504414227895
Epoch:  23
F1 Score (Weighted) for NAICS in validation set:  0.9296983104637712
F1 Score (Weighted) for NAICS in test set:  0.4639772092514817
Epoch:  24
F1 Score (Weighted) for NAICS in validation set:  0.9465052102505102
F1 Score (Weighted) for NAICS in test set:  0.4932287295740869
Epoch:  25
F1 Score (Weighted) for NAICS in validation set:  0.93915557981189
F1 Score (Weighted) for NAICS in test set:  0.4773199776319398
Epoch:  26
F1 Score (Weighted) for NAICS in validation set:  0.9511108658849388
F1 Score (Weighted) for NAICS in test set:  0.5031274144634468
Epoch:  27
F1 Score (Weighted) for NAICS in validation set:  0.9447263243575202
F1 Score (Weighted) for NAICS in test set:  0.4842191919459426


In [48]:
# Shows the f1 score, Matthews Correlation Coefficient (MCC) and balanced accuracy
# of the trained model on a test dataset

# Args:
#     model (a classifier model): A model which has been trained already to classify
#                               the NAICS code for the application
 #    X (A numpy array): The input vector of the MLP which is the BERT embedding vector 
#                         concatenated with some categorical variables.

#     Y5 (A numpy array): The output vector of the MLP which is the target variable (NAICS_RANK)
#                          for our classification task.

###################################################################

def TEST_MODEL(model,X, Y5):
    """Print F1, MCC and balanced accuracy of `model` on a random 10% of (X, Y5).

    Uses the notebook-level global `batch_size` as the inference chunk size.
    The model is switched to eval mode and left in that mode.

    NOTE(review): the 10% evaluated here is a fresh random split of whatever
    X / Y5 are passed in. If the model was trained on the same arrays, most of
    these rows were probably seen during training, which would inflate the
    scores.

    Args:
        model (a classifier model): A trained model mapping a float32 batch to
            raw class scores.
        X (A numpy array): The input matrix of the MLP.
        Y5 (A numpy array): The integer class labels, aligned with X.

    Returns:
        None. The three metrics are printed.
    """
    _, X_TEST, _, Y5_TEST = train_test_split(X, Y5,
        test_size=0.1, shuffle = True)

    model.eval()

    predictions5, true_vals5 = [], []

    # Predict in chunks of batch_size. The final chunk may be shorter; it is
    # scored too (it used to be silently dropped).
    for start in range(0, len(X_TEST), batch_size):
        inp = torch.from_numpy(X_TEST[start:start + batch_size])

        with torch.no_grad():
            out = model(inp)

        predictions5.append(out.detach().numpy())
        true_vals5.append(Y5_TEST[start:start + batch_size])

    predictions5 = np.concatenate(predictions5, axis=0)
    true_vals5 = np.concatenate(true_vals5, axis=0)
    predicted_classes = np.argmax(predictions5, axis=1).flatten()

    print("F1_score is: ",f1_score_func(predictions5, true_vals5))
    # Arguments in the conventional (y_true, y_pred) order; MCC is symmetric,
    # so the value does not depend on the order.
    print("Matthews Correlation Coefficient (MCC) is: ", matthews_corrcoef(true_vals5, predicted_classes))
    print("Balanced Accuracy is: ",BALANCED_ACCURACY(predictions5, true_vals5))



In [50]:
# Report the final metrics of the fitted model.
# NOTE(review): TEST_MODEL draws its own random 10% of X / Y5, the same arrays the model
# was trained on, so the figures below are not measured on unseen data.
TEST_MODEL(model_fitted,X, Y5)

F1_score is:  0.6439046715848052
Matthews Correction Coefficient (MCC) is:  0.6666092483500562
Balanced Accuracy is:  0.8933052004594522


