In [15]:
import sys, os, re, csv, random
import pandas as pd
import spacy
import numpy as np
import statistics
#import xgboost as xgb
from imblearn.over_sampling import SMOTE
from collections import Counter
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from scipy.stats import spearmanr
import copy

In [21]:
def compute_correlation(original_data, label_list, fake_data):
  """Average Spearman correlation between the label-1 mean row and each synthetic row.

  The rows of ``original_data`` whose label casts to integer 1 are averaged
  column-wise into a single reference vector.  Every row of ``fake_data`` is
  then rank-correlated (``scipy.stats.spearmanr``) against that vector and the
  coefficients are averaged.

  Args:
    original_data: indexable collection of rows, aligned with ``label_list``.
    label_list: labels, one per row of ``original_data``; each must be
      convertible with ``int()``.
    fake_data: 2-D array whose rows are compared against the reference vector.

  Returns:
    The mean of the per-row Spearman coefficients.

  Raises:
    ZeroDivisionError: if ``fake_data`` has no rows.
  """
  # Collect the rows labelled 1 and collapse them into one mean vector.
  selected_rows = np.array(
      [original_data[idx] for idx in range(len(label_list)) if int(label_list[idx]) == 1]
  )
  reference_vec = np.mean(selected_rows, axis=0)

  n_fake = fake_data.shape[0]
  # spearmanr returns (coefficient, p-value); only the coefficient is used.
  total_corr = sum(spearmanr(reference_vec, fake_data[row])[0] for row in range(n_fake))
  return total_corr / float(n_fake)

def compare_datasets(original_dataset, fake_dataset):
  """Visually compare the original rows with the synthetic rows.

  Both 2-D arrays are wrapped in DataFrames with columns ``OC_1 .. OC_k``
  (``k`` is the number of columns of ``original_dataset``), duplicate rows are
  removed, and the two frames are handed to ``TableEvaluator`` whose
  ``visual_evaluation()`` is then called.

  Args:
    original_dataset: 2-D array of real samples.
    fake_dataset: 2-D array of synthetic samples with the same column count.

  Returns:
    None.
  """
  n_original = original_dataset.shape[0]
  n_fake = fake_dataset.shape[0]

  # Column names are derived from the data instead of a hard-coded width of 41.
  column_list = ["OC_" + str(i + 1) for i in range(original_dataset.shape[1])]

  # Ordered 1-based row labels; the synthetic rows continue where the original
  # rows stop.  (A set was used before, which carries no guaranteed order.)
  original_index = range(1, n_original + 1)
  fake_index = range(n_original + 1, n_original + n_fake + 1)

  # drop_duplicates() returns a new frame, so its result has to be kept.
  original_df = pd.DataFrame(
      original_dataset, columns=column_list, index=original_index
  ).drop_duplicates()
  fake_df = pd.DataFrame(
      fake_dataset, columns=column_list, index=fake_index
  ).drop_duplicates()

  # NOTE(review): TableEvaluator is not imported in the visible cells --
  # presumably `from table_evaluator import TableEvaluator`; confirm it is
  # imported before this function is called.
  table_evaluator = TableEvaluator(original_df, fake_df)
  table_evaluator.visual_evaluation()

def apply_gaussian_noise(data_list, label_list):
  """Augment the label-1 rows with Gaussian-perturbed copies.

  For every unordered pair of label-1 rows, one of the two rows is chosen at
  random and zero-mean Gaussian noise is added to it.  The noise standard
  deviation is half of the smaller of ``|min(row)|`` and ``|max(row)|``.  The
  perturbed rows are appended to the data with label 1.

  Args:
    data_list: 2-D array, one sample per row.
    label_list: labels aligned with ``data_list``; each must be convertible
      with ``int()``.

  Returns:
    A tuple ``(data, labels, fake_dataset)``: the input data followed by the
    synthetic rows, the labels followed by one ``1`` per synthetic row, and
    the synthetic rows alone (shape ``(0, n_features)`` when fewer than two
    label-1 rows exist).
  """
  index_list = [i for i in range(len(label_list)) if int(label_list[i]) == 1]

  new_rows = []
  for i in range(len(index_list) - 1):
    for j in range(i + 1, len(index_list)):
      # Perturb one member of the pair, chosen uniformly at random.
      rand_el = random.choice([i, j])
      base_row = data_list[index_list[rand_el]]

      # Taking absolute values of both extremes keeps the scale non-negative
      # even when every entry of the row is negative (np.random.normal rejects
      # a negative scale).
      sigma = min(abs(float(np.min(base_row))), abs(float(np.max(base_row)))) / 2.0
      gauss = np.random.normal(0, sigma, base_row.shape)
      new_rows.append(base_row + gauss)

  if new_rows:
    fake_dataset = np.vstack(new_rows)
  else:
    # Fewer than two label-1 rows: nothing to generate, but still hand back a
    # well-formed (empty) array so callers can unpack the result.
    base = np.asarray(data_list)
    fake_dataset = np.empty((0,) + base.shape[1:], dtype=base.dtype)

  # The return sits outside both loops so that every pair contributes a row;
  # previously it was nested in the outer loop and fired after the first pass.
  data_list = np.append(data_list, fake_dataset, axis=0)
  label_list = np.append(label_list, np.ones(len(new_rows), dtype=int))
  return data_list, label_list, fake_dataset


def apply_majority_oversampling(data_list, label_list):
    """Augment the label-1 rows with random convex combinations of row pairs.

    For every unordered pair of label-1 rows ``(a, b)`` a synthetic row
    ``ratio * a + (1 - ratio) * b`` is created, with ``ratio`` drawn from
    ``random.random()`` and redrawn if it is exactly 0 or 1.  The synthetic
    rows are appended to the data with label 1.

    Args:
        data_list: 2-D array, one sample per row.
        label_list: labels aligned with ``data_list``; each must be
            convertible with ``int()``.

    Returns:
        A tuple ``(data, labels, fake_dataset)``: the input data followed by
        the synthetic rows, the labels followed by one ``1`` per synthetic
        row, and the synthetic rows alone (shape ``(0, n_features)`` when
        fewer than two label-1 rows exist).
    """
    index_list = [i for i in range(len(label_list)) if int(label_list[i]) == 1]

    new_rows = []
    for i in range(len(index_list) - 1):
        for j in range(i + 1, len(index_list)):
            # Strictly interior mixing weight, so the new row never coincides
            # with either endpoint.
            ratio = random.random()
            while ratio == 0 or ratio == 1:
                ratio = random.random()

            new_rows.append(
                ratio * data_list[index_list[i]] + (1 - ratio) * data_list[index_list[j]]
            )

    if new_rows:
        fake_dataset = np.vstack(new_rows)
    else:
        # No pair available: return an empty, correctly shaped array instead
        # of failing on an unbound variable.
        base = np.asarray(data_list)
        fake_dataset = np.empty((0,) + base.shape[1:], dtype=base.dtype)

    data_list = np.append(data_list, fake_dataset, axis=0)
    label_list = np.append(label_list, np.ones(len(new_rows), dtype=int))

    #compare_datasets(original_dataset, fake_dataset)

    return data_list, label_list, fake_dataset



def apply_smote(data_list, label_list):
    """Augment the label-1 rows with Gaussian noise, then rebalance with SMOTE.

    Steps:
      1. ``apply_gaussian_noise`` appends synthetic label-1 rows.
      2. ``SMOTE(k_neighbors=1).fit_resample`` is run on the augmented data.
      3. ``compute_correlation`` measures how closely the synthetic rows from
         step 1 follow the mean label-1 row of the *input* data.

    Args:
        data_list: 2-D array, one sample per row.
        label_list: labels aligned with ``data_list``; each must be
            convertible with ``int()``.

    Returns:
        A tuple ``(resampled_data, resampled_labels, corr_num)``.
        ``resampled_labels`` is a float array (the dtype this function has
        always returned) aligned row-for-row with ``resampled_data``.
    """
    # Keep untouched copies of the inputs for the correlation measurement.
    original_data_list = np.copy(data_list)
    original_label_list = copy.deepcopy(label_list)

    # Apply majority oversampling or Gaussian noise.
    #data_list, label_list, fake_list = apply_majority_oversampling(data_list, label_list)
    data_list, label_list, fake_list = apply_gaussian_noise(data_list, label_list)

    # Apply SMOTE on integer labels.
    int_labels = np.array([int(label_list[i]) for i in range(data_list.shape[0])])
    oversample = SMOTE(k_neighbors=1)
    transformed_data_list, transformed_label_list = oversample.fit_resample(
        np.copy(data_list), int_labels
    )

    # Use the labels SMOTE actually produced.  The previous code padded the
    # label vector with zeros, which mislabels every synthetic row whenever
    # SMOTE has to synthesise label-1 samples.
    resampled_labels = np.asarray(transformed_label_list, dtype=float)

    corr_num = compute_correlation(original_data_list, original_label_list, fake_list)

    return transformed_data_list, resampled_labels, corr_num

In [None]:
# ---------------------------------------------------------------------------
# Repeated stratified k-fold evaluation of a classifier on augmented data.
# Each round runs one full cross-validation; fold metrics are averaged per
# round, and the round averages are summarised (mean and stdev) at the end.
# ---------------------------------------------------------------------------

N_SPLITS = 5   # folds per round; also the divisor for the per-round averages
SEED = 42

# The augmentation helpers draw from both `random` and `np.random`; seed them
# once so a fresh kernel run reproduces the reported numbers.
random.seed(SEED)
np.random.seed(SEED)

# With an integer random_state the fold assignment is identical in every
# round; rounds differ only through the random augmentation.
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)

# 'xgboost' or 'lr'; any other value selects the linear SVM.
classifier_type = 'svm'


data_list = np.load('data_file.npy')
label_list = np.load('label_file.npy')


print(data_list.shape)

# Names of the input features (kept for reference; not used by the loop below).
features_list = ['achievement', 'achievement/effort', 'adaptability/flexibility', 'assisting and caring for others', 'coaching and developing others', 'concern for others',
                 'consequence of error', 'conventional', 'cooperation', 'developing and building teams', 'enterprising', 'establishing and maintaining interpersonal relationships',
                 'face-to-face discussions', 'freedom to make decisions', 'frequency of conflict situations', 'frequency of decision making',
                 'guiding, directing, and motivating subordinates', 'importance of being exact or accurate', 'independence', 'independence_2', 'initiative', 'instructing', 'integrity', 'leadership',
                 'level of competition', 'monitoring and controlling resources', 'recognition', 'relationships', 'resolving conflicts and negotiating with others',
                 'responsibility for outcomes and results', 'self control', 'service orientation', 'social', 'social orientation', 'stress tolerance',
                 'structured versus unstructured work', 'support', 'training and teaching others', 'work schedules', 'work with work group or team', 'working conditions']

# Per-round averages (one entry per round).
round_acc_arr = []
round_rec_arr = []
round_pre_arr = []
round_f1_arr = []
round_f1_micro_arr = []
round_corr_arr_train = []
round_corr_arr_test = []

# Running sums of the per-round averages.
round_acc_cum = 0
round_rec_cum = 0
round_pre_cum = 0
round_f1_cum = 0
round_f1_micro_cum = 0

round_range = 1000

for round_num in range(round_range):

    # Per-fold metrics for this round.
    acc_arr = []
    rec_arr = []
    pre_arr = []
    f1_arr = []
    f1_micro_arr = []
    corr_train_arr = []
    corr_test_arr = []

    for train_index, test_index in skf.split(data_list, label_list):
        X_train, X_test = data_list[train_index], data_list[test_index]
        y_train, y_test = label_list[train_index], label_list[test_index]

        X_train, y_train, corr_train = apply_smote(X_train, y_train)
        # NOTE(review): the held-out fold is augmented as well, so the metrics
        # below are measured partly on synthetic rows -- confirm this is the
        # intended evaluation protocol.
        X_test, y_test, corr_test = apply_smote(X_test, y_test)
        corr_train_arr.append(corr_train)
        corr_test_arr.append(corr_test)

        if classifier_type == 'xgboost':
            clf = xgb.XGBClassifier(objective="binary:logistic", colsample_bytree=0.5, gamma=0.25, learning_rate=0.1, max_depth=5, reg_lambda=1, scale_pos_weight=5, subsample=0.8, random_state=42)
        elif classifier_type == 'lr':
            clf = LogisticRegression(random_state=42, penalty='l2', solver='liblinear', max_iter=100, C=0.5)
        else:
            clf = svm.SVC(random_state=42, C=0.5, kernel='linear')

        clf.fit(X_train, y_train)

        # Predict the whole fold in one call; labels are compared as strings,
        # exactly as before.
        predict_list = [str(label) for label in clf.predict(X_test)]
        original_list = [str(label) for label in y_test]

        acc_arr.append(accuracy_score(original_list, predict_list))
        rec_arr.append(recall_score(original_list, predict_list, average='macro'))
        pre_arr.append(precision_score(original_list, predict_list, average='macro'))
        f1_arr.append(f1_score(original_list, predict_list, average='macro'))
        f1_micro_arr.append(f1_score(original_list, predict_list, average='micro'))

    # Average the fold metrics of this round.
    acc_cum = sum(acc_arr)
    rec_cum = sum(rec_arr)
    pre_cum = sum(pre_arr)
    f1_cum = sum(f1_arr)
    f1_micro_cum = sum(f1_micro_arr)

    round_acc_cum += acc_cum/N_SPLITS
    round_acc_arr.append(acc_cum/N_SPLITS)

    round_rec_cum += rec_cum/N_SPLITS
    round_rec_arr.append(rec_cum/N_SPLITS)

    round_pre_cum += pre_cum/N_SPLITS
    round_pre_arr.append(pre_cum/N_SPLITS)

    round_f1_cum += f1_cum/N_SPLITS
    round_f1_arr.append(f1_cum/N_SPLITS)

    round_f1_micro_cum += f1_micro_cum/N_SPLITS
    round_f1_micro_arr.append(f1_micro_cum/N_SPLITS)

    # Average the correlation over all folds, like every other metric
    # (previously only the last fold's value was recorded).
    round_corr_arr_train.append(np.mean(corr_train_arr))
    round_corr_arr_test.append(np.mean(corr_test_arr))


round_corr_arr_train = np.array(round_corr_arr_train)
round_corr_arr_test = np.array(round_corr_arr_test)

print("Train Correlation: ", np.mean(round_corr_arr_train,axis=0))
print("Test Correlation: ", np.mean(round_corr_arr_test,axis=0))

print("Accuracy: ", round_acc_cum/round_range)
print("Recall: ", round_rec_cum/round_range)
print("Precision: ", round_pre_cum/round_range)
print("F1 score(macro): ", round_f1_cum/round_range)
print("F1 score(micro): ", round_f1_micro_cum/round_range)

print("Accuracy_stdev: ", statistics.stdev(round_acc_arr))
print("Recall_stdev: ", statistics.stdev(round_rec_arr))
print("Precision_stdev: ", statistics.stdev(round_pre_arr))
print("F1(macro) score_stdev: ", statistics.stdev(round_f1_arr))
print("F1(micro) score_stdev: ", statistics.stdev(round_f1_micro_arr))

    