#### ===== Importing Libraries =====

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
from sklearn.utils import class_weight
from collections import Counter
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
import shap
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder

## Importing from other files
from text_processing_cleanup import text_Processing, text_Processing_GloVe
from data_feature_functions import create_indicator_matrix, get_Fraud_Dataset,get_training_testing_data, get_Geolocation_Data, print_class_report_confusion_matrix, get_combined_feature, export_predictions
from predict_CNN_1layer import get_cnn_pred_prob, evaluate_cnn
from ensemble_classifier import train_bagging_ensemble, train_boosting_ensemble, perfrom_GridSearch, perfrom_RandomSearch
from word_vector_functions import  load_word2vec, get_sentence_feature_values, read_glove_file, get_sentence_embeddings
from get_classifier_models import get_classifier_predictions_probabilities
from data_visualization import plot_3d, plot_decision_regions, get_feature_importance

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


#### Variable Parameters

In [2]:
# Run-wide settings read by the cells below.
test_size = 38                        # held-out share of the data, in percent
test_percentage = test_size / 100.0   # the same share as a fraction, passed to train_test_split
rand_state = 47                       # seed for the train/test split
embedding_dim = 200                   # vector length requested from read_glove_file

text_type = "Clean_Nums_2char"        # glove_type handed to the GloVe loader and the CNN helper
glove_sum = "VecAvg"                  # tag embedded in the exported predictions file name
recreate_full_xgb = False             # NOTE(review): not referenced by any visible cell

#### Getting fraud data and GloVe embeddings

In [3]:
# Load the 'acceptance' fraud dataset without regenerating its features.
# NOTE(review): the output saved with this cell ends in
# "NameError: name 'multi_body_parts_feature' is not defined"; that name does not
# appear in this notebook, so it is presumably raised inside an imported helper.
fraud_data = get_Fraud_Dataset(fraud_type='acceptance', recreate_features=False)

text = fraud_data['Fraud_Text']     # free-text model input
labels = fraud_data['Fraud_Label']  # target labels

# Indicator-matrix form of the labels (consumed by the CNN helper further down).
labels_onehot = create_indicator_matrix(labels, check_index=True)

# Two cleaned variants of the text column, each also kept as a plain list.
cleaned_text = text.apply(text_Processing, numbers=False)
text_word_vec = text.apply(text_Processing_GloVe)
clean_list = cleaned_text.tolist()
clean_wordVec_list = text_word_vec.tolist()

glove_dict = read_glove_file(
    data_type="accept",
    dimension=embedding_dim,
    vocab_size="4k",
    glove_type=text_type,
)

# Both frames start out as the same label-based column range of the dataset.
datediff_span = slice('Loss_PolicyEff', 'Claim_Loss')
new_feat_avg = fraud_data.loc[:, datediff_span]          # reassigned in a later cell
new_feature_datediff = fraud_data.loc[:, datediff_span]  # date-difference features


new_feature_labels = fraud_data['Fraud_Label']

# Fill missing values by assigning the filled column back to the frame.
# Calling fillna(..., inplace=True) on `fraud_data[col]` is a chained operation on
# a column selection: recent pandas versions warn about it, and with copy-on-write
# enabled it leaves `fraud_data` untouched.
missing_value_fills = {
    'Main Cause': "",
    'Sub Cause': "",
    'New_City': "",
    'Longitude': 0,
    'Latitude': 0,
}
for column, fill_value in missing_value_fills.items():
    fraud_data[column] = fraud_data[column].fillna(fill_value)

# Working copies of the categorical columns; the next cell encodes these copies.
fraud_data['New_Main'] = fraud_data['Main Cause']
fraud_data['New_Sub'] = fraud_data['Sub Cause']
fraud_data['Sub_Business'] = fraud_data['Subline_Business']

# "<main> - <sub>" label, stored twice: 'Cause' is encoded in the next cell,
# while 'Cause_Join' is not in the encoder's column list and keeps the text.
cause_label = fraud_data['Main Cause'] + " - " + fraud_data['Sub Cause']
fraud_data['Cause'] = cause_label
fraud_data['Cause_Join'] = cause_label

Initial data path: C:\Current_Projects\Fraud_Project_AI\Initial_Datasets 
 Local Fraud Data Path: C:\Current_Projects\Fraud_Project_AI\Initial_Datasets\MasterFraudData_more datav2.xlsx 
 Feature Gen Path: C:\Current_Projects\Fraud_Project_AI\Feature_Generated_Datasets\Fraud_Features_NoLossDate_Refferal.xlsx
Initial Large dataframe size before drop 79432

Initial Large dataframe size after drop 17066

Type of zip code data: <class 'pandas.core.frame.DataFrame'>

Loading Lat/Long
Type of city data: <class 'pandas.core.frame.DataFrame'>
Initial Large dataframe size after merge 17066



NameError: name 'multi_body_parts_feature' is not defined

#### Encoding features and getting Glove Average for each sentence

In [None]:
# Encoding main/sub cause columns
# Leave-one-out encode the cause, business-line and city columns against the label.
# NOTE(review): the encoder is fit on the whole dataset, labels included, before
# the train/test split in a later cell, so test-set labels can influence the
# encoded values -- confirm whether that is acceptable.
encoded_cols = ['Cause', 'Sub_Business', 'New_Main', 'New_Sub', 'New_City']
encoder = ce.LeaveOneOutEncoder(cols=encoded_cols)

# Alternative encoder tried earlier:
# encoder = ce.BackwardDifferenceEncoder(cols = ['New_Main', 'New_Sub'])

fraud_data = encoder.fit_transform(X=fraud_data, y=fraud_data['Fraud_Label'])

# Columns of the "average embedding" feature set ('Loss_PolicyEff' is left out).
feature_cols = [
    # 'Loss_PolicyEff',
    'Loss_PolicyExp',
    'Claim_PolicyEff',
    'Claim_Loss',
    'Longitude',
    'Latitude',
    'Sub_Business',
    'Cause',
]

new_feat_avg = fraud_data[feature_cols]

# Clean the fraud text again (same call as in the loading cell) so this cell can
# be re-run on its own.
loss_cleaned = fraud_data['Fraud_Text'].apply(text_Processing, numbers=False)

# One feature row per cleaned text. The dimension comes from the shared
# `embedding_dim` setting rather than a literal 200, so it always matches the
# GloVe vectors that were loaded with that same setting.
sentence_embedding_avg = np.array([
    get_sentence_feature_values(sentence=words, embedding=glove_dict, embedding_dim=embedding_dim)
    for words in loss_cleaned
])

# Row-wise sum of the per-sentence feature values.
sent_orig_sum = sentence_embedding_avg.sum(axis=1)

# Per-sentence embeddings, then one scalar sum and one scalar average per sentence.
sent_embed_loss = get_sentence_embeddings(text=loss_cleaned, embedding=glove_dict, embedding_size=embedding_dim)

sent_sum_loss = [sent.sum() for sent in sent_embed_loss]

sent_avg_loss = [np.average(sent) for sent in sent_embed_loss]


# Un-encoded counterpart of the model features ('Loss_PolicyEff' is left out).
features_orig = [
    # 'Loss_PolicyEff',
    'Loss_PolicyExp',
    'Claim_PolicyEff',
    'Claim_Loss',
    'Longitude',
    'Latitude',
    'Subline_Business',
    'Cause_Join',
    'Loss_Description',
]

# Select and rename in one non-inplace step. Renaming a frame selected from
# `fraud_data` with inplace=True triggers pandas' SettingWithCopyWarning.
# Keys that are not among the selected columns are ignored by rename().
new_feat_orig = fraud_data[features_orig].rename(
    columns={
        'Main Cause': 'Main_Cause',
        'Sub Cause': 'Sub_Cause',
        'Loss_Description': 'Loss_Descrip',
    }
)

# Attach the per-sentence average as a feature. assign() returns a new frame, so
# nothing is written into a selection of `fraud_data`.
new_feat_avg = new_feat_avg.assign(Loss_Descrip=sent_avg_loss)

fraud_data['Loss_Glove_Avg'] = sent_avg_loss

#### Creating dataframe with 200 dimensional Glove Average

In [None]:
# One column per embedding component: Loss_Descrip_Feat_0, Loss_Descrip_Feat_1, ...
embed_col_names = ['Loss_Descrip_Feat_' + str(feat) for feat in range(len(sent_embed_loss[0]))]
sent_embed_df = pd.DataFrame(sent_embed_loss, columns=embed_col_names)

# Give the embedding rows the same index as the date-difference features so the
# join below lines rows up by claim.
sent_embed_df = sent_embed_df.set_index(new_feature_datediff.index)

# Date differences + cause + coordinates, then the embedding columns.
extra_cols = fraud_data[['Cause', 'Longitude', 'Latitude']]
new_feature_expand = pd.concat([new_feature_datediff, extra_cols], axis=1)

new_feature_expand = new_feature_expand.join([sent_embed_df], how="inner")

#### Train-test split

In [None]:
# The same split settings are applied to all three feature sets.
split_options = {'test_size': test_percentage, 'random_state': rand_state}

# Average-embedding features.
X_train_avg, X_test_avg, y_train_avg, y_test_avg = train_test_split(
    new_feat_avg, new_feature_labels, **split_options
)

# Expanded (one column per embedding component) features.
X_train_expand, X_test_expand, y_train_expand, y_test_expand = train_test_split(
    new_feature_expand, new_feature_labels, **split_options
)

# Original, un-encoded features.
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    new_feat_orig, new_feature_labels, **split_options
)


#### Training XGBoost model and exporting predictions to an Excel file

In [None]:
# Train on the average-embedding features. The model name takes its embedding tag
# from `glove_sum` (previously a hard-coded 'VecAvg') so it stays in step with the
# export file name built below from the same setting.
# NOTE(review): recreate_model is hard-coded to True while the config cell defines
# an unused `recreate_full_xgb` flag -- confirm which one is intended.
boost_pred, boost_prob, boost_model, boost_pred_path = train_boosting_ensemble(
    X_train_avg, X_test_avg, y_train_avg, y_test_avg,
    boosting_type="xgboost_{}test_{}_FullData".format(test_size, glove_sum),
    recreate_model=True,
    model_type='imbalanced',
)

# Same training call on the expanded (per-component) embedding features.
boost_pred_expand, boost_prob_expand, boost_model_expand, boost_pred_path_expand = train_boosting_ensemble(
    X_train_expand, X_test_expand, y_train_expand, y_test_expand,
    boosting_type="xgboost_{}test_Glove{}_FullData".format(test_size, 'Expand'),
    recreate_model=True,
    model_type='imbalanced',
)

# Classification report / confusion matrix for each model.
# NOTE(review): the first label says "Glove Sum" although these features hold the
# embedding average; the string is left unchanged in case the helper uses it as a key.
class_xgb = print_class_report_confusion_matrix(y_test_avg, boost_pred, "XGBoost", "Glove Sum Full Data")

class_xgb_expand = print_class_report_confusion_matrix(y_test_expand, boost_pred_expand, "XGBoost", "Glove Expand Full Data")

# Export the average-embedding model's probabilities and predictions.
boost_out = export_predictions(
    fraud_data,
    boost_prob,
    boost_pred,
    actual=y_test_avg,
    recreate_ProbPreds=True,
    pred_path=boost_pred_path,
    file_name='XGBoost_Output_{}test_{}{}_FullData'.format(test_size, text_type, glove_sum),
    model_type='XGBoost',
)




#### Plotting xgboost outputs and getting feature impact/importance

In [None]:
# Optional 3-D scatter plots of the exported predictions (disabled):
# plot_data_points = { 'x':boost_out['Actual Label'] , 'y':boost_out['XGBoost_Predictions'] , 'z': boost_out['XGBoost_Confid_Prob'] * 100.00}
# plot_3d(model_output= boost_out,  data_points = plot_data_points, fig_type = "Boost_FullData_Actual_Pred_Prob", model_type = "XGBoost",  z_label = "Probability Fraud (%)")
#
# plot_data_points = { 'x':boost_out['Actual Label'] , 'y':boost_out['XGBoost_Predictions'] , 'z': boost_out['Claim_Loss'] }
# plot_3d(model_output= boost_out,  data_points = plot_data_points, fig_type = "Boost_Full_Actual_Pred_ClaimLoss", model_type = "XGBoost", z_label = "Days Between Policy Loss - Claim")
#
# Feature importance with the helper's default plot (disabled):
# get_feature_importance(model = boost_model,
#                            features = X_test_avg,
#                            feature_names = list(X_test_avg.columns),
#                            orig_feat = X_test_orig,
#                            model_type = 'xgb')


def _plot_feature_importance(plot_kind):
    """Run get_feature_importance for the XGBoost model on the test split.

    `plot_kind` is forwarded as the helper's `plot` argument; the helper's
    return value is passed straight back to the caller.
    """
    return get_feature_importance(
        model=boost_model,
        features=X_test_avg,
        feature_names=list(X_test_avg.columns),
        orig_feat=X_test_orig,
        model_type='xgb',
        plot=plot_kind,
    )


_plot_feature_importance('decision')

# Kept as the cell's last expression so any returned object is still displayed.
_plot_feature_importance('force')

#### Getting CNN output and combining output to dataset

In [None]:
# CNN predictions/probabilities for the cleaned text, requested with the same
# split size and seed as the tabular splits above.
cnn_prob_tag = '{}test'.format(test_size)
cnn_pred, cnn_prob, cnn_out_path = get_cnn_pred_prob(
    cleaned_text=cleaned_text,
    labels=labels_onehot,
    testing=True,
    prob_type=cnn_prob_tag,
    test_size=test_size,
    rand_state=rand_state,
    glove_type=text_type,
    embedding_dim=embedding_dim,
    recreate_prob=False,
)

# evaluate_cnn(cleaned_text = cleaned_text, labels = labels_onehot)

# Combine the dataset with the CNN output found at `cnn_out_path`.
combined_feature_tag = '{}test_{}_{}D'.format(test_size, text_type, embedding_dim)
new_feature = get_combined_feature(
    features=fraud_data,
    cnn_Prob=cnn_out_path,
    recreate_combined_features=True,
    feature_type=combined_feature_tag,
)



# Label-based column range of the combined frame; both names start out identical.
new_feature_datediff = new_feature.loc[:, 'Loss_PolicyEff':'Multi_Body_Parts_Injured']
new_feat_avg = new_feature.loc[:, 'Loss_PolicyEff':'Multi_Body_Parts_Injured']

# Drop the 'Unnamed: 0' column (raises KeyError if it is absent).
new_feature = new_feature.drop(['Unnamed: 0'], axis=1)

# Fill missing values BEFORE selecting the model features. Previously the fill ran
# after `new_feature_xgb` was selected, so the Longitude/Latitude fill never
# reached that frame. It was also written as a chained fillna(..., inplace=True)
# on a column selection, which recent pandas versions warn about and which has no
# effect under copy-on-write.
missing_value_fills = {
    'Main Cause': "",
    'Sub Cause': "",
    'New_City': "",
    'Longitude': 0,
    'Latitude': 0,
}
for column, fill_value in missing_value_fills.items():
    new_feature[column] = new_feature[column].fillna(fill_value)

# Features for the XGBoost model that also consumes the CNN fraud probability
# ('Loss_PolicyEff' is left out).
cnn_features = [
    # 'Loss_PolicyEff',
    'Loss_PolicyExp',
    'Claim_PolicyEff',
    'Claim_Loss',
    'New_Main',
    'New_Sub',
    'Longitude',
    'Latitude',
    'Sub_Business',
    'CNN_Prob_Fraud',
]

new_feature_xgb = new_feature[cnn_features]

new_feature_labels = new_feature['Fraud_Label']

loss_cleaned = new_feature['Loss_Descrip_NoAddition'].apply(text_Processing, numbers=False)