# ------ Import Modules ------

In [1]:
# import necessary modules
import os
import pandas as pd
import numpy as np
import warnings
import json
from machine_learning import MachineLearning, random_tune, xgb_tune, lgb_tune, xgb, lgb
from config import GetDict, SetDict
from feature_importance import FeatureImportance
import h2o
from h2o.automl import H2OAutoML

# suppress warnings
warnings.filterwarnings('ignore')

# ------ Create Models ------

## Base Line Model (Logistic Regression)

In [2]:
# store name to be use in results
model = 'LogisticRegression'

machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}"),
                                   imbalance=True)

# set up pre-processing
preprocessing = machine_learning.PreProcessing()

# perform logistic regression
# print out mean squared error and accuracy
mse, score, report = machine_learning.LogisticRegression()

print(f"The mean squared error is {mse} and accuracy is {report['accuracy']} and precision is {report['1']['precision']} and \
        recall is {report['1']['recall']} and f1 is {report['1']['f1-score']}")

The mean squared error is 0.08644198610753795 and accuracy is 0.913558013892462 and precision is 0.9388312397596942 and         recall is 0.8847143592382913 and f1 is 0.9109697933227344


## Random Forest Classifier Tuning

In [3]:
# store name to be use in results
model = 'RandomForest'

machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}"),
                                   imbalance=True)

# set up pre-processing
preprocessing = machine_learning.PreProcessing()

# store results as a list of tuples
results = []

# loop through each paramter and parameter value
for key, values in random_tune.items():
    # loop through each 
    for value in values:
        
        # set parameter and value to tune
        parameter = { key : value }
        
        # perform random forest
        mse, score, report = machine_learning.RandomForest(parameter_dict=parameter, regressor=False)
        
        results.append((key, value, mse, report['accuracy'], report['1']['precision'], 
                        report['1']['recall'], report['1']['f1-score']))

### Show best results

In [4]:
# store best result to be used as parameter
best_results_dict = {}

# loop through all parameter keys
for key in random_tune.keys():
    
    # get the values for current key
    values = [val for val in results if val[0] == key]
    
    # get the score for current key
    scores = [val[6] for val in values]
    
    # get parameter for current key and max score
    best_score = [val for val in values if val[6] == max(scores)]
    
    # set diction with best parameter and parameter value
    best_results_dict[best_score[0][0]] = best_score[0][1]
    
# print best result
print(best_results_dict)
    
# write results to file
file_name = f"{model}_best_results_dict.json"
json_data = json.dumps(best_results_dict)
SetDict(file_name,json_data)

{'max_depth': 50, 'n_estimators': 20, 'max_features': 0.2, 'min_samples_leaf': 1}


### Final, Tuned, Random Forest

In [5]:
# store name to be use in results
model = 'RandomForest'

# call machine learning class
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}"),
                                   imbalance=True)
# setup pre processing
preprocessing = machine_learning.PreProcessing()

# get best_results_dict
best_results_dict = GetDict(f"{model}_best_results_dict.json")  

# perform random forest using best parameters
mse, score, report = machine_learning.RandomForest(regressor=False, parameter_dict=best_results_dict)

# print final model score
print(f"The mean squared error is {mse} and accuracy is {report['accuracy']} and precision is {report['1']['precision']} and \
        recall is {report['1']['precision']} and f1 is {report['1']['f1-score']}")

The mean squared error is 0.07435039876511448 and accuracy is 0.9256496012348855 and precision is 0.9475108225108225 and         recall is 0.9475108225108225 and f1 is 0.9237668161434978


## XgBoost Tuning

In [6]:
# set model name
model = 'XGBoost'

# call machine learning class
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}"),
                                   imbalance=True)

# perform pre-processing
preprocessing = machine_learning.PreProcessing()

# store results as a list of tuples
results = []

# loop through each paramter and parameter value
for key, values in xgb_tune.items():
    # loop through each
    for value in values:

        # set parameter and value to tune
        parameter = {key: value}

        # perform random forst
        mse, score, report = machine_learning.XGboost(regressor=False, parameter_dict=parameter)

        results.append((key, value, mse, report['accuracy'], report['1']['precision'], 
                        report['1']['recall'], report['1']['f1-score']))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




### Show best results

In [7]:
# store best result to be used as parameter
best_results_dict = {}

# loop through all parameter keys
for key in xgb_tune.keys():
    
    # get the values for current key
    values = [val for val in results if val[0] == key]
    
    # get the score for current key
    scores = [val[6] for val in values]
    
    # get parameter for current key and max score
    best_score = [val for val in values if val[6] == max(scores)]
    
    # set diction with best parameter and parameter value
    best_results_dict[best_score[0][0]] = best_score[0][1]
    
# print best result
print(best_results_dict)
    
# write results to file
file_name = f"{model}_best_results_dict.json"
json_data = json.dumps(best_results_dict)
SetDict(file_name,json_data)

{'learning_rate': 0.6, 'max_depth': 10, 'n_estimators': 100}


### Final, Tuned, XgBoost

In [8]:
# set model name
model = 'XGBoost' 

# call machine learning class
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}"),
                                   imbalance=True)
 
# setup pre processing
preprocessing = machine_learning.PreProcessing()

# get best_results_dict
best_results_dict = GetDict(f"{model}_best_results_dict.json")


# # perform random forest using best parameters
mse, score, report = machine_learning.XGboost(regressor=False, parameter_dict=best_results_dict)
        
# # print final model score
print(f"The mean squared error is {mse} and accuracy is {report['accuracy']} and precision is {report['1']['precision']} and \
        recall is {report['1']['precision']} and f1 is {report['1']['f1-score']}")

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


The mean squared error is 0.0720349884229483 and accuracy is 0.9279650115770517 and precision is 0.940646528881823 and         recall is 0.940646528881823 and f1 is 0.926892950391645


## LgBoost Tuning

In [9]:
# set model name
model = 'LGBoost'

# call machine learning class
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}"),
                                   imbalance=True)
 
# set columns for preprocessing
preprocessing = machine_learning.PreProcessing()

# store results as a list of tuples
results = []

# loop through each paramter and parameter value
for key, values in lgb_tune.items():
    # loop through each 
    for value in values:
        
        # set parameter and value to tune
        parameter = { key : value }
        
        # perform random forst
        mse, score, report = machine_learning.LGboost(regressor=False, parameter_dict=parameter)
        
        results.append((key, value, mse, report['accuracy'], report['1']['precision'], 
                        report['1']['recall'], report['1']['f1-score']))

### Show best results

In [10]:
# store best result to be used as parameter
best_results_dict = {}

# loop through all parameter keys
for key in lgb_tune.keys():

    # get the values for current key
    values = [val for val in results if val[0] == key]
    
    # get the score for current key
    scores = [val[6] for val in values]
    
    # get parameter for current key and max score
    best_score = [val for val in values if val[6] == max(scores)]
    
    # set diction with best parameter and parameter value
    best_results_dict[best_score[0][0]] = best_score[0][1]

# print out results
print(best_results_dict)

# write results to file
file_name = f"{model}_best_results_dict.json"
json_data = json.dumps(best_results_dict)
SetDict(file_name,json_data)

{'learning_rate': 0.2, 'max_depth': 15, 'num_leaves': 31, 'n_estimators': 100, 'min_data_in_leaf': 100}


### Final, Tuned, LgBoost

In [None]:
# set model name
model = 'LGBoost' 

# call machine learning class
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}"),
                                   imbalance=True)
 
# setup pre processing
preprocessing = machine_learning.PreProcessing()

# get best_results_dict
best_results_dict = GetDict(f"{model}_best_results_dict.json")  


# perform random forest using best parameters
mse, score, report = machine_learning.LGboost(regressor=False, parameter_dict=best_results_dict)
        
# print final model score
print(f"The mean squared error is {mse} and accuracy is {report['accuracy']} and precision is {report['1']['precision']} and \
        recall is {report['1']['precision']} and f1 is {report['1']['f1-score']}")

# ------ Test Models ------

## Logistic Regression Cross Validation

In [None]:
# store name to be use in results
model = 'LogisticRegression'

# store results for all models
cv_results = {}

# load data
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}_CrossValidation"),
                                   imbalance=True)

# setup pre processing
preprocessing=machine_learning.PreProcessing()

# perform random forest using best parameters
mse = machine_learning.LogisticRegression(cross_validation=True)

# get max f1 score index
index = np.argmax(mse[4])

# get minimum mse and corresponding accuracy
min_mse = mse[0][index]
accuracy = mse[1][index]
precision = mse[2][index]
recall = mse[3][index]
f1 = mse[4][index]

# store final model score
cv_results[model] = min_mse, accuracy, precision, recall, f1

## Random Forest Cross Validation

In [None]:
# store name to be use in results
model = 'RandomForest'

# load data
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}_CrossValidation"),
                                   imbalance=True)

# setup pre processing
preprocessing = machine_learning.PreProcessing()

# get best_results_dict
best_results_dict = GetDict(f"{model}_best_results_dict.json")  

# perform random forest using best parameters
mse = machine_learning.RandomForest(
    parameter_dict=best_results_dict, regressor=False, cross_validation=True)

# get max f1 score index
index = np.argmax(mse[4])

# get minimum mse and corresponding accuracy
min_mse = mse[0][index]
accuracy = mse[1][index]
precision = mse[2][index]
recall = mse[3][index]
f1 = mse[4][index]

# store final model score
cv_results[model] = min_mse, accuracy, precision, recall, f1

## XgBoost Forest Cross Validation

In [None]:
# store name to be use in results
model = 'XGBoost'

# load data
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}_CrossValidation"),
                                   imbalance=True)

# setup pre processing
preprocessing = machine_learning.PreProcessing()

# get best_results_dict
best_results_dict = GetDict(f"{model}_best_results_dict.json")  

# perform random forest using best parameters
mse = machine_learning.XGboost(
    parameter_dict=best_results_dict, regressor=False, cross_validation=True)

# get max f1 score index
index = np.argmax(mse[4])

# get minimum mse and corresponding accuracy
min_mse = mse[0][index]
accuracy = mse[1][index]
precision = mse[2][index]
recall = mse[3][index]
f1 = mse[4][index]

# store final model score
cv_results[model] = min_mse, accuracy, precision, recall, f1

## LgBoost Forest Cross Validation

In [None]:
# store name to be use in results
model = 'LGBoost'

# load data
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}_CrossValidation"),
                                   imbalance=True)

# setup pre processing
preprocessing = machine_learning.PreProcessing()

# get best_results_dict
best_results_dict = GetDict(f"{model}_best_results_dict.json")  

# perform random forest using best parameters
mse = machine_learning.LGboost(
    parameter_dict=best_results_dict, regressor=False, cross_validation=True)

# get max f1 score index
index = np.argmax(mse[4])

# get minimum mse and corresponding accuracy
min_mse = mse[0][index]
accuracy = mse[1][index]
precision = mse[2][index]
recall = mse[3][index]
f1 = mse[4][index]

# store final model score
cv_results[model] = min_mse, accuracy, precision, recall, f1

## Show model results

In [None]:
# show model results
model_dataframe = pd.DataFrame()
model_dataframe['Model'] = cv_results.keys()
model_dataframe['MeanSquaredError'] = [v[0] for v in cv_results.values()]
model_dataframe['Accuracy'] = [v[1] for v in cv_results.values()]
model_dataframe['Precision'] = [v[2] for v in cv_results.values()]
model_dataframe['Recall'] = [v[3] for v in cv_results.values()]
model_dataframe['F1'] = [v[4] for v in cv_results.values()]
model_dataframe

## Save results to file

In [None]:
# write dataframe to file
model_dataframe.to_csv(os.path.join('../logs','FinalResults.csv'), index=False)

## Select best model

In [None]:
#select the model with the lowest error as your "production" model
model_dataframe = pd.read_csv(os.path.join('../logs', 'FinalResults.csv'), low_memory=False)
best_model = model_dataframe.loc[model_dataframe['F1'] == max(model_dataframe['F1'])]
best_model

## Save Final Model

In [None]:
# store name to be use in results
model = best_model['Model'].to_string(index=False).strip()

# load data
machine_learning = MachineLearning(train_data=os.path.join('../data','cleaned','procedure_data.tar.gz'), 
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}_FinalModel"),
                                   imbalance=True)

# setup pre processing
preprocessing = machine_learning.PreProcessing()

# choose the best model and save the model
if model == 'LogisticRegression':
    machine_learning.LogisticRegression(save_model=True)
elif model == 'RandomForest':
    best_results_dict = GetDict(f"{model}_best_results_dict.json")  
    machine_learning.RandomForest(
        parameter_dict=best_results_dict, regressor=False, save_model=True)
elif model == 'XGBoost':
    best_results_dict = GetDict(f"{model}_best_results_dict.json")  
    machine_learning.XGboost(parameter_dict=best_results_dict, regressor=False, save_model=True)
elif model == 'LGBoost':
    best_results_dict = GetDict(f"{model}_best_results_dict.json")  
    machine_learning.LGboost(parameter_dict=best_results_dict, regressor=False, save_model=True)
else:
    print(f'{model} not found')

## Feature Importance for Best Model

In [None]:
# store name to be use in results
model_name = best_model['Model'].to_string(index=False).strip()

# plot feature importance for the best model
FeatureImportance(model=model_name);
