# ------ Import Modules ------

In [1]:
# import necessary modules
import os
import pandas as pd
import numpy as np
import warnings
import json
from machine_learning import MachineLearning, random_tune, xgb_tune, lgb_tune, xgb, lgb
from config import GetDict, SetDict
from feature_importance import FeatureImportance
import h2o
from h2o.automl import H2OAutoML

# suppress warnings
# NOTE: with no category given this ignores every warning for the rest of the
# session, so library deprecation notices are hidden as well
warnings.filterwarnings('ignore')

# ------ Create Models ------

## Baseline Model (Logistic Regression)

In [2]:
# model name; it is also used to name the log file
model = 'LogisticRegression'

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}")

# build the modelling helper for the 'UnpaidClaim' label
machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before any model is fitted
preprocessing = machine_learning.PreProcessing()

# fit the baseline logistic regression and report both metrics
mse, score = machine_learning.LogisticRegression()
print(f"The mean squared error is {mse} and accuracy is {score}")

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['float', 'str']

## Random Forest Classifier Tuning

In [None]:
# model name; it is also used to name the log file
model = 'RandomForest'

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}")

machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before any model is fitted
preprocessing = machine_learning.PreProcessing()

# one (parameter, value, score) tuple per fitted model
results = []

# tune one parameter at a time: fit a random forest for every candidate
# value of every parameter in `random_tune`, passing only that single setting
for key in random_tune:
    for value in random_tune[key]:
        score = machine_learning.RandomForest(parameter_dict={key: value}, regressor=False)
        results.append((key, value, score))

### Show best results

In [None]:
# best value found for each tuned parameter, keyed by parameter name
best_results_dict = {}

# pick the winning value for every parameter that was tuned
for key in random_tune:

    # all (parameter, value, score) tuples recorded for this parameter
    candidates = [entry for entry in results if entry[0] == key]

    # nothing was fitted for this parameter (e.g. an empty value list),
    # so there is no best value to record
    if not candidates:
        continue

    # keep the candidate with the LOWEST score; on ties the first one fitted wins
    # NOTE(review): this assumes a lower score is better (e.g. the score starts
    # with the mean squared error) -- confirm against MachineLearning.RandomForest
    _, best_value, _ = min(candidates, key=lambda entry: entry[2])
    best_results_dict[key] = best_value

# print best result
print(best_results_dict)

# write results to file so later cells can reload them by model name
file_name = f"{model}_best_results_dict.json"
json_data = json.dumps(best_results_dict)
SetDict(file_name, json_data)

### Final, Tuned, Random Forest

In [None]:
# model name; it is also used to name the log file and the parameter file
model = 'RandomForest'

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}")

machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before the model is fitted
preprocessing = machine_learning.PreProcessing()

# reload the tuned parameters saved for this model
best_results_dict = GetDict(f"{model}_best_results_dict.json")

# fit the random forest with the tuned parameters and report both metrics
mse, score = machine_learning.RandomForest(regressor=False, parameter_dict=best_results_dict)
print(f"The mean squared error is {mse} and accuracy is {score}")

## XgBoost Tuning

In [None]:
# model name; it is also used to name the log file
model = 'XGBoost'

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}")

machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before any model is fitted
preprocessing = machine_learning.PreProcessing()

# one (parameter, value, score) tuple per fitted model
results = []

# tune one parameter at a time: fit an XGBoost model for every candidate
# value of every parameter in `xgb_tune`, passing only that single setting
for key in xgb_tune:
    for value in xgb_tune[key]:
        score = machine_learning.XGboost(regressor=False, parameter_dict={key: value})
        results.append((key, value, score))

### Show best results

In [None]:
# best value found for each tuned parameter, keyed by parameter name
best_results_dict = {}

# pick the winning value for every parameter that was tuned
for key in xgb_tune:

    # all (parameter, value, score) tuples recorded for this parameter
    candidates = [entry for entry in results if entry[0] == key]

    # nothing was fitted for this parameter (e.g. an empty value list),
    # so there is no best value to record
    if not candidates:
        continue

    # keep the candidate with the LOWEST score; on ties the first one fitted wins
    # NOTE(review): this assumes a lower score is better (e.g. the score starts
    # with the mean squared error) -- confirm against MachineLearning.XGboost
    _, best_value, _ = min(candidates, key=lambda entry: entry[2])
    best_results_dict[key] = best_value

# print out results
print(best_results_dict)

# write results to file so later cells can reload them by model name
file_name = f"{model}_best_results_dict.json"
json_data = json.dumps(best_results_dict)
SetDict(file_name, json_data)

### Final, Tuned, XgBoost

In [None]:
# model name; it is also used to name the log file and the parameter file
model = 'XGBoost'

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}")

machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before the model is fitted
preprocessing = machine_learning.PreProcessing()

# reload the tuned parameters saved for this model
best_results_dict = GetDict(f"{model}_best_results_dict.json")

# fit XGBoost with the tuned parameters and print both returned metrics
mse, score = machine_learning.XGboost(regressor=False, parameter_dict=best_results_dict)
print(mse, score)

## LgBoost Tuning

In [None]:
# model name; it is also used to name the log file
model = 'LGBoost'

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}")

machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before any model is fitted
preprocessing = machine_learning.PreProcessing()

# one (parameter, value, score) tuple per fitted model
results = []

# tune one parameter at a time: fit an LGBoost model for every candidate
# value of every parameter in `lgb_tune`, passing only that single setting
for key in lgb_tune:
    for value in lgb_tune[key]:
        score = machine_learning.LGboost(regressor=False, parameter_dict={key: value})
        results.append((key, value, score))

### Show best results

In [None]:
# best value found for each tuned parameter, keyed by parameter name
best_results_dict = {}

# pick the winning value for every parameter that was tuned
for key in lgb_tune:

    # all (parameter, value, score) tuples recorded for this parameter
    candidates = [entry for entry in results if entry[0] == key]

    # nothing was fitted for this parameter (e.g. an empty value list),
    # so there is no best value to record
    if not candidates:
        continue

    # keep the candidate with the LOWEST score; on ties the first one fitted wins
    # NOTE(review): this assumes a lower score is better (e.g. the score starts
    # with the mean squared error) -- confirm against MachineLearning.LGboost
    _, best_value, _ = min(candidates, key=lambda entry: entry[2])
    best_results_dict[key] = best_value

# print out results
print(best_results_dict)

# write results to file so later cells can reload them by model name
file_name = f"{model}_best_results_dict.json"
json_data = json.dumps(best_results_dict)
SetDict(file_name, json_data)

### Final, Tuned, LgBoost

In [None]:
# set model name; it is also used to name the log file and the parameter file
model = 'LGBoost'

# call machine learning class
machine_learning = MachineLearning(train_data=os.path.join('../data', 'cleaned', 'procedure_data.tar.gz'),
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}"),
                                   imbalance=True)

# setup pre processing
preprocessing = machine_learning.PreProcessing()

# get best_results_dict saved by the tuning step
best_results_dict = GetDict(f"{model}_best_results_dict.json")

# fit LGBoost using the best parameters
# The result is unpacked as (mse, score) to match the RandomForest and XGBoost
# final-model cells; the printed output is unchanged by this rename.
# NOTE(review): assumes LGboost returns (mse, accuracy) in that order, as the
# cross-validation cell indexes it -- confirm against MachineLearning.LGboost
mse, score = machine_learning.LGboost(regressor=False, parameter_dict=best_results_dict)

# print final model score
print(mse, score)

# ------ Test Models ------

## Logistic Regression Cross Validation

In [None]:
# model name; it is also the key under which the result is stored
model = 'LogisticRegression'

# results of every cross-validated model, keyed by model name
cv_results = {}

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}_CrossValidation")

machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before the model is fitted
preprocessing = machine_learning.PreProcessing()

# cross-validate the logistic regression; the result is read below as
# [0] -> mean squared errors and [1] -> accuracies
# (presumably one entry per fold -- confirm against MachineLearning)
cv_scores = machine_learning.LogisticRegression(cross_validation=True)
mse_values, accuracy_values = cv_scores[0], cv_scores[1]

# position of the lowest mean squared error
best = np.argmin(mse_values)

# keep the lowest mean squared error and the accuracy at the same position
cv_results[model] = (mse_values[best], accuracy_values[best])

## Random Forest Cross Validation

In [None]:
# model name; it is also the key under which the result is stored
model = 'RandomForest'

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}_CrossValidation")

machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before the model is fitted
preprocessing = machine_learning.PreProcessing()

# reload the tuned parameters saved for this model
best_results_dict = GetDict(f"{model}_best_results_dict.json")

# cross-validate the tuned random forest; the result is read below as
# [0] -> mean squared errors and [1] -> accuracies
# (presumably one entry per fold -- confirm against MachineLearning)
cv_scores = machine_learning.RandomForest(
    parameter_dict=best_results_dict, regressor=False, cross_validation=True)
mse_values, accuracy_values = cv_scores[0], cv_scores[1]

# position of the lowest mean squared error
best = np.argmin(mse_values)

# keep the lowest mean squared error and the accuracy at the same position
cv_results[model] = (mse_values[best], accuracy_values[best])

## XgBoost Cross Validation

In [None]:
# model name; it is also the key under which the result is stored
model = 'XGBoost'

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}_CrossValidation")

machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before the model is fitted
preprocessing = machine_learning.PreProcessing()

# reload the tuned parameters saved for this model
best_results_dict = GetDict(f"{model}_best_results_dict.json")

# cross-validate the tuned XGBoost model; the result is read below as
# [0] -> mean squared errors and [1] -> accuracies
# (presumably one entry per fold -- confirm against MachineLearning)
cv_scores = machine_learning.XGboost(
    parameter_dict=best_results_dict, regressor=False, cross_validation=True)
mse_values, accuracy_values = cv_scores[0], cv_scores[1]

# position of the lowest mean squared error
best = np.argmin(mse_values)

# keep the lowest mean squared error and the accuracy at the same position
cv_results[model] = (mse_values[best], accuracy_values[best])

## LgBoost Cross Validation

In [None]:
# model name; it is also the key under which the result is stored
model = 'LGBoost'

# location of the cleaned training data and of this run's log
train_path = os.path.join('../data', 'cleaned', 'procedure_data.tar.gz')
log_path = os.path.join('../logs', f"{model}_CrossValidation")

machine_learning = MachineLearning(
    train_data=train_path,
    label='UnpaidClaim',
    log_file=log_path,
    imbalance=True,
)

# run the pre-processing step before the model is fitted
preprocessing = machine_learning.PreProcessing()

# reload the tuned parameters saved for this model
best_results_dict = GetDict(f"{model}_best_results_dict.json")

# cross-validate the tuned LGBoost model; the result is read below as
# [0] -> mean squared errors and [1] -> accuracies
# (presumably one entry per fold -- confirm against MachineLearning)
cv_scores = machine_learning.LGboost(
    parameter_dict=best_results_dict, regressor=False, cross_validation=True)
mse_values, accuracy_values = cv_scores[0], cv_scores[1]

# position of the lowest mean squared error
best = np.argmin(mse_values)

# keep the lowest mean squared error and the accuracy at the same position
cv_results[model] = (mse_values[best], accuracy_values[best])

## Show model results

In [None]:
# one row per cross-validated model: its name, mean squared error and accuracy
model_dataframe = pd.DataFrame({
    'Model': list(cv_results.keys()),
    'MeanSquaredError': [pair[0] for pair in cv_results.values()],
    'Accuracy': [pair[1] for pair in cv_results.values()],
})

# bare expression so the notebook renders the table
model_dataframe

## Save results to file

In [None]:
# persist the comparison table (without the index column) so the
# model-selection cell can reload it
results_csv = os.path.join('../logs', 'FinalResults.csv')
model_dataframe.to_csv(results_csv, index=False)

## Select best model

In [None]:
# select the model with the lowest error as your "production" model
model_dataframe = pd.read_csv(os.path.join('../logs', 'FinalResults.csv'), low_memory=False)

# Series.min() skips missing values, whereas the builtin min() gives an
# order-dependent answer when a NaN is present
lowest_mse = model_dataframe['MeanSquaredError'].min()

# keep every row that reaches the lowest error (ties yield several rows)
best_model = model_dataframe.loc[model_dataframe['MeanSquaredError'] == lowest_mse]

# bare expression so the notebook renders the selection
best_model

## Save Final Model

In [None]:
# name of the winning model; it drives the log file name and the dispatch below
# `best_model` can hold several rows when models tie on error, and
# `to_string()` would then join every name into one multi-line string,
# so take the first row explicitly instead
if best_model.empty:
    raise ValueError("best_model is empty: no cross-validation results to choose from")
model = str(best_model['Model'].iloc[0])

# load data
machine_learning = MachineLearning(train_data=os.path.join('../data', 'cleaned', 'procedure_data.tar.gz'),
                                   label='UnpaidClaim',
                                   log_file=os.path.join('../logs', f"{model}_FinalModel"),
                                   imbalance=True)

# setup pre processing
preprocessing = machine_learning.PreProcessing()

# models that are refitted with their saved, tuned parameters
tuned_trainers = {
    'RandomForest': machine_learning.RandomForest,
    'XGBoost': machine_learning.XGboost,
    'LGBoost': machine_learning.LGboost,
}

if model == 'LogisticRegression':
    # the baseline has no tuned parameters to load
    machine_learning.LogisticRegression(save_model=True)
elif model in tuned_trainers:
    # reload the tuned parameters for this model, then refit and save it
    best_results_dict = GetDict(f"{model}_best_results_dict.json")
    tuned_trainers[model](parameter_dict=best_results_dict, regressor=False, save_model=True)
else:
    print(f'{model} not found')

## Feature Importance for Best Model

In [None]:
# name of the winning model
# `best_model` can hold several rows when models tie on error, and
# `to_string()` would then join every name into one multi-line string,
# so take the first row explicitly instead
if best_model.empty:
    raise ValueError("best_model is empty: no cross-validation results to choose from")
model_name = str(best_model['Model'].iloc[0])

# plot feature importance for the best model
# (the trailing semicolon keeps the notebook from echoing the returned object)
FeatureImportance(model=model_name);



## h2o Auto ML

In [None]:
# # initiate h2o
# h2o.init()

# # Import claims data (same split used in above modeling) set into H2O
# train = h2o.import_file(os.path.join('../data', 'train_test_split', 'procedure_train_data.csv'))
# test = h2o.import_file(os.path.join('../data', 'train_test_split', 'procedure_test_data.csv'))

# # Identify predictors and response
# x = train.columns
# y = "UnpaidClaim"
# x.remove(y)

# # Run AutoML for 20 base models (limited to 1 hour max runtime by default)
# # Exclude Stacked Ensemble models as they will not include Feature Importance
# aml = H2OAutoML(max_models=20, seed=1, exclude_algos=['StackedEnsemble'])
# aml.train(x=x, y=y, training_frame=train)

# # View the AutoML Leaderboard
# lb = aml.leaderboard
# lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [None]:
# The leader model is stored here
# aml.leader