## import packages

In [None]:
import sys
import time
from pathlib import Path
import os

import pandas as pd
import numpy as np

from sklearn.preprocessing import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

## set up for imports of .py modules by adding path to sys.path

In [None]:
# Put the current working directory on the module search path (at position 1) so the
# .py modules imported in the next cell can be resolved.
path = str(Path(os.getcwd()))
print(path)
sys.path.insert(1, path)

## import python modules

In [None]:
import utils.regression_utils as reg_utils
import utils.sml_utils as sml_utils
import utils.bin_class_utils as class_utils
import utils.assign_and_lab_utils as al_utils
import utils.assign_3_utils as assign_3_utils
import utils.multi_colinearity_utils as mc_utils

## helpful functions

## parameters

In [None]:
# switch between the two synthetic data sets
with_milti_colinearity = True

# data set label, csv path and pickled column-index shuffle map path for each switch value
data_set, path_to_data, path_to_col_idx_shuffle_map = {
    True: (
        'data_set_1',
        'data/synth_2_class_10000_0_55_0_45_15_6_4_1_4_0_0_0_5_w_noms.csv',
        'data/synth_2_class_10000_0_55_0_45_15_6_4_1_4_0_0_0_5_w_noms_col_idx_shuffle_map.pkl',
    ),
    False: (
        'data_set_2',
        'data/synth_2_class_10000_0_55_0_45_15_6_0_0_9_0_0_0_5_w_noms.csv',
        'data/synth_2_class_10000_0_55_0_45_15_6_0_0_9_0_0_0_5_w_noms_col_idx_shuffle_map.pkl',
    ),
}[bool(with_milti_colinearity)]

# target vector - name of the target column checked for missingness
target_attr = 'y'

# train/test split
test_size = 0.20
train_test_split_random_state = 42

# train/validation split - no parameters

# fraction of missing values above which an attribute is flagged
missingness_threshold = 0.20

# random state handed to the target encoder
target_encoder_random_state = 42

# model selection: number of mse standard deviations to give up for a reduction in variables
# (wording kept from the original notebook; scoring below is neg_log_loss - TODO confirm).
# Kept as a per-data-set choice so the two values can be tuned independently.
num_std = 1.0 if with_milti_colinearity else 1.0

model_type = 'LogisticRegressionCV'  # model specific cv from sklearn
cv_folds = 5  # number of folds for model specific cv
scoring = 'neg_log_loss'
max_iter = 100
class_weight = None  # changing this could impact regularization

penalty = 'l1'  # regularization penalty: one of 'l1', 'l2', 'elasticnet'

# Search settings per penalty. Every entry defines all three values that are later passed to
# model_specific_cv (l1_ratio_list, solver, cap_c_s), so switching the penalty can never leave
# one of them undefined. cap_c_s (Cs) describes the inverse of regularization strength.
penalty_settings = {
    'l1': {
        'l1_ratio_list': None,
        'solver': 'saga',  # solver can be {'liblinear', 'saga'}
        'cap_c_s': np.logspace(-4, 1, 50),
    },
    'l2': {
        'l1_ratio_list': None,
        # solver can be {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}
        'solver': 'saga',
        'cap_c_s': np.logspace(-7, 1, 50),
    },
    'elasticnet': {
        # values of l1 ratio to search over
        'l1_ratio_list': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0],
        'solver': 'saga',  # solver can be {'saga'}
        # NOTE(review): the original elasticnet branch defined no Cs grid at all, which made the
        # later model_specific_cv call fail with a NameError; the l1 grid is reused here as a
        # starting point - confirm the intended range.
        'cap_c_s': np.logspace(-4, 1, 50),
    },
}

if penalty not in penalty_settings:
    sys.exit(f'penalty {penalty} is unrecognized')

l1_ratio_list = penalty_settings[penalty]['l1_ratio_list']
solver = penalty_settings[penalty]['solver']
cap_c_s = penalty_settings[penalty]['cap_c_s']

# understand nature of synthetic attributes
# Acts as an on/off switch here; when it is truthy, a later cell rebinds this same name to the
# object unpickled from path_to_col_idx_shuffle_map.
col_idx_shuffle_map = False

# check out attribute multi colinearity
# The multi colinearity cells only run when this flag and col_idx_shuffle_map are both truthy.
check_out_multi_colinearity = False

## set up to time script run time

In [None]:
# wall-clock reference point; the final cell subtracts it from the end time to report run time
start = time.time()

## read in the data and get the size of the data

In [None]:
# load the csv selected in the parameters cell into a data frame
df = pd.read_csv(path_to_data)
print(df.shape)  # (rows, columns) as read
df.head()  # last expression of the cell, so the notebook renders the first rows

## out of pipeline preprocessing

### These operations cannot be completed in the scikit-learn pipeline.

### They should be identified and passed on to the data engineer as tasks conducted during extract/transform/load (ETL) if the model goes to production.

## 1. check for missingness in target vector

In [None]:
print(df.shape)  # shape before dropping
# drop every row whose target value is missing
df = df.dropna(subset=target_attr)
print(df.shape)  # shape after dropping; compare with the line above to see how many rows went

## 2. label binarize

In [None]:
# Only a target that pandas infers as 'string' is binarized; any other inferred dtype leaves
# df untouched and is just reported.
target_inferred_dtype = pd.api.types.infer_dtype(df[target_attr])
if target_inferred_dtype != 'string':
    print('df[target_attr] is not a string attribute')
else:
    df, le_name_mapping = class_utils.label_binarize_binary(df, target_attr, print_results=True)

## 3. train/test split

In [None]:
# split df using the configured test fraction and random state; only the training attributes
# and the training target are unpacked here.
# NOTE(review): presumably val=False means no validation partition is produced - confirm in sml_utils.
train_cap_x_df, train_y_df = sml_utils.perform_the_train_test_split(df, test_size, train_test_split_random_state, val=False)

## 4. train/validation split

### We will use the k-fold cross validation to select a model - no validation set is required.

## check out the attribute types

In [None]:
# display the dtype of every training attribute
train_cap_x_df.dtypes

## 5. identify attributes with  missingness above threshold

In [None]:
# run the missingness check on the training attributes with the configured threshold
return_dict = sml_utils.get_missingness(train_cap_x_df, missingness_threshold)
# presumably the attributes whose missingness exceeds the threshold - confirm in sml_utils;
# it is concatenated into ml_ignore_list further down
missingness_drop_list = return_dict['missingness_drop_list']

## 6. identify non machine learning attributes

In [None]:
# NOTE(review): presumably reports attributes whose values are all unique (identifier-like
# columns) - confirm in sml_utils
sml_utils.check_for_complete_unique_attrs(train_cap_x_df)

In [None]:
# hand-entered list of non machine learning attributes; it becomes part of ml_ignore_list
non_ml_attr_list = ['id']

## 7. identify attributes to exclude from machine learning

In [None]:
# display the column names to help choose attributes to exclude
train_cap_x_df.columns

In [None]:
# hand-entered list of further attributes to exclude from machine learning (none by default)
ml_attr_drop_list = []

## 8. establish machine learning attribute configuration

In [None]:
# every attribute that will not be used for machine learning: the missingness drop list,
# the non machine learning attributes and the manually excluded attributes, in that order
ml_ignore_list = missingness_drop_list + non_ml_attr_list + ml_attr_drop_list
ml_ignore_list  # last expression of the cell, so the notebook renders the list

In [None]:
# display the column names again as a reference for filling in the attribute lists below
train_cap_x_df.columns

In [None]:
# display the dtypes as a reference for separating numerical from nominal attributes
train_cap_x_df.dtypes

In [None]:
# Enter the remaining numerical attributes to be used in machine learning into the
# numerical_attr list below.

numerical_attr = []

# Enter the remaining nominal attributes to be used in machine learning into the
# nominal_attr list below.

nominal_attr = []

# Got them all? The number of columns must equal the number of attributes placed in the three
# lists. On failure the message names the columns that are in none of the lists, so the
# missing entries can be filled in without hunting for them.
accounted_attr = ml_ignore_list + nominal_attr + numerical_attr
unassigned_attr = [col for col in train_cap_x_df.columns if col not in accounted_attr]
assert train_cap_x_df.shape[1] == len(accounted_attr), (
    f'{train_cap_x_df.shape[1]} attributes in train_cap_x_df but {len(accounted_attr)} assigned to '
    f'ml_ignore_list/nominal_attr/numerical_attr; unassigned columns: {unassigned_attr}'
)

print(f'ml_ignore_list: {ml_ignore_list}')
print(f'\nnumerical_attr: {numerical_attr}')
print(f'nominal_attr: {nominal_attr}')

print(f'\nnumber of machine learning attributes: {len(numerical_attr) + len(nominal_attr)}')
print(f'\nnumerical_attr and nominal_attr: {numerical_attr + nominal_attr}')

## 9. assess target attribute imbalance

In [None]:
# display the distinct values present in the training target
train_y_df[target_attr].unique().tolist()

In [None]:
# display the relative frequency of each target value to judge class imbalance
train_y_df[target_attr].value_counts(normalize=True)

## here we deviate from the binary classification flow - we are working to find a less complex model by variable selection

In [None]:
# Gather every argument of the model specific cross validation in one place, then run it on
# the training data.
cv_settings = dict(
    # data and attribute configuration
    cap_x_df=train_cap_x_df,
    y_df=train_y_df,
    nominal_attr=nominal_attr,
    numerical_attr=numerical_attr,
    # estimator choice and target encoder seed
    model_type=model_type,
    te_random_state=target_encoder_random_state,
    # regularization search settings from the parameters cell
    cap_c_s=cap_c_s,
    cv_folds=cv_folds,
    penalty=penalty,
    scoring=scoring,
    solver=solver,
    max_iter=max_iter,
    class_weight=class_weight,
    l1_ratio_list=l1_ratio_list,
    # model selection tolerance
    num_std=num_std,
)
return_dict = class_utils.model_specific_cv(**cv_settings)

# Pull the results used by the following cells out of the returned dictionary.
(
    preproc_cap_x_df,
    model_type,
    fitted_model_cv,
    fitted_coef_dict,
    new_fitted_coef_dict,
) = (
    return_dict[result_key]
    for result_key in (
        'preproc_cap_x_df',
        'model_type',
        'fitted_model_cv',
        'fitted_coef_dict',
        'new_fitted_coef_dict',
    )
)

## load the col_idx_shuffle_map to understand what types of attributes have had their coef driven to 0 by Lasso

In [None]:
# col_idx_shuffle_map is the on/off switch from the parameters cell; when it is truthy the
# same name is rebound below to the object unpickled from path_to_col_idx_shuffle_map, and the
# later cells then use it as a mapping.
# NOTE(review): pickle.load can execute arbitrary code - only point the path at trusted files.
if col_idx_shuffle_map:
    
    with open(path_to_col_idx_shuffle_map, 'rb') as f:
        col_idx_shuffle_map = pickle.load(f)
    
    
    print(f'in col_idx_shuffle_map the key is the attributes data frame column index and the value is a tuple that describes the'
          f' nature of the attribute\n')
    # list every entry of the loaded map
    for attr, attr_nature in col_idx_shuffle_map.items():
        print(f'attr: {attr}; attr_nature: {attr_nature}')

## check out the nature of the attributes whose coef were driven to 0

In [None]:
if col_idx_shuffle_map:

    # attributes whose coef is exactly 0, in dictionary order
    attr_to_drop = [attr for attr, coef in new_fitted_coef_dict.items() if coef == 0]

    # report what kind of attribute each one is
    for attr in attr_to_drop:
        value = col_idx_shuffle_map[attr]
        print(f'attribute {attr} is a {value[2]} {value[1]} attribute with coef {new_fitted_coef_dict[attr]}')

    # removal happens only after the reporting pass has completed
    for attr in attr_to_drop:
        del new_fitted_coef_dict[attr]

## check out the nature of the remaining attributes with non zero coef

In [None]:
if col_idx_shuffle_map:

    # describe every attribute still present in new_fitted_coef_dict
    for attr in new_fitted_coef_dict:
        value = col_idx_shuffle_map[attr]
        print(f'attribute {attr} is a {value[2]} {value[1]} attribute with coef {new_fitted_coef_dict[attr]}')

In [None]:
if col_idx_shuffle_map:

    # one row per attribute still present in new_fitted_coef_dict: its name, its coef and
    # element 1 of its col_idx_shuffle_map entry
    df_row_dict_list = [
        {
            'attr': attr,
            'coef': coef,
            'nature': col_idx_shuffle_map[attr][1],
        }
        for attr, coef in new_fitted_coef_dict.items()
    ]
    # columns are given explicitly so the frame has them even when there are no rows
    plotting_df = pd.DataFrame(df_row_dict_list, columns=['attr', 'coef', 'nature'])
    # a bare expression nested inside an if block is not rendered by the notebook, so the
    # frame is printed explicitly
    print(plotting_df)

In [None]:
# histogram of the coef column of plotting_df, colored by the nature column
if col_idx_shuffle_map:
    sns.histplot(plotting_df, x='coef', hue='nature')
    plt.grid()
    plt.show()

## check out multicollinearity

In [None]:
if check_out_multi_colinearity and col_idx_shuffle_map:
    # all machine learning attributes, nominal first
    a_num_attr_list = nominal_attr + numerical_attr
    vifs_df = mc_utils.print_vifs(
        preproc_cap_x_df,
        a_num_attr_list,
        vif_inspection_threshold=2,
        ols_large_vifs=True,
    )
    # annotate each attribute with its col_idx_shuffle_map entry
    vifs_df['nature'] = vifs_df['attribute'].map(col_idx_shuffle_map)

In [None]:
if check_out_multi_colinearity and col_idx_shuffle_map:

    print(vifs_df)

    # pairwise correlations of the preprocessed attributes, rounded to 2 decimals
    correlation_matrix = preproc_cap_x_df.corr().round(2).values

    # entries strictly above the diagonal: every attribute pair once, self-correlations excluded
    n_attrs = correlation_matrix.shape[0]
    triu_correlation_matrix = correlation_matrix[np.triu_indices(n_attrs, k=1)]
    flattened_corr_matrix = np.sort(triu_correlation_matrix.flatten())

    # two blank lines, then the sorted correlations, their max and their min,
    # each followed by one blank line
    print('\n')
    print(flattened_corr_matrix, end='\n\n')
    print(flattened_corr_matrix.max(), end='\n\n')
    print(flattened_corr_matrix.min(), end='\n\n')

    # annotated heat map of the full correlation matrix
    fig, ax = plt.subplots(1, 1, figsize=(15, 5))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
    plt.show()

In [None]:
if check_out_multi_colinearity and col_idx_shuffle_map and data_set == 'data_set_1':

    # work on a copy so numerical_attr itself keeps every attribute
    temp_numerical_attr = list(numerical_attr)
    for dropped_attr in ('attr_10', 'attr_14'):
        temp_numerical_attr.remove(dropped_attr)

    # rerun the model specific cross validation with the reduced numerical attribute list;
    # every other setting is the same as in the first run
    reduced_cv_settings = dict(
        cap_x_df=train_cap_x_df,
        y_df=train_y_df,
        nominal_attr=nominal_attr,
        numerical_attr=temp_numerical_attr,
        model_type=model_type,
        te_random_state=target_encoder_random_state,
        cap_c_s=cap_c_s,
        cv_folds=cv_folds,
        penalty=penalty,
        scoring=scoring,
        solver=solver,
        max_iter=max_iter,
        class_weight=class_weight,
        l1_ratio_list=l1_ratio_list,
        num_std=num_std,
    )
    return_dict = class_utils.model_specific_cv(**reduced_cv_settings)

## check out script run time

In [None]:
# elapsed wall-clock time since the timer cell, reported in minutes
end = time.time()
elapsed_minutes = (end - start) / 60
print(f'script run time: {elapsed_minutes} minutes')