In [1]:
import os
import json

# Third Party
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_score
from sklearn.calibration import calibration_curve
from sklearn import metrics
# import utils.credit as utils

# Bedrock
#from bedrock_client.bedrock.analyzer.model_analyzer import ModelAnalyzer
#from bedrock_client.bedrock.analyzer import ModelTypes
#from bedrock_client.bedrock.api import BedrockApi
#from bedrock_client.bedrock.metrics.service import ModelMonitoringService
#import pickle
#import logging

# ---------------------------------
# Constants
# ---------------------------------

# Absolute artefact locations.
# NOTE(review): neither path is referenced by the cells visible here — confirm they are still needed.
OUTPUT_MODEL_PATH = "/artefact/model.pkl"
FEATURE_COLS_PATH = "/artefact/feature_cols.pkl"

# Privileged / unprivileged group definition per protected attribute
# (presumably consumed by a fairness analysis — TODO confirm).
# NOTE(review): no visible cell reads CONFIG_FAI, and the feature columns shown
# in this notebook do not include a 'SEX' column.
CONFIG_FAI = {
    'SEX': {
        'privileged_attribute_values': [1],
        'privileged_group_name': 'Male',  # privileged group name corresponding to values=[1]
        'unprivileged_attribute_values': [2],
        'unprivileged_group_name': 'Female',  # unprivileged group name corresponding to values=[2]
    }
}


In [2]:
def load_dataset(filepath, target='TARGET', drop_columns=None):
    """Load a CSV file and split it into a feature frame and a target array.

    Rows containing any missing value are removed (a warning with the number
    of removed rows is printed) and the remaining columns are passed through
    ``compress_df_mem`` before the target column is split off.

    Args:
        filepath: Path of the CSV file, handed to ``pandas.read_csv``.
        target: Name of the column holding the labels.
        drop_columns: Column label(s) to discard right after loading.
            ``None`` (the default) drops nothing.

    Returns:
        A ``(features, y)`` tuple where ``features`` is the DataFrame without
        the ``target`` column and ``y`` is a NumPy array of the target values.

    Raises:
        KeyError: If ``target`` or one of ``drop_columns`` is not a column of
            the loaded file.
    """
    df = pd.read_csv(filepath)
    # Extraneous columns
    if drop_columns is not None:
        df.drop(drop_columns, axis=1, inplace=True)
    original_len = len(df)
    # Ensure nothing missing
    df.dropna(how="any", axis=0, inplace=True)
    n_dropped = original_len - len(df)
    if n_dropped != 0:
        print(f"Warning - dropped {n_dropped} rows with NA data.")
    df = compress_df_mem(df)  # Need this to detect categorical variables for SMOTe
    # Copy the labels out before the target column is removed from the features.
    y = np.array(df[target])
    df.drop(target, axis=1, inplace=True)
    return df, y


def compress_df_mem(df):
    """Downcast the numeric columns of ``df`` to narrower dtypes, in place.

    Integer columns (signed or unsigned) are cast to the first of
    int8/int16/int32/int64 whose range strictly contains the column's min and
    max. Float columns are cast to float16 or float32 when their range fits,
    otherwise to float64. Columns of any other dtype (object/string, bool,
    datetime, categorical, pandas extension dtypes) are left untouched.

    Note that float16 keeps only about three significant decimal digits, so
    downcast float values are rounded.

    Args:
        df: DataFrame to compress; its columns are reassigned in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Dispatch on the NumPy dtype kind: 'i'/'u' are integers, 'f' is float.
        # Anything else has no meaningful numeric range to downcast against.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        col_min = df[col].min()
        col_max = df[col].max()

        if col_type.kind in "iu":
            if pd.isna(col_min) or pd.isna(col_max):
                # Empty column: there is no range to fit, keep the dtype.
                continue
            # Plain Python ints keep the bound comparisons exact for unsigned input.
            col_min, col_max = int(col_min), int(col_max)
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if col_min > bounds.min and col_max < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits (e.g. very large unsigned values): keep the dtype.
        else:
            for candidate in float_candidates:
                bounds = np.finfo(candidate)
                if col_min > bounds.min and col_max < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: use full precision.
                df[col] = df[col].astype(np.float64)
    return df


In [3]:
# Load data/abalone.csv as-is (no NA handling or dtype compression).
# NOTE(review): `abalone` is not referenced by any later cell shown here.
abalone = pd.read_csv(os.path.join('data', 'abalone.csv'))

In [24]:
# Training split: features plus the 'Type' labels, followed by a derived 0/1
# column flagging rows whose ring count is greater than 10.
x_train, y_train = load_dataset(
    os.path.join('data', 'abalone_train.csv'),
    target='Type',
)
x_train["large_ring"] = x_train["Rings"].gt(10).astype(int)

In [29]:
# Check the feature columns: the 'Type' target has been split off by
# load_dataset and the derived `large_ring` column is present.
x_train.columns

Index(['LongestShell', 'Diameter', 'Height', 'WholeWeight', 'ShuckedWeight',
       'VisceraWeight', 'ShellWeight', 'Rings', 'large_ring'],
      dtype='object')

In [25]:
# Display the training features.
# NOTE(review): in the saved output below, rows with Rings 11 and 12 show
# large_ring == 0, which does not match the `Rings > 10` rule used to build the
# column — the output looks stale; re-run from a fresh kernel to confirm.
x_train

Unnamed: 0,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings,large_ring
0,0.244995,0.194946,0.059998,0.094971,0.044495,0.024506,0.026001,4,0
1,0.620117,0.509766,0.180054,1.331055,0.594238,0.275879,0.387939,11,0
2,0.455078,0.344971,0.104980,0.400391,0.163940,0.075500,0.125977,8,0
3,0.479980,0.354980,0.114990,0.472412,0.206543,0.112000,0.131958,8,0
4,0.429932,0.324951,0.114990,0.386475,0.147461,0.106506,0.109985,11,0
...,...,...,...,...,...,...,...,...,...
2917,0.455078,0.350098,0.140015,0.572266,0.196533,0.132446,0.175049,10,0
2918,0.665039,0.535156,0.224976,2.183594,0.753418,0.391113,0.884766,27,1
2919,0.439941,0.350098,0.135010,0.435059,0.181519,0.083008,0.125000,12,0
2920,0.504883,0.405029,0.140015,0.875000,0.266602,0.173950,0.284912,12,0


In [6]:
# Raw training labels, still the original string codes at this point.
y_train

array(['I', 'F', 'I', ..., 'I', 'M', 'M'], dtype=object)

In [26]:
# Test split, prepared the same way as the training split: features plus the
# 'Type' labels, then the derived 0/1 flag for ring counts greater than 10.
x_test, y_test = load_dataset(
    os.path.join('data', 'abalone_test.csv'),
    target='Type',
)
x_test["large_ring"] = x_test["Rings"].gt(10).astype(int)

In [27]:
# Display the test features (same columns as the training frame).
x_test

Unnamed: 0,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings,large_ring
0,0.649902,0.535156,0.175049,1.289062,0.609375,0.276611,0.343994,10,0
1,0.469971,0.375000,0.125000,0.561523,0.251953,0.136963,0.180054,10,0
2,0.334961,0.250000,0.075012,0.186035,0.094482,0.037994,0.044495,7,0
3,0.274902,0.199951,0.054993,0.092529,0.037994,0.020996,0.026001,4,0
4,0.555176,0.439941,0.155029,1.015625,0.493408,0.185547,0.262939,10,0
...,...,...,...,...,...,...,...,...,...
1250,0.485107,0.370117,0.140015,0.572266,0.203979,0.141479,0.175049,10,0
1251,0.445068,0.360107,0.109985,0.423584,0.182007,0.076477,0.140015,9,0
1252,0.475098,0.354980,0.135010,0.477539,0.214478,0.090027,0.143555,8,0
1253,0.580078,0.459961,0.130005,0.920898,0.356934,0.181030,0.290039,13,1


In [9]:
# Raw test labels, still the original string codes at this point.
y_test

array(['F', 'F', 'I', ..., 'I', 'M', 'F'], dtype=object)

In [10]:
# Label codes found in the 'Type' target.
# NOTE(review): `var_mod` is not used by any cell shown here.
var_mod = ['F', 'M', 'I']
# Encoder that turns the string labels into integer class ids.
le = LabelEncoder()

In [11]:
# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the test labels. Re-fitting on the test labels could
# assign different ids if the test set held a different set of classes;
# `transform` raises ValueError on a label the encoder has not seen.
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [12]:
# Training labels after encoding: integer class ids instead of strings.
y_train

array([1, 0, 1, ..., 1, 2, 2])

In [13]:
# Test labels after encoding: integer class ids instead of strings.
y_test

array([0, 0, 1, ..., 1, 2, 0])

In [14]:
def train_log_reg_model(X, y, seed=0, C=1, verbose=False, upsample=True):
    """Fit a standardised logistic-regression classifier and return it as a Pipeline.

    Args:
        X: Feature DataFrame. When ``upsample`` is true its column dtypes are
            inspected, and every ``int8`` column is treated as categorical.
        y: Target labels aligned with the rows of ``X``.
        seed: Random state used for SMOTENC and LogisticRegression.
        C: Value passed to ``LogisticRegression(C=...)`` (inverse
            regularisation strength).
        verbose: Print progress messages when true.
        upsample: When true, oversample with SMOTENC before scaling and fitting.

    Returns:
        A ``Pipeline`` holding the already-fitted ``StandardScaler``
        (step ``'scaling'``) and ``LogisticRegression`` (step ``'model'``).

    Raises:
        ImportError: If ``upsample`` is true and imbalanced-learn is not installed.
    """
    def log(*parts):
        # Progress output is opt-in via `verbose`.
        if verbose:
            print(*parts)

    if upsample:
        # Imported here rather than at module level so imbalanced-learn is only
        # required when upsampling is actually requested.
        from imblearn.over_sampling import SMOTENC

        log('upsampling...')
        # Positional indices of the int8 columns, which SMOTENC treats as categorical.
        categorical_features = [i for i, col in enumerate(X.columns) if X[col].dtype == 'int8']
        smote = SMOTENC(random_state=seed, categorical_features=categorical_features)
        X, y = smote.fit_resample(X, y)

    log('scaling...')
    scaling = StandardScaler()
    X = scaling.fit_transform(X)

    log('fitting...')
    log('C:', C)
    model = LogisticRegression(random_state=seed, C=C, max_iter=4000)
    model.fit(X, y)

    log('chaining pipeline...')
    # Both steps are already fitted, so the pipeline is ready to predict on
    # unscaled features without calling fit again.
    pipe = Pipeline([('scaling', scaling), ('model', model)])
    log('done.')
    return pipe

In [15]:
# Value passed as C to the logistic regression.
# NOTE(review): how this "best" value was selected is not shown in the visible cells.
best_regularizer = 0.1
model = train_log_reg_model(
    x_train,
    y_train,
    seed=0,
    C=best_regularizer,
    upsample=False,
    verbose=True,
)

scaling...
fitting...
C: 0.1
chaining pipeline...
done.


In [16]:
# Per-class predicted probabilities for the test set (one column per class).
# NOTE(review): the name looks like a typo for `test_prob`; it is kept because a
# later cell references `test_prod`.
test_prod = model.predict_proba(x_test)

In [17]:
# Inspect the predicted class probabilities.
test_prod

array([[0.4843277 , 0.04157547, 0.47409682],
       [0.27558337, 0.34512809, 0.37928854],
       [0.06218639, 0.78230776, 0.15550584],
       ...,
       [0.15712002, 0.61999076, 0.22288923],
       [0.4143101 , 0.15548208, 0.43020782],
       [0.41504908, 0.06078552, 0.52416541]])

In [18]:
# Hard class predictions (encoded class ids) for the test set.
test_pred = model.predict(x_test)

In [19]:
# Inspect the predicted class ids.
test_pred

array([0, 2, 1, ..., 1, 2, 2])

In [20]:
# Fraction of test rows whose predicted class id equals the true one.
metrics.accuracy_score(y_true=y_test, y_pred=test_pred)

0.5537848605577689

In [21]:
# Multiclass ROC AUC computed one-vs-rest from the predicted class probabilities.
metrics.roc_auc_score(
    y_true=y_test,
    y_score=model.predict_proba(x_test),
    multi_class='ovr',
)

0.7621433922069859