In [970]:
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
from IPython.display import display_markdown
import joblib
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.model_selection import StratifiedKFold

In [971]:
# Project Configurations
class Configs:
    """Filesystem locations and flags shared by the cells of this notebook.

    Attributes:
        path_prefix: Directory that holds the competition CSV files.
        train_path: ``train.csv`` inside ``path_prefix``.
        test_path: ``test.csv`` inside ``path_prefix``.
        submission_path: ``sample_submission.csv`` inside ``path_prefix``.
        store_path: Directory where fitted models are dumped.
        is_debug: When true, cells print extra diagnostics.
    """

    def __init__(
        self,
        debug=False,
        path_prefix="./tabular-playground-series-aug-2022",
        store_path="./model",
    ) -> None:
        """Build the configuration.

        Args:
            debug: Enables the diagnostic prints guarded by ``is_debug``.
            path_prefix: Directory containing the competition CSV files.
            store_path: Directory used for dumped models.
        """
        # Every dataset path is derived from the single prefix so relocating
        # the data only requires changing one value.
        self.path_prefix = path_prefix
        self.train_path = f"{path_prefix}/train.csv"
        self.test_path = f"{path_prefix}/test.csv"
        self.store_path = store_path
        self.submission_path = f"{path_prefix}/sample_submission.csv"
        self.is_debug = debug

In [972]:
def corr_selection(data, test, num_get=7, top_k=4):
    """Choose, per product code, the predictors used to impute each target measurement.

    The targets ``measurement_3`` .. ``measurement_17`` are ranked by the sum
    of their ``top_k`` largest absolute correlations with the other
    measurement columns, computed over all rows of ``data``. For the
    ``num_get`` best-ranked targets, the ``top_k`` most correlated
    measurements are then looked up separately inside every product code.

    Args:
        data: Frame holding a ``product_code`` column and the measurement
            columns (train and test rows together).
        test: Frame whose columns define the candidate predictors: every
            column whose name contains ``'measurement'``.
        num_get: Number of best-ranked targets to return predictors for.
        top_k: Number of predictor columns kept for each target.

    Returns:
        ``{target: {product_code: [predictor, ...]}}`` with ``top_k``
        predictor names per entry, strongest correlation first.
    """
    # Candidates are restricted to measurement columns that exist in `test`.
    # This keeps the label (`failure`, present only in the combined frame)
    # out of the correlation matrix, so it can never be chosen as an
    # imputation predictor.
    measure_cols = [name for name in test.columns if 'measurement' in name]
    measures = data[measure_cols]

    def strongest_partners(abs_corr, target):
        # The target is removed by label rather than assumed to sort first.
        return abs_corr[target].drop(target).sort_values(ascending=False).head(top_k)

    # Rank the targets once from a single global correlation matrix.
    # The original note says missing values start at measurement_3.
    targets = [f'measurement_{m_num}' for m_num in range(3, 18)]
    global_corr = measures.corr().abs()
    selected_col = pd.DataFrame({
        'selected': targets,
        'correlation': [
            np.round(strongest_partners(global_corr, target).sum(), 5)
            for target in targets
        ],
    })
    selected_col = selected_col.sort_values(by='correlation', ascending=False).reset_index(drop=True)

    # One correlation matrix per product code, shared by all targets.
    codes = data.product_code.unique()
    corr_by_code = {
        code: measures[data.product_code == code].corr().abs() for code in codes
    }

    full_dict = {}
    for target in selected_col['selected'].head(num_get):
        full_dict[target] = {
            code: strongest_partners(corr_by_code[code], target).index.tolist()
            for code in codes
        }

    # display selected columns
    display_markdown('## Selected columns by the sum of correlation', raw=True)
    display(selected_col.head(10))

    return full_dict

In [973]:
"""
Program initialization
1. silence warnings
2. create the global configuration
3. read the data files named by the configuration
4. describe the data when 'debug' mode is enabled in the configuration
"""
warnings.filterwarnings('ignore')            # Suppress every warning for the whole session
configs = Configs(False)                     # Global configuration, read by the functions below

# Read Data
train = pd.read_csv(configs.train_path)
test = pd.read_csv(configs.test_path)
submission = pd.read_csv(configs.submission_path)

# Dataset description
if configs.is_debug:
    print(
        "Data Shape: [\n"
        f"        train: {train.shape}, \n"
        f"        test: {test.shape}\n"
        "    ]"
    )
    # `failure == 1` marks a failed product, `failure == 0` a good one; each
    # count is reported under the matching label.
    n_failed = int((train.failure == 1).sum())
    n_passed = int((train.failure == 0).sum())
    print(
        "Label in train dataset: [ \n"
        f"        failure: {n_failed}, \n"
        f"        success: {n_passed}\n"
        "    ]"
    )

In [974]:
def data_preprocessing(train, test):
    """Engineer features and impute missing measurements on train and test together.

    Steps:
    1. Concatenate ``train`` and ``test`` and add ``area``, ``count_null``,
       ``m3_missing`` and ``m5_missing``; replace ``loading`` by its ``log1p``.
    2. For every product code and every target returned by ``corr_selection``,
       fit a ``HuberRegressor`` on the rows where the target and all of its
       predictors are present, then fill the rows where only the target is
       missing.
    3. Fill whatever is still missing in the measurement/``loading`` columns
       with a per-product-code ``KNNImputer``.

    Every fitted regressor and imputer is dumped under ``configs.store_path``.

    Args:
        train: Raw training frame (includes the ``failure`` label).
        test: Raw test frame (no label).

    Returns:
        The concatenated frame with engineered and imputed columns. The index
        is not reset, so train and test row labels may repeat.
    """
    import os  # local import: only needed to create the model directory

    # Aggregate or create new values by discussions mentioned in report
    display_markdown('# Imputation', raw=True)
    data = pd.concat([train, test])
    data['area'] = data['attribute_2'] * data['attribute_3']
    # Count missing raw inputs only. Counting over every column would include
    # `failure`, which is missing on every test row, and shift the feature by
    # one between train and test.
    data['count_null'] = data[list(test.columns)].isnull().sum(axis=1)
    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['loading'] = np.log1p(data['loading'])

    # locate for all measurement
    features = [f for f in test.columns if f.startswith('measurement') or f=='loading']

    # Two filling methods
    # 1. fill with a linear model (HuberRegressor)
    # 2. fill with a KNN model
    full_dict = corr_selection(data, test)
    os.makedirs(configs.store_path, exist_ok=True)  # joblib.dump does not create directories
    display_markdown('## Imputation with Linear Model and KNN Model', raw=True)
    for code in data.product_code.unique():
        in_code = data.product_code == code
        # Fill with the linear model
        for m_col, predictors_by_code in full_dict.items():
            column = predictors_by_code[code]
            # Recomputed for every target: earlier fills change which rows
            # have complete predictors.
            predictors_present = data[column].notnull().all(axis=1)
            fit_rows = in_code & predictors_present & data[m_col].notnull()
            fill_rows = in_code & predictors_present & data[m_col].isnull()

            huber_regressor = HuberRegressor(epsilon=1.35, max_iter = 1000)
            huber_regressor.fit(data.loc[fit_rows, column], data.loc[fit_rows, m_col])
            joblib.dump(huber_regressor, f"{configs.store_path}/Huber_{code}_{m_col}") # dump for inference
            # Nothing to fill is valid; predicting on an empty frame would raise.
            if fill_rows.any():
                data.loc[fill_rows, m_col] = huber_regressor.predict(data.loc[fill_rows, column])
            if configs.is_debug:
                print(f"{m_col}: code {code}, {int(fill_rows.sum())} samples")
        print(f"code {code} have been imputed by LinearRegressor")
        # Fill with KNN
        imputer = KNNImputer(n_neighbors=5)
        data.loc[in_code, features] = imputer.fit_transform(data.loc[in_code, features])
        # Dump after fitting so the stored imputer is usable for inference.
        joblib.dump(imputer, f"{configs.store_path}/imputer_{code}")
        print(f"code {code} have been imputed by KNN")
    print("Done Inputation")
    return data

In [975]:
# Run the preprocessing on the raw train/test frames; the combined frame it returns is kept in `data`
data = data_preprocessing(train, test)

# Imputation

## Selected columns by the sum of correlation

Unnamed: 0,selected,correlation
0,measurement_17,1.4314
1,measurement_8,0.46499
2,measurement_11,0.41957
3,measurement_5,0.39309
4,measurement_6,0.37335
5,measurement_7,0.34639
6,measurement_4,0.33899
7,measurement_16,0.30221
8,measurement_10,0.28251
9,measurement_14,0.25259


## Imputation with Linear Model and KNN Model

code A have been imputed by LinearRegressor
code A have been imputed by KNN
code B have been imputed by LinearRegressor
code B have been imputed by KNN
code C have been imputed by LinearRegressor
code C have been imputed by KNN
code D have been imputed by LinearRegressor
code D have been imputed by KNN
code E have been imputed by LinearRegressor
code E have been imputed by KNN
code F have been imputed by LinearRegressor
code F have been imputed by KNN
code G have been imputed by LinearRegressor
code G have been imputed by KNN
code H have been imputed by LinearRegressor
code H have been imputed by KNN
code I have been imputed by LinearRegressor
code I have been imputed by KNN
Done Inputation


In [976]:
class LRModel:
    """Several logistic regressions, each cross-validated on a fixed feature subset.

    Workflow:
    0. split the training data with a stratified K-fold
    1. for every feature subset, fit one ``LogisticRegression`` per fold
    2. score each subset by AUC and accuracy on the validation folds
    3. average the per-fold test probabilities into one prediction per subset

    Attributes:
        x, y: Training predictors and the integer ``failure`` label.
        features: Feature names the subsets are built from (at least 8).
        n_splits: Number of cross-validation folds.
        n_digits: Decimals kept in the reported metrics.
        subsets: Feature lists, one per trained model family.
        lr_effects: Blend weights, one per subset, for combining the results.
        lr_results: Per-subset test probabilities, filled by ``train``.
        train_aucs, train_accs: Mean of the per-fold validation AUC / accuracy.
        oof_aucs, oof_accs: AUC / accuracy of the out-of-fold predictions.
        feats_evaluations: Per-subset list of per-fold coefficient vectors.
        x_trains: Scaled training frame of the last fold, per subset.
    """

    def __init__(self, data, features):
        """Split ``data`` into predictors and label and build the feature subsets."""
        self.x = data.drop(['failure'], axis=1)
        self.y = data['failure'].astype(int)
        self.features = features
        self.n_splits = 5  # KFold split
        self.n_digits = 5  # rounding of reported metrics, independent of the fold count
        self.data_len = len(data)
        # Not read by any method of this class; kept so existing attribute
        # access keeps working.
        self.lr_auc, self.lr_preds = 0, 0
        # evaluation of model
        self.x_trains = []
        self.train_aucs = []
        self.train_accs = []
        self.oof_aucs = []
        self.oof_accs = []
        self.feats_evaluations = []
        self.lr_effects = [0.3, 0.3, 0.2, 0.2]
        self.lr_results = []        # prediction results
        # construct feats subset to train multiple LR
        self.subsets = []
        self._construct_subset()

    def _construct_subset(self):
        """Append the full feature list and three fixed index-based subsets."""
        # Fixed, hand-picked subsets; the indices refer to positions in
        # `self.features`, which therefore needs at least 8 entries.
        self.subsets.append(self.features)
        for picked in ([0, 1, 5, 6, 7], [0, 2, 3, 4, 6, 7], [0, 5, 7]):
            self.subsets.append([self.features[x] for x in picked])

    def _get_accuracy_n(self, valid, preds, n_splits):
        """Return the accuracy rounded to ``n_splits`` decimals.

        The third parameter is a number of decimals despite its name; the
        name is kept for compatibility.
        """
        return round(accuracy_score(valid, preds), n_splits)

    def _get_auc_n(self, valid, preds, n_splits):
        """Return the ROC AUC rounded to ``n_splits`` decimals.

        The third parameter is a number of decimals despite its name; the
        name is kept for compatibility.
        """
        return round(roc_auc_score(valid, preds), n_splits)

    def _scale(self, train_data, val_data, test_data, feats):
        """Standardise ``feats`` using statistics of ``train_data`` only.

        Returns copies of the three frames with the ``feats`` columns
        replaced by their scaled values; the inputs are left untouched.
        """
        scaler = StandardScaler()
        # Fit on the training fold only, then apply the same transform to the
        # validation and test frames.
        scaled_train = scaler.fit_transform(train_data[feats])
        scaled_val = scaler.transform(val_data[feats])
        scaled_test = scaler.transform(test_data[feats])
        # make return value
        rtn_train = train_data.copy()
        rtn_val = val_data.copy()
        rtn_test = test_data.copy()
        # replace specific feats with standardized data
        rtn_train[feats] = scaled_train
        rtn_val[feats] = scaled_val
        rtn_test[feats] = scaled_test
        return rtn_train, rtn_val, rtn_test

    def train(self, test_data):
        """Fit one model per (feature subset, fold) and collect predictions and metrics.

        Every fitted model is dumped to
        ``{configs.store_path}/LRModel_{subset index}_{fold index}``.

        Args:
            test_data: Frame to predict; it must contain every selected feature.
        """
        import os  # local import: only needed to create the model directory

        os.makedirs(configs.store_path, exist_ok=True)  # joblib.dump does not create directories

        # train multiple LRs
        for i_feat, feats in enumerate(self.subsets):
            mean_fold_auc, mean_fold_acc = 0, 0
            lr_target = np.zeros(len(test_data))
            oof_proba, oof_label = np.zeros(self.data_len), np.zeros(self.data_len)
            feats_evaluation = []

            # KFold
            kfold = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=0)
            for fold, (train_idx, valid_idx) in enumerate(kfold.split(self.x, self.y)):
                x_train, x_val = self.x.iloc[train_idx], self.x.iloc[valid_idx]
                y_train, y_val = self.y.iloc[train_idx], self.y.iloc[valid_idx]
                # `_scale` returns fresh copies, so `test_data` is never modified.
                x_train, x_val, x_test = self._scale(x_train, x_val, test_data, feats)
                # train logistic regression
                lr = LogisticRegression(max_iter=1000, C=0.0001, penalty='l2', solver='newton-cg')
                lr.fit(x_train[feats], y_train)
                joblib.dump(lr, f"{configs.store_path}/LRModel_{i_feat}_{fold}")  # dump model
                val_preds = lr.predict_proba(x_val[feats])[:, 1]
                y_preds = lr.predict(x_val[feats])
                lr_target += lr.predict_proba(x_test[feats])[:, 1] / self.n_splits

                # Store constants for evaluation
                feats_evaluation.append(lr.coef_.ravel())      # evaluate importance of features
                mean_fold_auc += roc_auc_score(y_val, val_preds) / self.n_splits
                mean_fold_acc += accuracy_score(y_val, y_preds) / self.n_splits
                oof_proba[valid_idx] = val_preds
                oof_label[valid_idx] = y_preds

            # Store result and evaluations
            self.x_trains.append(x_train)
            self.lr_results.append(lr_target)
            self.train_aucs.append(round(mean_fold_auc, self.n_digits))
            self.train_accs.append(round(mean_fold_acc, self.n_digits))
            self.oof_aucs.append(self._get_auc_n(self.y, oof_proba, self.n_digits))
            self.oof_accs.append(self._get_accuracy_n(self.y, oof_label, self.n_digits))
            self.feats_evaluations.append(feats_evaluation)

    def evaluation(self):
        """Display the stored metrics and top coefficients of every subset.

        Call after ``train``. The values shown as "Train auc"/"Train acc" are
        the means of the per-fold validation scores.
        """
        for i, feats in enumerate(self.subsets):
            display_markdown(f"## Model Evaluation for lr{i}", raw=True)
            display_markdown(f"- Train auc = {self.train_aucs[i]} \
                \n- Train acc = {self.train_accs[i]}", raw=True)
            display_markdown(f"- oof auc = {self.oof_aucs[i]} \
                \n- oof acc = {self.oof_accs[i]}", raw=True)

            display_markdown("### Importance List", raw=True)
            # Rows are features, columns are folds: average the coefficient
            # over folds, then rank by magnitude.
            importancement = pd.DataFrame(
                np.array(self.feats_evaluations[i]).T,
                index=pd.Index(feats)
            ).mean(axis=1).abs().sort_values(ascending=False)
            display(importancement.head())

    def get_results(self):
        """Return the list of per-subset test probabilities filled by ``train``."""
        return self.lr_results

In [977]:
"""
Training with the defined LRModel
1. get the train and test datasets ready
2. select features (according to discussions mentioned in the report)
3. init the model, train it and predict
4. blend the returned results
5. save to a .csv file
"""
# Split the combined frame again: rows with a label are training rows, rows
# whose `failure` is empty are the ones to predict.
train = data[data.failure.notnull()]
test = data[data.failure.isnull()].drop(['failure'], axis=1)

# select features
select_feature = [
    'loading',
    'area', 'count_null',
    'm3_missing', 'm5_missing',
    'measurement_1', 'measurement_2', 'measurement_17'
]

# Init and train model
lrmodel = LRModel(train, select_feature)
lrmodel.train(test_data=test)
lrmodel.evaluation()

# get results
lr_results = lrmodel.get_results()
assert len(lr_results) == len(lrmodel.lr_effects), "one blend weight is required per model"

# Weighted blend: each model is paired with its own weight.
submission['failure'] = 0.0
for i, (preds, weight) in enumerate(zip(lr_results, lrmodel.lr_effects)):
    submission[f'lr{i}'] = preds
    submission['failure'] += submission[f'lr{i}'] * weight

# write to csv
submission[['id', 'failure']].to_csv('109550129_submission.csv', index=False)

# Last expression of the cell, so the preview is actually rendered.
submission.head()

## Model Evaluation for lr0

- Train auc = 0.59124                 
- Train acc = 0.78739

- oof auc = 0.59114                 
- oof acc = 0.78739

### Importance List

loading           0.081209
measurement_17    0.021065
measurement_2     0.010496
m5_missing        0.010240
m3_missing        0.010068
dtype: float64

## Model Evaluation for lr1

- Train auc = 0.59074                 
- Train acc = 0.78739

- oof auc = 0.59065                 
- oof acc = 0.78739

### Importance List

loading           0.081218
measurement_17    0.021076
measurement_2     0.010483
area              0.010008
measurement_1     0.006987
dtype: float64

## Model Evaluation for lr2

- Train auc = 0.59066                 
- Train acc = 0.78739

- oof auc = 0.59059                 
- oof acc = 0.78739

### Importance List

loading           0.081226
measurement_17    0.021070
m5_missing        0.010264
measurement_2     0.010225
m3_missing        0.010105
dtype: float64

## Model Evaluation for lr3

- Train auc = 0.59021                 
- Train acc = 0.78739

- oof auc = 0.59019                 
- oof acc = 0.78739

### Importance List

loading           0.081239
measurement_17    0.021045
measurement_1     0.006989
dtype: float64