In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import random as r
import seaborn as sns
import matplotlib.pyplot as plt

from time import time
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report

In [2]:
class CFG(object):
    def __init__(self,
         seed: int = 42,
         n_splits: int = 5,
         show_info: bool = False,
         ):

        self.seed = seed
        self.n_splits = n_splits
        self.show_info = show_info
        self.tr_data = "/kaggle/input/tams-ais-winter-2022-competition/train_data.csv"
        self.ts_data = "/kaggle/input/tams-ais-winter-2022-competition/test_data.csv"
        self.ss_data = "/kaggle/input/tams-ais-winter-2022-competition/sample.csv"
        self.model_save_path = "models"
        if not os.path.exists(self.model_save_path): os.makedirs(self.model_save_path)

cfg = CFG(seed=42, show_info=True)

In [3]:
def breaker(num: int=50, char: str="*") -> None:
    print("\n" + num*char + "\n")
    

def print_scores(accuracy: float, auc: float, precision: np.ndarray, recall: np.ndarray, f_score: np.ndarray) -> None:
    print(f"Accuracy  : {accuracy:.5f}")
    print(f"ROC-AUC   : {auc:.5f}")
    print(f"Precision : {precision}")
    print(f"Recall    : {recall}")
    print(f"F-Score   : {f_score}")
    

def get_scores(y_true: np.ndarray, y_pred: np.ndarray) -> tuple:
    accuracy = accuracy_score(y_pred, y_true)
    try:
        auc = roc_auc_score(y_pred, y_true)
    except: 
        auc = 0.5
    precision, recall, f_score, _ = precision_recall_fscore_support(y_pred, y_true)

    return accuracy, auc, precision, recall, f_score

In [4]:
class Model(object):
    def __init__(self, model_name: str, preprocessor, seed: int):
        self.model_name = model_name

        if self.model_name == "lgr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", LogisticRegression(random_state=seed)),
                ]
            )
        
        elif self.model_name == "knc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", KNeighborsClassifier()),
                ]
            )
        
        elif self.model_name == "svc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", SVC()),
                ]
            )
        
        elif self.model_name == "dtc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", DecisionTreeClassifier(random_state=seed)),
                ]
            )

        elif self.model_name == "etc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreeClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "rfc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", RandomForestClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "gbc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", GradientBoostingClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "abc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", AdaBoostClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "etcs":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreesClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "gnb":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", GaussianNB()),
                ]
            )
        
        elif self.model_name == "xgbc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", XGBClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "lgbmc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", LGBMClassifier(random_state=seed)),
                ]
            )

In [5]:
names: list = [
    "lgr", 
    "knc", 
    "svc", 
    "dtc", 
    "etc", 
    "rfc", 
    "gbc", 
    "abc", 
    "etcs", 
    "gnb", 
    "xgbc", 
    "lgbmc"
]
# names = names[:2]

tr_df = pd.read_csv(cfg.tr_data)
tr_df = tr_df.drop(columns=["Serial No."])

X = tr_df.iloc[:, :-1].copy().values
y = tr_df.iloc[:, -1].copy().values
y[y > 0.5] = 1
y[y <= 0.5] = 0
y = y.astype("uint8")

if cfg.show_info:
    breaker()
    for val in set(y):
        print(f"Class {val} count : {y[y == val].shape[0]}")

features = [i for i in range(X.shape[1])]

feature_transformer = Pipeline(
    steps=[
        ("Standard_Scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("features", feature_transformer, features),
    ]
)

best_auc = 0.0
for name in names:
    fold = 1
    if cfg.show_info: breaker()
    for tr_idx, va_idx in StratifiedKFold(n_splits=cfg.n_splits, random_state=cfg.seed, shuffle=True).split(X, y):
        X_train, X_valid, y_train, y_valid = X[tr_idx], X[va_idx], y[tr_idx], y[va_idx]
        my_pipeline = Model(name, preprocessor, cfg.seed)
        my_pipeline.model.fit(X_train, y_train)

        y_pred = my_pipeline.model.predict(X_valid)
        if cfg.show_info:
            acc, auc, pre, rec, f1 = get_scores(y_valid, y_pred)
            print(f"{my_pipeline.model_name}, {fold}\n")
            print_scores(acc, auc, pre, rec, f1)
            print("")
        else:
            _, auc, _, _, _ = get_scores(y_valid, y_pred)

        if auc > best_auc:
            best_auc = auc
            model_fold_name = f"{name}_{fold}"

            with open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "wb") as fp: pickle.dump(my_pipeline.model, fp)
        
        with open(os.path.join(cfg.model_save_path, f"{name}_{fold}.pkl"), "wb") as fp: pickle.dump(my_pipeline.model, fp)
        
        fold += 1
    

if cfg.show_info: 
    breaker()
    print(f"Best Model : {model_fold_name.split('_')[0]}, Best Fold : {model_fold_name.split('_')[1]}")

breaker()


**************************************************

Class 0 count : 24
Class 1 count : 296

**************************************************

lgr, 1

Accuracy  : 0.96875
ROC-AUC   : 0.86667
Precision : [0.75       0.98333333]
Recall    : [0.75       0.98333333]
F-Score   : [0.75       0.98333333]

lgr, 2

Accuracy  : 0.93750
ROC-AUC   : 0.96825
Precision : [0.2 1. ]
Recall    : [1.         0.93650794]
F-Score   : [0.33333333 0.96721311]

lgr, 3

Accuracy  : 0.95312
ROC-AUC   : 0.97581
Precision : [0.4 1. ]
Recall    : [1.        0.9516129]
F-Score   : [0.57142857 0.97520661]

lgr, 4

Accuracy  : 0.92188
ROC-AUC   : 0.72500
Precision : [0.4        0.96610169]
Recall    : [0.5  0.95]
F-Score   : [0.44444444 0.95798319]

lgr, 5

Accuracy  : 0.93750
ROC-AUC   : 0.96825
Precision : [0.2 1. ]
Recall    : [1.         0.93650794]
F-Score   : [0.33333333 0.96721311]


**************************************************

knc, 1

Accuracy  : 0.95312
ROC-AUC   : 0.97619
Precision : [0.25 1.  ]
R

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


rfc, 1

Accuracy  : 0.98438
ROC-AUC   : 0.99180
Precision : [0.75 1.  ]
Recall    : [1.         0.98360656]
F-Score   : [0.85714286 0.99173554]

rfc, 2

Accuracy  : 0.93750
ROC-AUC   : 0.96825
Precision : [0.2 1. ]
Recall    : [1.         0.93650794]
F-Score   : [0.33333333 0.96721311]

rfc, 3

Accuracy  : 0.92188
ROC-AUC   : 0.71774
Precision : [0.2        0.98305085]
Recall    : [0.5        0.93548387]
F-Score   : [0.28571429 0.95867769]

rfc, 4

Accuracy  : 0.92188
ROC-AUC   : 0.71774
Precision : [0.2        0.98305085]
Recall    : [0.5        0.93548387]
F-Score   : [0.28571429 0.95867769]

rfc, 5

Accuracy  : 0.90625
ROC-AUC   : 0.46032
Precision : [0.         0.98305085]
Recall    : [0.         0.92063492]
F-Score   : [0.         0.95081967]


**************************************************

gbc, 1

Accuracy  : 0.96875
ROC-AUC   : 0.86667
Precision : [0.75       0.98333333]
Recall    : [0.75       0.98333333]
F-Score   : [0.75       0.98333333]

gbc, 2

Accuracy  : 0.92188
ROC

In [6]:
ts_df = pd.read_csv(cfg.ts_data)
ts_df = ts_df.drop(columns=["Serial No."])
X_test = ts_df.copy().values

model = pickle.load(open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "rb"))
y_pred = model.predict_proba(X_test)[:, 1]

ss_df = pd.read_csv(cfg.ss_data)
ss_df["Chance of Admit"] = y_pred
ss_df.to_csv("submission.csv", index=False)

In [7]:
ss_df.head(5)

Unnamed: 0,Serial No.,Chance of Admit
0,1,1.0
1,2,0.99
2,3,0.97
3,4,0.85
4,5,0.99
