In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import random as r
import seaborn as sns
import matplotlib.pyplot as plt

from time import time
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
class CFG(object):
    def __init__(
        self,
        seed: int = 42,
        n_splits: int = 5,
        show_info: bool = False,
    ):

        self.seed = seed
        self.n_splits = n_splits
        self.show_info = show_info
        self.tr_data = "/kaggle/input/tams-ais-winter-2022-competition/train_data.csv"
        self.ts_data = "/kaggle/input/tams-ais-winter-2022-competition/test_data.csv"
        self.ss_data = "/kaggle/input/tams-ais-winter-2022-competition/sample.csv"
        self.model_save_path = "models"
        if not os.path.exists(self.model_save_path): os.makedirs(self.model_save_path)

cfg = CFG(seed=42, show_info=True)

In [3]:
def breaker(num: int=50, char: str="*") -> None:
    print("\n" + num*char + "\n")
    

def print_scores(mae: float, mse: float, r2: float) -> None:
    print(f"MSE : {mae:.5f}")
    print(f"MAE : {mse:.5f}")
    print(f"R2  : {r2:.5f}")
    

def get_scores(y_true: np.ndarray, y_pred: np.ndarray) -> tuple:
    return mean_absolute_error(y_true, y_pred), \
           mean_squared_error(y_true, y_pred), \
           r2_score(y_true, y_pred)

In [4]:
class Model(object):
    def __init__(self, model_name: str, preprocessor, seed: int):
        self.model_name = model_name

        if self.model_name == "lir":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", LinearRegression()),
                ]
            )
        
        elif self.model_name == "knr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", KNeighborsRegressor()),
                ]
            )
        
        elif self.model_name == "svr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", SVR()),
                ]
            )
        
        elif self.model_name == "dtr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", DecisionTreeRegressor(random_state=seed)),
                ]
            )

        elif self.model_name == "etr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreeRegressor(random_state=seed)),
                ]
            )
        
        elif self.model_name == "rfr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", RandomForestRegressor(random_state=seed)),
                ]
            )
        
        elif self.model_name == "gbr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", GradientBoostingRegressor(random_state=seed)),
                ]
            )
        
        elif self.model_name == "abr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", AdaBoostRegressor(random_state=seed)),
                ]
            )
        
        elif self.model_name == "etrs":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreesRegressor(random_state=seed)),
                ]
            )
        
        elif self.model_name == "xgbr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", XGBRegressor(random_state=seed)),
                ]
            )
        
        elif self.model_name == "lgbmr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", LGBMRegressor(random_state=seed)),
                ]
            )

In [5]:
names: list = [
    "lir", 
    "knr", 
    "svr", 
    "dtr", 
    "etr", 
    "rfr", 
    "gbr", 
    "abr", 
    "etrs", 
    "xgbr", 
    "lgbmr"
]
# names = names[:2]

tr_df = pd.read_csv(cfg.tr_data)
tr_df = tr_df.drop(columns=["Serial No."])

X = tr_df.iloc[:, :-1].copy().values
y = tr_df.iloc[:, -1].copy().values

features = [i for i in range(X.shape[1])]

feature_transformer = Pipeline(
    steps=[
        ("Standard_Scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("features", feature_transformer, features),
    ]
)

best_mse = np.inf
for name in names:
    fold = 1
    if cfg.show_info: breaker()
    for tr_idx, va_idx in KFold(n_splits=cfg.n_splits, random_state=cfg.seed, shuffle=True).split(X):
        X_train, X_valid, y_train, y_valid = X[tr_idx], X[va_idx], y[tr_idx], y[va_idx]
        my_pipeline = Model(name, preprocessor, cfg.seed)
        my_pipeline.model.fit(X_train, y_train)

        y_pred = my_pipeline.model.predict(X_valid)
        if cfg.show_info:
            mae, mse, r2 = get_scores(y_pred, y_valid)
            print(f"{my_pipeline.model_name}, {fold}\n")
            print_scores(mae, mse, r2)
            print("")
        else:
            _, mse,  = get_scores(y_valid, y_pred)

        if mse < best_mse:
            best_mse = mse
            model_fold_name = f"{name}_{fold}"

            with open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "wb") as fp: pickle.dump(my_pipeline.model, fp)
        
        with open(os.path.join(cfg.model_save_path, f"{name}_{fold}.pkl"), "wb") as fp: pickle.dump(my_pipeline.model, fp)
        
        fold += 1
    

if cfg.show_info: 
    breaker()
    print(f"Best Model : {model_fold_name.split('_')[0]}, Best Fold : {model_fold_name.split('_')[1]}")

breaker()


**************************************************

lir, 1

MSE : 0.04791
MAE : 0.00435
R2  : 0.75650

lir, 2

MSE : 0.06292
MAE : 0.00658
R2  : 0.49787

lir, 3

MSE : 0.04631
MAE : 0.00362
R2  : 0.75567

lir, 4

MSE : 0.04518
MAE : 0.00427
R2  : 0.67622

lir, 5

MSE : 0.03561
MAE : 0.00231
R2  : 0.86571


**************************************************

knr, 1

MSE : 0.05059
MAE : 0.00456
R2  : 0.70502

knr, 2

MSE : 0.06697
MAE : 0.00728
R2  : 0.28165

knr, 3

MSE : 0.04250
MAE : 0.00306
R2  : 0.75412

knr, 4

MSE : 0.05253
MAE : 0.00531
R2  : 0.61123

knr, 5

MSE : 0.03966
MAE : 0.00319
R2  : 0.80372


**************************************************

svr, 1

MSE : 0.06767
MAE : 0.00652
R2  : 0.44963

svr, 2

MSE : 0.07019
MAE : 0.00701
R2  : 0.25253

svr, 3

MSE : 0.05511
MAE : 0.00426
R2  : 0.63259

svr, 4

MSE : 0.05853
MAE : 0.00492
R2  : 0.61473

svr, 5

MSE : 0.07446
MAE : 0.00748
R2  : 0.51393


**************************************************

dtr, 1

MSE : 0.06906
M

In [6]:
ts_df = pd.read_csv(cfg.ts_data)
ts_df = ts_df.drop(columns=["Serial No."])
X_test = ts_df.copy().values

# model = pickle.load(open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "rb"))
# y_pred = model.predict(X_test)

y_pred = np.zeros((X_test.shape[0], ))

model_names = os.listdir("models")
for model_name in model_names:
    model = pickle.load(open(os.path.join(cfg.model_save_path, model_name), "rb"))
    y_pred += model.predict(X_test)
y_pred = y_pred / len(model_names)

ss_df = pd.read_csv(cfg.ss_data)
ss_df["Chance of Admit"] = y_pred
ss_df.to_csv("submission.csv", index=False)

In [7]:
ss_df.head(5)

Unnamed: 0,Serial No.,Chance of Admit
0,1,0.697858
1,2,0.690512
2,3,0.681256
3,4,0.602429
4,5,0.66981
