<a href="https://www.kaggle.com/code/mmellinger66/ps3e2-model-lab?scriptVersionId=118985312" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 2 - Stroke Predictions</h1>
</div>

This kernel will focus on combining several models into a blend.  Since this exercise only lasts a week, this kernel might be a little terse.  The main goal is to build a framework whereby I can evaluate and blend several different models, and hopefully become more competitive.

## Problem Type

Binary Classification

## Evaluation Metric

[AUC](https://www.analyticsvidhya.com/blog/2020/06/auc-roc-curve-machine-learning/)

## Resources

### Discussions

### Notebooks




<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

# from sklearn.model_selection import train_test_split
# from sklearn.impute import SimpleImputer
# from sklearn.model_selection import KFold, StratifiedKFold

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap



<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="stroke"

In [3]:
class Config:
    path:str = "../input/playground-series-s3e2/"
    gpu:bool = False
    fast_render:bool = False
    calc_probability:bool = True
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 5000  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 1000 # Want models to run fast during dev
    N_FOLDS:int = 10

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

Creating a few functions that will be reused in each project.

I need to be better with [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) on Kaggle.

In [4]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print("=== Shape of Data ===")
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print("\n=== Train Data: First 5 Rows ===\n")
        display(train.head())
        print("\n=== Train Column Names ===\n")
        display(train.columns)
        print("\n=== Features/Explanatory Variables ===\n")
        eval_features(train)
        print("\n === Skewness ===\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"Continuous Features={continuous_features}\n")
        print(f"Categorical Features={categorical_features}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if Config.calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"Continuous features: {continuous_features}")
    print(f"Categorical features: {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{feature}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{feature}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"Continuous Features={continuous_features}\n")
        print(f"Categorical Features={categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    "Deprecated: Use summary_statistics()"
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [5]:
%%time
train, test, sample_submission = read_data(Config.path)

=== Shape of Data ===
 train data: Rows=15304, Columns=12
 test data : Rows=10204, Columns=11

=== Train Data: First 5 Rows ===



Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0



=== Train Column Names ===



Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


=== Features/Explanatory Variables ===

Continuous features: ['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']
Categorical features: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

 --- Cardinality of Categorical Features ---

gender: cardinality=3, ['Male' 'Female' 'Other']
ever_married: cardinality=2, ['Yes' 'No']
work_type: cardinality=5, ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type: cardinality=2, ['Urban' 'Rural']
smoking_status: cardinality=4, ['never smoked' 'formerly smoked' 'Unknown' 'smokes']

 === Skewness ===

heart_disease        6.316649
stroke               4.611121
hypertension         4.143196
avg_glucose_level    3.037242
bmi                  0.713051
id                   0.000000
age                 -0.103579
dtype: float64
CPU times: user 83.5 ms, sys: 16.6 ms, total: 100 ms
Wall time: 162 ms


## Categorical/Numerical Variables

In [6]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features.remove(TARGET)
cont_features.remove("id")
FEATURES = cont_features + cat_features
FEATURES

Continuous Features=['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']

Categorical Features=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
=== Cardinality ===
gender            3
ever_married      2
work_type         5
Residence_type    2
smoking_status    4
dtype: int64


['age',
 'hypertension',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'gender',
 'ever_married',
 'work_type',
 'Residence_type',
 'smoking_status']

In [7]:
summary_statistics(train.drop(columns=["id"], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
age,15304.0,41.42,21.44,0.08,26.0,43.0,57.0,82.0,459.87,-0.1,-0.81
hypertension,15304.0,0.05,0.22,0.0,0.0,0.0,0.0,1.0,0.05,4.14,15.17
heart_disease,15304.0,0.02,0.15,0.0,0.0,0.0,0.0,1.0,0.02,6.32,37.91
avg_glucose_level,15304.0,89.04,25.48,55.22,74.9,85.12,96.98,267.6,649.03,3.04,13.18
bmi,15304.0,28.11,6.72,10.3,23.5,27.6,32.0,80.1,45.19,0.71,1.36
stroke,15304.0,0.04,0.2,0.0,0.0,0.0,0.0,1.0,0.04,4.61,19.26


In [8]:
excluded_features = [TARGET, "id", "fold", "Residence_type"]

### Encode Categorical Features

In [9]:
cat_features

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [10]:
train, test = label_encoder(train, test, cat_features)
# X_test = pd.get_dummies(test[FEATURES], drop_first=True)

train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,1,28.0,0,0,1,2,1,79.53,31.1,2,0
1,1,1,33.0,0,0,1,2,0,78.44,23.9,1,0
2,2,0,42.0,0,0,1,2,0,103.0,40.3,0,0
3,3,1,56.0,0,0,1,2,1,64.87,28.8,2,0
4,4,0,24.0,0,0,0,2,0,73.36,28.8,2,0


In [11]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

Continuous Features=['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']

Categorical Features=[]
=== Cardinality ===
Series([], dtype: float64)


['gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status']

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [12]:
# train = create_folds(train, Config.N_FOLDS)
train = create_strat_folds(train, TARGET, Config.N_FOLDS)

TARGET=stroke, n_folds=10, seed=42


In [13]:
def gpu_ify_lgbm(lgbm_dict):
    lgbm_dict["device"] = "gpu"
    lgbm_dict["boosting_type"] = "gbdt"
    lgbm_dict["gpu_platform_id"] = 0
    lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict


In [14]:
lgbm_params01 = {
    'objective': 'binary',
     'metric': 'auc',
     'feature_pre_filter': False,
     'lambda_l1': 1.9488299167684667e-07,
     'lambda_l2': 9.456184670156514,
     'num_leaves': 6,
     'feature_fraction': 0.8,
     'bagging_fraction': 0.8065,
     'bagging_freq': 4,
     'min_child_samples': 10,
     'num_iterations': 400,
     'learning_rate':0.05
}

if Config.gpu:
    lgbm_params01 = gpu_ify_lgbm(lgbm_params01)


In [15]:
lgbm_params = {
    'objective': 'binary', # regression, auc
    'metric': 'auc',
    "n_estimators": Config.N_ESTIMATORS, # N_ESTIMATORS, GPU_N_ESTIMATORS
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 10,
    
    'seed': Config.seed,
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
#    'metric': 'rmse',
    'verbose': -1
}

if Config.gpu:
    lgbm_params["device"] = "gpu"
    lgbm_params["boosting_type"] = "gbdt"
    lgbm_params["gpu_platform_id"] = 0
    lgbm_params["gpu_device_id"] = 0




In [16]:
cb_params = {
#     "objective": "binary",
    "eval_metric": "AUC",
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 9,
    "min_data_in_leaf": 93,
    "iterations": Config.N_ESTIMATORS, #Config.N_ESTIMATORS,GPU_N_ESTIMATORS
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

if Config.gpu:
    cb_params["task_type"] = "GPU"

In [17]:
model_reg_dict = {
    "lgbm1": lgb.LGBMRegressor(**lgbm_params),


}

In [18]:
model_clf_dict = {
    "lgbm1": lgb.LGBMClassifier(**lgbm_params),
    "lgbm2": lgb.LGBMClassifier(**lgbm_params01),
    "cat1": cb.CatBoostClassifier(**cb_params),
    "log_reg": linear_model.LogisticRegression(),
    "log_reg2": linear_model.LogisticRegression(
        max_iter=1000, C=0.0001, penalty="l2", solver="newton-cg"
    ),
    "log_reg3": linear_model.LogisticRegression(
        max_iter=1000, C=0.2, penalty="l1", solver = 'saga'
    ),
    "svc": svm.SVC(C = 100, gamma = 1, kernel = 'rbf', probability = True),
    "rfc": ensemble.RandomForestClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 2, n_estimators = 300)

}

In [19]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

oof = train[["id", TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index("id", inplace=True)
oof.head()

Unnamed: 0_level_0,stroke,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,8
1,0,0
2,0,9
3,0,9
4,0,1


In [20]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, "id", model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result
    #     submission_df.head(10)
    ss = submission_df[["id", f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [21]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

        # fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
        show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=False,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

        fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
        show_classification_scores(yvalid.values, preds_valid_class)
#         fold_score = metrics.mean_absolute_error(
#             yvalid, preds_valid
#         )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [22]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        "id",
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        "id",
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [23]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[["id", target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index("id", inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 20 µs, sys: 0 ns, total: 20 µs
Wall time: 25.7 µs


## Tree Models

In [24]:
%%time
# model_lst = ["cat2","lgbm2", "xgbr","lgbm1", "cat1"]
model_lst = ["lgbm2", "lgbm1", "cat1"]

all_cv_scores = run_models4features(model_clf_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

all_cv_scores.sort_values(by=["Score"], ascending=False)

Model=lgbm2
{}




Accuracy: 0.9569
Precision: 0.3333
Recall: 0.0476
ROC: 0.5218
f1: 0.0833
fold: 1, Score: 0.8918191254703517, Run Time: 0.71




Accuracy: 0.9582
Precision: 0.3333
Recall: 0.0159
ROC: 0.5073
f1: 0.0303
fold: 2, Score: 0.8753946628606029, Run Time: 0.59




Accuracy: 0.9595
Precision: 0.7500
Recall: 0.0469
ROC: 0.5231
f1: 0.0882
fold: 3, Score: 0.8857734747784595, Run Time: 0.57




Accuracy: 0.9589
Precision: 0.5714
Recall: 0.0625
ROC: 0.5302
f1: 0.1127
fold: 4, Score: 0.889229720518064, Run Time: 0.58




Accuracy: 0.9588
Precision: 0.5000
Recall: 0.0317
ROC: 0.5152
f1: 0.0597
fold: 5, Score: 0.9004663442291253, Run Time: 0.53




Accuracy: 0.9588
Precision: 0.5000
Recall: 0.0635
ROC: 0.5304
f1: 0.1127
fold: 6, Score: 0.8746713409290097, Run Time: 0.63




Accuracy: 0.9608
Precision: 0.8000
Recall: 0.0635
ROC: 0.5314
f1: 0.1176
fold: 7, Score: 0.8877960636651844, Run Time: 0.58




Accuracy: 0.9588
Precision: 0.5000
Recall: 0.0159
ROC: 0.5076
f1: 0.0308
fold: 8, Score: 0.884344467166553, Run Time: 0.53




Accuracy: 0.9569
Precision: 0.3333
Recall: 0.0476
ROC: 0.5218
f1: 0.0833
fold: 9, Score: 0.8996656604018567, Run Time: 0.53




Accuracy: 0.9569
Precision: 0.3333
Recall: 0.0476
ROC: 0.5218
f1: 0.0833
fold: 10, Score: 0.8906741974226636, Run Time: 0.53
Scores -> Adjusted: 0.87980027 , mean: 0.88798351, std: 0.00818324

=== Model Feature Importance ===
avg_glucose_level 0.326
bmi 0.2535
age 0.2385
smoking_status 0.053
work_type 0.0335
ever_married 0.0275
gender 0.026
hypertension 0.0215
heart_disease 0.0205


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


Mean
Model Run Time: 5.85
Model=lgbm1
{}




Accuracy: 0.9556
Precision: 0.4138
Recall: 0.1905
ROC: 0.5894
f1: 0.2609
fold: 1, Score: 0.8638142813892131, Run Time: 21.18




Accuracy: 0.9536
Precision: 0.2143
Recall: 0.0476
ROC: 0.5201
f1: 0.0779
fold: 2, Score: 0.857218545910644, Run Time: 21.34




Accuracy: 0.9543
Precision: 0.3125
Recall: 0.0781
ROC: 0.5353
f1: 0.1250
fold: 3, Score: 0.863507583503749, Run Time: 20.18




Accuracy: 0.9556
Precision: 0.3750
Recall: 0.0938
ROC: 0.5435
f1: 0.1500
fold: 4, Score: 0.861153715064758, Run Time: 20.67




Accuracy: 0.9601
Precision: 0.5455
Recall: 0.1905
ROC: 0.5918
f1: 0.2824
fold: 5, Score: 0.8912909403706949, Run Time: 21.05




Accuracy: 0.9575
Precision: 0.4375
Recall: 0.1111
ROC: 0.5525
f1: 0.1772
fold: 6, Score: 0.8436394325964878, Run Time: 19.91




Accuracy: 0.9601
Precision: 0.5625
Recall: 0.1429
ROC: 0.5690
f1: 0.2278
fold: 7, Score: 0.8633211066748899, Run Time: 22.18




Accuracy: 0.9569
Precision: 0.3846
Recall: 0.0794
ROC: 0.5370
f1: 0.1316
fold: 8, Score: 0.8649224743294273, Run Time: 20.92




Accuracy: 0.9542
Precision: 0.2941
Recall: 0.0794
ROC: 0.5356
f1: 0.1250
fold: 9, Score: 0.8599777106934572, Run Time: 19.99




Accuracy: 0.9529
Precision: 0.2353
Recall: 0.0635
ROC: 0.5273
f1: 0.1000
fold: 10, Score: 0.8678979885523853, Run Time: 20.86
Scores -> Adjusted: 0.85251490 , mean: 0.86367438, std: 0.01115948

=== Model Feature Importance ===
bmi 0.3217709092541995
avg_glucose_level 0.3164652280993848
age 0.2341200893952007
smoking_status 0.05588927153964926
work_type 0.03316050721759165
gender 0.012832425449307601
hypertension 0.010227943944925924
heart_disease 0.00887734411970943
ever_married 0.0066562809800311575


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


Mean
Model Run Time: 208.42
Model=cat1
{}


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9589
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 1, Score: 0.8819038968902728, Run Time: 67.37
Accuracy: 0.9582
Precision: 0.3333
Recall: 0.0159
ROC: 0.5073
f1: 0.0303
fold: 2, Score: 0.8796656719000043, Run Time: 68.82
Accuracy: 0.9595
Precision: 0.6667
Recall: 0.0625
ROC: 0.5306
f1: 0.1143
fold: 3, Score: 0.881497102931152, Run Time: 69.86
Accuracy: 0.9589
Precision: 1.0000
Recall: 0.0156
ROC: 0.5078
f1: 0.0308
fold: 4, Score: 0.8920415814587594, Run Time: 71.86
Accuracy: 0.9588
Precision: 0.5000
Recall: 0.0159
ROC: 0.5076
f1: 0.0308
fold: 5, Score: 0.8986810356953505, Run Time: 71.76


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9588
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 6, Score: 0.8985674251522922, Run Time: 70.50
Accuracy: 0.9588
Precision: 0.5000
Recall: 0.0159
ROC: 0.5076
f1: 0.0308
fold: 7, Score: 0.8822670172363424, Run Time: 70.58
Accuracy: 0.9595
Precision: 1.0000
Recall: 0.0159
ROC: 0.5079
f1: 0.0312
fold: 8, Score: 0.8770192921522164, Run Time: 70.09
Accuracy: 0.9562
Precision: 0.3000
Recall: 0.0476
ROC: 0.5214
f1: 0.0822
fold: 9, Score: 0.8827106393568561, Run Time: 69.51
Accuracy: 0.9588
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 10, Score: 0.8751636532822626, Run Time: 69.42
Scores -> Adjusted: 0.87692507 , mean: 0.88495173, std: 0.00802666

=== Model Feature Importance ===
age 0.5075602724386779
ever_married 0.13710186684994718
bmi 0.07765347167338886
work_type 0.06988682643228178
smoking_status 0.06774291268501507
avg_glucose_level 0.0636498951112942
gender 0.046772613705863685
hypertension 0.020356050060523087
heart_disease 0.0092760910430

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


Mean
Model Run Time: 699.84
CPU times: user 42min 38s, sys: 6min 43s, total: 49min 22s
Wall time: 15min 14s


Unnamed: 0,Model,Score,StdDev,RunTime
0,lgbm2,0.887984,0.008183,5.850267
2,cat1,0.884952,0.008027,699.839236
1,lgbm1,0.863674,0.011159,208.420835


### Linear Models

In [25]:
%%time
model_lst = ["svc", "log_reg", "log_reg2", "log_reg3"]
# model_lst = []
all_cv_scores = run_models4features(model_clf_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    

Model=svc
Accuracy: 0.9314
Precision: 0.1379
Recall: 0.1270
ROC: 0.5465
f1: 0.1322
fold: 1, Score: 0.54646209074002, Run Time: 19.28
Accuracy: 0.9399
Precision: 0.1282
Recall: 0.0794
ROC: 0.5281
f1: 0.0980
fold: 2, Score: 0.52810215821115, Run Time: 19.83
Accuracy: 0.9484
Precision: 0.2414
Recall: 0.1094
ROC: 0.5472
f1: 0.1505
fold: 3, Score: 0.5471892041581459, Run Time: 19.92
Accuracy: 0.9393
Precision: 0.1463
Recall: 0.0938
ROC: 0.5349
f1: 0.1143
fold: 4, Score: 0.5349458929788685, Run Time: 18.90
Accuracy: 0.9431
Precision: 0.1471
Recall: 0.0794
ROC: 0.5298
f1: 0.1031
fold: 5, Score: 0.5297984224364592, Run Time: 19.53
Accuracy: 0.9386
Precision: 0.1395
Recall: 0.0952
ROC: 0.5350
f1: 0.1132
fold: 6, Score: 0.5350082773395658, Run Time: 19.98
Accuracy: 0.9366
Precision: 0.1458
Recall: 0.1111
ROC: 0.5416
f1: 0.1261
fold: 7, Score: 0.5415814587593729, Run Time: 19.55
Accuracy: 0.9412
Precision: 0.1351
Recall: 0.0794
ROC: 0.5288
f1: 0.1000
fold: 8, Score: 0.5287759275489337, Run Time: 

Unnamed: 0_level_0,pred_svc
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


Mean
Model Run Time: 195.06
Model=log_reg
Accuracy: 0.9562
Precision: 0.0000
Recall: 0.0000
ROC: 0.4986
f1: 0.0000
fold: 1, Score: 0.4986376021798365, Run Time: 0.08
Accuracy: 0.9569
Precision: 0.2000
Recall: 0.0159
ROC: 0.5066
f1: 0.0294
fold: 2, Score: 0.5065741101163445, Run Time: 0.10
Accuracy: 0.9582
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 3, Score: 0.5, Run Time: 0.09
Accuracy: 0.9575
Precision: 0.4000
Recall: 0.0312
ROC: 0.5146
f1: 0.0580
fold: 4, Score: 0.5146025051124744, Run Time: 0.09


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9601
Precision: 0.7500
Recall: 0.0476
ROC: 0.5235
f1: 0.0896
fold: 5, Score: 0.5234686921803486, Run Time: 0.10
Accuracy: 0.9608
Precision: 0.8000
Recall: 0.0635
ROC: 0.5314
f1: 0.1176
fold: 6, Score: 0.5314052001168565, Run Time: 0.10
Accuracy: 0.9582
Precision: 0.3333
Recall: 0.0159
ROC: 0.5073
f1: 0.0303
fold: 7, Score: 0.5072548446781575, Run Time: 0.09
Accuracy: 0.9601
Precision: 1.0000
Recall: 0.0317
ROC: 0.5159
f1: 0.0615
fold: 8, Score: 0.5158730158730158, Run Time: 0.10
Accuracy: 0.9562
Precision: 0.1667
Recall: 0.0159
ROC: 0.5062
f1: 0.0290
fold: 9, Score: 0.5062323497906319, Run Time: 0.09
Accuracy: 0.9575
Precision: 0.3750
Recall: 0.0476
ROC: 0.5221
f1: 0.0845
fold: 10, Score: 0.522105365663648, Run Time: 0.09
Scores -> Adjusted: 0.50243709 , mean: 0.51261537, std: 0.01017828


Unnamed: 0_level_0,pred_log_reg
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


Mean
Model Run Time: 1.07
Model=log_reg2
Accuracy: 0.9589
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 1, Score: 0.5, Run Time: 0.10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9589
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 2, Score: 0.5, Run Time: 0.11
Accuracy: 0.9582
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 3, Score: 0.5, Run Time: 0.13


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9582
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 4, Score: 0.5, Run Time: 0.15
Accuracy: 0.9588
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 5, Score: 0.5, Run Time: 0.14


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9588
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 6, Score: 0.5, Run Time: 0.19


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9588
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 7, Score: 0.5, Run Time: 0.26
Accuracy: 0.9588
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 8, Score: 0.5, Run Time: 0.19


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9588
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 9, Score: 0.5, Run Time: 0.11
Accuracy: 0.9588
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 10, Score: 0.5, Run Time: 0.14
Scores -> Adjusted: 0.50000000 , mean: 0.50000000, std: 0.00000000


Unnamed: 0_level_0,pred_log_reg2
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


Mean
Model Run Time: 1.66
Model=log_reg3
Accuracy: 0.9562
Precision: 0.0000
Recall: 0.0000
ROC: 0.4986
f1: 0.0000
fold: 1, Score: 0.4986376021798365, Run Time: 0.10
Accuracy: 0.9569
Precision: 0.2000
Recall: 0.0159
ROC: 0.5066
f1: 0.0294
fold: 2, Score: 0.5065741101163445, Run Time: 0.13
Accuracy: 0.9582
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 3, Score: 0.5, Run Time: 0.15


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9575
Precision: 0.4000
Recall: 0.0312
ROC: 0.5146
f1: 0.0580
fold: 4, Score: 0.5146025051124744, Run Time: 0.15
Accuracy: 0.9601
Precision: 0.7500
Recall: 0.0476
ROC: 0.5235
f1: 0.0896
fold: 5, Score: 0.5234686921803486, Run Time: 0.14
Accuracy: 0.9608
Precision: 0.8000
Recall: 0.0635
ROC: 0.5314
f1: 0.1176
fold: 6, Score: 0.5314052001168565, Run Time: 0.15
Accuracy: 0.9575
Precision: 0.0000
Recall: 0.0000
ROC: 0.4993
f1: 0.0000
fold: 7, Score: 0.4993183367416496, Run Time: 0.14
Accuracy: 0.9601
Precision: 1.0000
Recall: 0.0317
ROC: 0.5159
f1: 0.0615
fold: 8, Score: 0.5158730158730158, Run Time: 0.14
Accuracy: 0.9562
Precision: 0.1667
Recall: 0.0159
ROC: 0.5062
f1: 0.0290
fold: 9, Score: 0.5062323497906319, Run Time: 0.15
Accuracy: 0.9582
Precision: 0.4286
Recall: 0.0476
ROC: 0.5224
f1: 0.0857
fold: 10, Score: 0.5224461972928232, Run Time: 0.15
Scores -> Adjusted: 0.50097065 , mean: 0.51185580, std: 0.01088515


Unnamed: 0_level_0,pred_log_reg3
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


Mean
Model Run Time: 1.53
CPU times: user 3min 22s, sys: 6.14 s, total: 3min 29s
Wall time: 3min 19s


In [26]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
0,lgbm2,0.887984,0.008183,5.850267
2,cat1,0.884952,0.008027,699.839236
1,lgbm1,0.863674,0.011159,208.420835
3,svc,0.53629,0.007356,195.055802
4,log_reg,0.512615,0.010178,1.068521
6,log_reg3,0.511856,0.010885,1.53108
5,log_reg2,0.5,0.0,1.660488


## Average Models

In [27]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgbr"]  )
    + (sample_submission["target_lgbm1"])
    + (sample_submission["target_lgbm2"]*2)
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_svc"] )
    + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

In [28]:
sample_submission[["id", TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[["id", TARGET]].tail(8)

Unnamed: 0,id,stroke
10196,25500,0.001546
10197,25501,0.004241
10198,25502,0.022026
10199,25503,0.00211
10200,25504,0.012275
10201,25505,0.000998
10202,25506,0.004403
10203,25507,0.000802


In [29]:
sample_submission[TARGET].value_counts()

0.012539    2
0.001236    2
0.000908    2
0.001347    2
0.000983    2
           ..
0.014277    1
0.522103    1
0.002885    1
0.029810    1
0.000802    1
Name: stroke, Length: 10196, dtype: int64