In [None]:
### Here a Use a Sklearn Pipeline to automate the cleaning, standardizing and training Of A Logistic Regression

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src import config, data_utils

app_train, app_test, columns_description = data_utils.get_datasets()


In [None]:
# Show distribution of target variable 

df = app_train.reset_index().groupby(['TARGET']).size().to_frame('Total')

df['Percentage'] = df['Total'].div(df['Total'].sum()).mul(100)

print(df)

sns.barplot(x='Total', y='Percentage', data=df)

In [None]:
# Use MatPlotLib to Plot For Example = the family status of the applicants.

df = app_train.groupby(['NAME_FAMILY_STATUS']).size().to_frame('TotalFS').reset_index('NAME_FAMILY_STATUS')

plt.figure(figsize=(12,8))

ax =sns.barplot(x='NAME_FAMILY_STATUS', y='TotalFS', data=df)

plt.show

In [None]:

X_train, y_train, X_test, y_test = data_utils.get_feature_target(app_train, app_test)

X_train, X_val, y_train, y_val = data_utils.get_train_val_sets(X_train, y_train)

# Set the Columns For Categorical

ordinal_features = X_train.select_dtypes(include=["object"]).nunique()==2
ordinal_columns = ordinal_features[ordinal_features].index.tolist()

onehot_features = X_train.select_dtypes(include=["object"]).nunique() > 2
onehot_columns = onehot_features[onehot_features].index.tolist()

# Set the Pipeline

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(missing_values=np.nan, strategy="median")),
    ('scaler', MinMaxScaler())],           
)

categorical_ord_transformer = Pipeline(
    steps=[
    ('imputerall', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),           
    ("ordinal", OrdinalEncoder()),
    ('scaler', MinMaxScaler()),           
    ]
)

categorical_transformer = Pipeline(
    steps=[
    ('imputerall', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),           
    ("encoder", OneHotEncoder(drop='first',sparse_output=False)),
    ('scaler', MinMaxScaler()),           
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="object") ),
        ("catord", categorical_ord_transformer, ordinal_columns ),
        ("cat", categorical_transformer, onehot_columns ),
    ]
)

# Preprocess all the data

pipe = Pipeline(
    steps=[("preprocessor", preprocessor), 
    ("classifier", LogisticRegression(C=0.0001))]
)

pipe.fit(X_train, y_train)

# Predict 

roc_auc_pipe = pipe.predict_proba(X_train)[:, 1]
roc_auc_pipe

print(f"Train ROC AUC Score: {roc_auc_pipe[4]}") 

In [15]:
### In this a Make my own experimentation process with Another Model

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier

import numpy as np
from src import config, data_utils

app_train, app_test, columns_description = data_utils.get_datasets()

# Assign to X_train all the columns from app_train except "TARGET", "DAYS_EMPLOYED", "CODE_GENDER"
X_train = app_train.drop(["DAYS_EMPLOYED","TARGET","CODE_GENDER"], axis=1)

# Assign to y_train the "TARGET" column
y_train = app_train["TARGET"]

# Assign to X_test all the columns from app_test except "TARGET"
X_test = app_test.drop(["DAYS_EMPLOYED","TARGET","CODE_GENDER"], axis=1)

# Assign to y_test the "TARGET" column
y_test = app_test["TARGET"]

# Split Train and Test Data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42, shuffle=True)

# Set the Pipeline
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(missing_values=np.nan, strategy="median")), ("scaler", StandardScaler())],
)

categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop='first',sparse_output=False)),
        ("scaler", StandardScaler()),        
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="object")),
        ("cat", categorical_transformer, selector(dtype_include="object")),
    ]
)

# Preprocess all the data

clf = Pipeline(
    steps=[("preprocessor", preprocessor), 
    ("classifier", DecisionTreeClassifier(random_state=0, max_depth=10))]
)

# Fit The Model 

clf.fit(X_train, y_train)

# Predict
roc_auc_pipe = clf.predict_proba(X_train)[:, 1]
roc_auc_pipe

print(f"Train ROC AUC Score: {roc_auc_pipe[1]}") 


Train ROC AUC Score: 0.01378937407056915


In [None]:
#In a Plus I Train a LightGBM model

import lightgbm as lgb
import numpy as np

from src import preprocessing

In [None]:
# Preproccessing For The Train Data
train_data, val_data, test_data = preprocessing.preprocess_data(X_train, X_val, X_test)

In [None]:
### See How It Performs

gbm = lgb.LGBMClassifier(learning_rate=0.2, first_metric_only = True)

gbm.fit(train_data, y_train,eval_set =[(test_data,y_test)] , eval_metric=['auc'], callbacks=[lgb.early_stopping(stopping_rounds=100)])

best_credit_model_ever = gbm

test_preds = best_credit_model_ever.predict_proba(test_data)[:, 1]
test_preds