# Setup

In [36]:
# Inject CSS that sets the notebook's `.container` element to 100% width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Suppress all warnings for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

import warnings
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
# import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder, 
    KBinsDiscretizer, 
    FunctionTransformer,
    MinMaxScaler,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    GridSearchCV, 
    validation_curve, 
    KFold,
)
from sklearn.metrics import (
    plot_confusion_matrix, 
    accuracy_score, 
    precision_score, 
    recall_score, 
    confusion_matrix, 
    f1_score,
)

# Load data

In [10]:
# Read the application training CSV into a DataFrame.
app_train = pd.read_csv('../data/application_train.csv')

In [23]:
# Show the column labels of the loaded table.
app_train.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

# Split data

In [45]:
# Separate the label column from the predictor columns.
y = app_train["TARGET"]
X = app_train.drop(columns="TARGET")

# Hold out a test partition using the default split proportions; the fixed
# seed keeps the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=84)

In [46]:
X_train.shape, y_train.shape

((230633, 121), (230633,))

# Pre-pre-process

In [None]:
# Planned pre-processing steps:
# - winsorize AMT_INCOME_TOTAL
# - create the higher_education variable
# - take the absolute value of DAYS_BIRTH

# Pre-process

In [41]:
def winsorize_and_log(a):
    """Winsorize the extreme 1% in each tail of ``a`` and return its natural log.

    Parameters
    ----------
    a : array-like
        Numeric values. They must be positive for the logarithm to be defined.

    Returns
    -------
    numpy.ma.MaskedArray
        Natural log of the winsorized values, with the same shape as ``a``.
    """
    # `limits` is the FRACTION of observations to clip from each tail, not a
    # pair of percentile cut-offs: (0.01, 0.01) clips the lowest and the
    # highest 1%.  Passing (.01, .99) would clip 99% of the upper tail and
    # flatten almost every value to a single number.
    a = winsorize(a, limits=(0.01, 0.01))
    return np.log(a)

In [42]:
def categorize_education_var(a):
    """Flag secondary-level education entries with 1 and everything else with 0.

    Parameters
    ----------
    a : array-like
        Education labels. May be a list, a NumPy array, a pandas Series or a
        (single- or multi-column) DataFrame.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``a``: 1 where the label is
        "Secondary / secondary special" or "Lower secondary", otherwise 0.
    """
    # The two labels are matched individually; comparing against the joined
    # "A|B" string never equals either real label.
    secondary_levels = ("Secondary / secondary special", "Lower secondary")
    # Convert to an array first: iterating a DataFrame directly would walk
    # over its column names instead of its values.
    values = np.asarray(a, dtype=object)
    return np.isin(values, secondary_levels).astype(int)

In [43]:
def log_abs(a):
    """Return the natural log of the absolute value of ``a``."""
    return np.log(np.abs(a))


# Column-wise preprocessing; the names in the comments are the team members
# who requested each feature.
ct = ColumnTransformer(
    [
        ( # Bora, Sailan, Kahmin
            # handle_unknown="ignore": a category that was not seen during
            # fit is encoded as all zeros instead of raising at transform time.
            "dummies", OneHotEncoder(handle_unknown="ignore"),
            ["NAME_HOUSING_TYPE", "NAME_INCOME_TYPE", "NAME_FAMILY_STATUS"]
        ),

        ( # Bora, Kahmin
            # The log is taken of the absolute value so that negative inputs
            # do not turn into NaN.
            # NOTE(review): assumes DAYS_BIRTH is stored as negative day
            # offsets -- confirm against the data.
            "logs", FunctionTransformer(log_abs),
            ["AMT_ANNUITY", "DAYS_BIRTH"]
        ),

        ( # Maha
            "winsorized", FunctionTransformer(winsorize_and_log),
            ["AMT_INCOME_TOTAL"]
        ),

        ( # Lennard
            "categorize_education", FunctionTransformer(categorize_education_var),
            ["NAME_EDUCATION_TYPE"]
        ),

        ( # Sailan
            "minmax", MinMaxScaler(),
            ["AMT_CREDIT"]
        ),

        ( # Lennard, Maha, Kahmin
            # NOTE(review): these columns are forwarded untouched. If
            # NAME_CONTRACT_TYPE holds strings, or either column has missing
            # values, LogisticRegression cannot consume them -- confirm.
            "passthroughs", "passthrough",
            ["NAME_CONTRACT_TYPE", "CNT_FAM_MEMBERS"]
        ),
    ],
    # Always return a dense array so the result can be wrapped in a DataFrame.
    sparse_threshold=0,
)

In [44]:
# Learn the preprocessing parameters (one-hot categories, min/max range) from
# the training split only.
X_train_processed = pd.DataFrame(ct.fit_transform(X_train))
# Re-use the already fitted transformer on the test split. Re-fitting here
# would encode and scale the test data differently from the training data.
X_test_processed = pd.DataFrame(ct.transform(X_test))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

# Data overview

In [None]:
# Count missing values in every column of the loaded table.
# `app_train` is the only frame loaded in this notebook; the previous `df`
# was never defined.
print("Missing values per column: ")
print("")
app_train.isna().sum()

# Model

In [None]:
# Logistic-regression classifier with default settings, trained on the
# pre-processed training features.
log_reg = LogisticRegression()
log_reg.fit(X=X_train_processed, y=y_train)

## Kfold validation

In [None]:
# 10-fold cross-validation: fit one logistic regression per fold and keep
# every fitted model together with its held-out accuracy.
accuracies = []
model = []

kf = KFold(n_splits=10)

# NOTE(review): `df_merged_bklms` is not defined in the visible part of this
# notebook -- confirm it is created before this cell runs and that its rows
# line up with app_train.
X = df_merged_bklms
y = app_train["TARGET"]

# KFold.split yields integer row positions. Use `.iloc` for pandas objects
# (plain `X[positions]` on a DataFrame looks up columns, not rows) and
# ordinary indexing for NumPy arrays.
X_rows = X.iloc if hasattr(X, "iloc") else X
y_rows = y.iloc if hasattr(y, "iloc") else y

for train_index, test_index in kf.split(X):
    # Fold-specific names so the earlier train/test split is not overwritten.
    X_fold_train, X_fold_test = X_rows[train_index], X_rows[test_index]
    y_fold_train, y_fold_test = y_rows[train_index], y_rows[test_index]

    # Fit and score inside the loop so that every fold contributes a model.
    lr = LogisticRegression()
    lr.fit(X_fold_train, y_fold_train)
    predictions = lr.predict(X_fold_test)

    model.append(lr)
    accuracies.append(accuracy_score(y_fold_test, predictions))

# Position of the most accurate fold. Comparing the list with `==` gives a
# single boolean rather than an index.
index = int(np.argmax(accuracies))

print('The best model has accuracy of: ', accuracies[index])
print('Coeficients are: ', model[index].coef_)