# Simple Logistic Regression on Fire 🔥

In [1]:
import os
import numpy as np
import pandas as pd

import seaborn as sn
import matplotlib.pyplot as plt

## Import Data

In [2]:
# Read the three competition CSVs, in order: train, test, submission template.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/bankruptcy-risk-prediction/train.csv',
        '../input/bankruptcy-risk-prediction/test.csv',
        '../input/bankruptcy-risk-prediction/submission_example.csv',
    )
)

In [3]:
# Stack the train rows on top of the test rows under a fresh 0..n-1 index, so
# the first len(df_train) rows of `df` are the training rows.
df = pd.concat(objs=[df_train, df_test], axis='index', ignore_index=True)

In [4]:
# Partition the column names by dtype: object columns are treated as
# categorical, everything else as numerical.
object_mask = df.dtypes == 'object'

# The target is appended as the LAST entry of the categorical list.
cat_features = [*df.columns[object_mask], 'bankruptcy']

# The numerical list excludes the target and the row identifier.
num_features = list(df.columns[~object_mask])
num_features.remove('bankruptcy')
num_features.remove('id')

## Data Preparation

1. Drop the **`id`** column
2. **Encode the categorical variables** as numbers: each category is replaced by the mean bankruptcy rate of the rows in that category
3. **Standardize** the data

In [5]:
# Remove the row identifier. drop() returns a new frame, so `df` itself is
# left untouched.

df1 = df.drop(columns=['id'])

In [6]:
# Target-encode the categorical features: every category is replaced by the
# mean of `bankruptcy` over the rows that share it (NaN targets are skipped by
# the mean; presumably those are the test rows -- confirm against test.csv).
# NOTE(review): the means use every labelled row in the frame, so any later
# cross-validation on these encoded columns has already seen the labels of its
# own validation folds and will score optimistically.

df2 = df1.copy()

# Fallback for rows whose category has no labelled example (or is missing):
# without it the encoding would be NaN and flow into the scaler and the model.
overall_rate = df2['bankruptcy'].mean()

for feature in cat_features[:-1]:  # the last entry is the target itself
    # Use the 'mean' alias instead of np.mean: passing a NumPy callable to
    # groupby.transform is deprecated in recent pandas.
    df2[feature] = (
        df2.groupby(feature)['bankruptcy']
        .transform('mean')
        .fillna(overall_rate)
    )

In [7]:
#standardize all the numerical features

from sklearn.preprocessing import StandardScaler

# Scale every column except the last one, leaving that last column as-is
# (assumes `bankruptcy` is the last column -- confirm against train.csv).
# The scaler is fitted on all rows of the frame, train and test together.
scaler = StandardScaler()
df3 = df2.copy()
feature_columns = df3.columns[:-1]
df3[feature_columns] = scaler.fit_transform(df3[feature_columns])

In [8]:
# Undo the earlier concatenation: the first len(df_train) rows are the
# training rows, the remainder are the rows to score. The last column is taken
# as the target, everything before it as features.

n_train = len(df_train)
train_rows = df3.iloc[:n_train]

X_train = train_rows.iloc[:, :-1].copy()
y_train = train_rows.iloc[:, -1].copy()

X_test = df3.iloc[n_train:, :-1].copy()

## Evaluation Metric

In [9]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, cross_val_score

def kf_cross_val(model, X, y):
    """Return the per-fold ROC AUC of ``model`` under shuffled 5-fold CV.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``
        Refitted in place on every fold, so on return it holds the fit from
        the last fold only.
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : pandas.Series
        Target aligned row-for-row with ``X``; column 1 of ``predict_proba``
        is scored against it.

    Returns
    -------
    list
        One ROC AUC value per fold, in fold order.
    """
    scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, test_index in kf.split(X, y):
        # KFold yields *positional* indices, so every selection goes through
        # .iloc. Using .loc would treat the integers as labels and fail (or
        # silently pick the wrong rows) whenever the index is not 0..n-1.
        x_train, y_train = X.iloc[train_index], y.iloc[train_index]
        x_test, y_test = X.iloc[test_index], y.iloc[test_index]
        model.fit(x_train, y_train)
        y_pred = model.predict_proba(x_test)[:, 1]
        scores.append(roc_auc_score(y_test, y_pred))
    return scores

## Logistic Regression

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Tune the inverse regularisation strength C of a logistic regression by an
# exhaustive 5-fold grid search scored on ROC AUC. np.logspace(-10, 10) gives
# 50 log-spaced candidates between 1e-10 and 1e10.
lr_param = {'C': np.logspace(-10, 10)}
lr = LogisticRegression(max_iter=2000, random_state=0)

lr_cv = GridSearchCV(
    estimator=lr,
    param_grid=lr_param,
    scoring='roc_auc',
    cv=5,
)
lr_cv.fit(X_train, y_train)

# Last expression of the cell: display the winning parameter setting.
lr_cv.best_params_

{'C': 0.0022229964825261957}

In [11]:
from sklearn.model_selection import cross_validate

# Rebuild the classifier with the C chosen by the grid search and estimate its
# ROC AUC with the shuffled 5-fold helper.
lr_best = LogisticRegression(C=lr_cv.best_params_['C'],
                             max_iter=2000,
                             random_state=0)

# No fit on the full training set is needed first: kf_cross_val calls
# model.fit on each fold, which would overwrite any earlier fit.
scores = kf_cross_val(lr_best, X_train, y_train)

# Last expression of the cell: mean fold score, rounded to 5 decimals.
round(np.mean(scores), 5)

0.81034

In [12]:
# Refit on the full training set and keep the probability from column 1 of
# predict_proba for every test row.
y_pred = lr_best.fit(X_train, y_train).predict_proba(X_test)[:, 1]

# The predictions are assigned positionally, so the submission template is
# assumed to list its rows in the same order as X_test -- TODO confirm.
submission['proba'] = y_pred
submission.to_csv('submission.csv', index=False)

# Last expression of the cell: preview the first rows that were written.
submission.head()

Unnamed: 0,id,proba
0,800,0.291856
1,801,0.254246
2,802,0.313673
3,803,0.164785
4,804,0.359552
