In [75]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e11/sample_submission.csv
/kaggle/input/playground-series-s5e11/train.csv
/kaggle/input/playground-series-s5e11/test.csv


In [76]:
# Read the competition's train and test splits (in that order) into DataFrames.
train_df, test_df = map(
    pd.read_csv,
    (
        '/kaggle/input/playground-series-s5e11/train.csv',
        '/kaggle/input/playground-series-s5e11/test.csv',
    ),
)

# Last expression of the cell: show the first rows of the training data.
train_df.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [77]:
# Rows without a label cannot be used for supervised training, so drop them.
train_df = train_df.dropna(subset=['loan_paid_back'])

# Target vector, plus the feature frames for both splits.
# "id" is removed from the features; the target is removed from the training features.
y = train_df["loan_paid_back"]
X_train = train_df.drop(columns=["id", "loan_paid_back"])
X_test = test_df.drop(columns=["id"])

In [78]:
import numpy as np

def add_engineered_features(df):
    """Return a copy of ``df`` with the engineered loan features appended.

    The same transformation is applied to the train and test frames through
    this single function so the two can never drift apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``annual_income``, ``loan_amount``,
        ``credit_score`` and ``interest_rate``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns added (or overwritten if present):
        ``log_annual_income``, ``log_loan_amount``, ``loan_to_income_ratio``,
        ``credit_to_loan_ratio``, ``loan_income_interest`` and
        ``credit_income_interaction``.
    """
    out = df.copy()

    # log1p transforms of the two monetary columns.
    out['log_annual_income'] = np.log1p(out['annual_income'])
    out['log_loan_amount'] = np.log1p(out['loan_amount'])

    # Ratio features.
    out['loan_to_income_ratio'] = out['loan_amount'] / out['annual_income']
    out['credit_to_loan_ratio'] = out['credit_score'] / out['loan_amount']

    # Interaction features built from the columns created above.
    out['loan_income_interest'] = out['loan_to_income_ratio'] * out['interest_rate']
    out['credit_income_interaction'] = out['credit_score'] * out['log_annual_income']

    # A zero denominator (or log1p(-1)) yields +/-inf, which the downstream
    # scikit-learn steps reject. Turn those into NaN so the numeric imputer
    # in the pipeline can fill them like any other missing value.
    engineered = [
        'log_annual_income',
        'log_loan_amount',
        'loan_to_income_ratio',
        'credit_to_loan_ratio',
        'loan_income_interest',
        'credit_income_interaction',
    ]
    out[engineered] = out[engineered].replace([np.inf, -np.inf], np.nan)
    return out


X_train = add_engineered_features(X_train)
X_test = add_engineered_features(X_test)

In [79]:
# Numeric model inputs. The raw annual_income and loan_amount columns are not
# listed here; their log_* counterparts are used instead, followed by the
# engineered ratio and interaction columns.
numerical_columns = [
    'log_annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'log_loan_amount',
    'interest_rate',
    'loan_to_income_ratio',
    'credit_to_loan_ratio',
    'loan_income_interest',
    'credit_income_interaction',
]

# Categorical model inputs.
categorical_columns = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
]


In [80]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor

In [81]:
# Numeric columns: fill missing values with the column mean.
numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_columns),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

# Random forest classifier with a fixed seed.
model = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)

# Full pipeline: preprocessing followed by the classifier.
clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# 5-fold cross-validated ROC AUC on the training data.
scores = cross_val_score(clf, X_train, y, cv=5, scoring='roc_auc')
print("CV ROC AUC:", scores.mean())

CV ROC AUC: 0.8996935934897674


In [82]:
# Fit the full pipeline on all labelled training rows.
clf.fit(X_train, y)

# The model is validated with ROC AUC, which scores how well the predictions
# *rank* the rows. Hard 0/1 labels from predict() collapse that ranking to two
# values, so submit the predicted probability of the positive class instead.
# The column is looked up through classes_ rather than assumed to be index 1.
positive_col = list(clf.classes_).index(1)
y_pred = clf.predict_proba(X_test)[:, positive_col]

submission = pd.DataFrame({
    "id": test_df["id"],
    "loan_paid_back": y_pred
})

# Save to CSV
# NOTE(review): the file name refers to "exam_score" although the target is
# loan_paid_back; it looks like a leftover from another notebook. It is kept
# unchanged here so existing references to the output file keep working.
submission.to_csv("exam_score_predictions.csv", index=False)