In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv("data/train.csv")
X = df_train.iloc[:, :-1].values
y = df_train.iloc[:, -1].values

df_kaggle = pd.read_csv("data/test.csv")
X_kaggle = df_kaggle.iloc[:, :].values

In [3]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 24:30] = imputer.fit_transform(X[:, 24:30])

imputer_kaggle = SimpleImputer(missing_values=np.nan, strategy="mean")
X_kaggle[:, 24:30] = imputer_kaggle.fit_transform(X_kaggle[:, 24:30])

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [0, 1, 2])], remainder="passthrough")
X = np.array(ct.fit_transform(X))
X_kaggle = np.array(ct.transform(X_kaggle))

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [11]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train[:, 33:] = sc.fit_transform(X_train[:, 33:])
X_test[:, 33:] = sc.transform(X_test[:, 33:])

sc_kaggle = StandardScaler()
X_kaggle[:, 33:] = sc_kaggle.fit_transform(X_kaggle[:, 33:])

In [12]:
from catboost import CatBoostClassifier

classifier = CatBoostClassifier()
classifier.fit(X_train, y_train)

Learning rate set to 0.033658
0:	learn: 0.6919986	total: 154ms	remaining: 2m 33s
1:	learn: 0.6908382	total: 159ms	remaining: 1m 19s
2:	learn: 0.6896347	total: 166ms	remaining: 55s
3:	learn: 0.6884928	total: 171ms	remaining: 42.6s
4:	learn: 0.6875205	total: 176ms	remaining: 35.1s
5:	learn: 0.6865347	total: 182ms	remaining: 30.2s
6:	learn: 0.6855482	total: 188ms	remaining: 26.7s
7:	learn: 0.6846048	total: 193ms	remaining: 24s
8:	learn: 0.6837919	total: 199ms	remaining: 21.9s
9:	learn: 0.6829916	total: 204ms	remaining: 20.2s
10:	learn: 0.6822366	total: 209ms	remaining: 18.8s
11:	learn: 0.6815112	total: 214ms	remaining: 17.7s
12:	learn: 0.6808280	total: 220ms	remaining: 16.7s
13:	learn: 0.6801484	total: 225ms	remaining: 15.9s
14:	learn: 0.6795131	total: 231ms	remaining: 15.1s
15:	learn: 0.6789768	total: 236ms	remaining: 14.5s
16:	learn: 0.6783917	total: 241ms	remaining: 13.9s
17:	learn: 0.6778305	total: 246ms	remaining: 13.4s
18:	learn: 0.6772155	total: 252ms	remaining: 13s
19:	learn: 0.67

<catboost.core.CatBoostClassifier at 0x2a956985f40>

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = classifier.predict(X_test)
acc_sc = accuracy_score(y_test, y_pred)
cf_mx = confusion_matrix(y_test, y_pred)
print("Accuracy Score:", acc_sc)
print(cf_mx)

Accuracy Score: 0.608
[[1151  838]
 [ 730 1281]]


In [14]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print(f"Accuracy: {accuracies.mean()}")
print(f"StdDev: {accuracies.std()}")

Learning rate set to 0.032178
0:	learn: 0.6920255	total: 5.81ms	remaining: 5.8s
1:	learn: 0.6908122	total: 11.5ms	remaining: 5.72s
2:	learn: 0.6896330	total: 16.9ms	remaining: 5.6s
3:	learn: 0.6885297	total: 22.8ms	remaining: 5.67s
4:	learn: 0.6875802	total: 28.5ms	remaining: 5.67s
5:	learn: 0.6866882	total: 34.2ms	remaining: 5.66s
6:	learn: 0.6857355	total: 39.7ms	remaining: 5.63s
7:	learn: 0.6848420	total: 44.9ms	remaining: 5.56s
8:	learn: 0.6840578	total: 50.2ms	remaining: 5.53s
9:	learn: 0.6833049	total: 55.3ms	remaining: 5.47s
10:	learn: 0.6823809	total: 60.3ms	remaining: 5.42s
11:	learn: 0.6816574	total: 64.6ms	remaining: 5.32s
12:	learn: 0.6809948	total: 69.7ms	remaining: 5.29s
13:	learn: 0.6803138	total: 74.8ms	remaining: 5.26s
14:	learn: 0.6797578	total: 79.7ms	remaining: 5.23s
15:	learn: 0.6791214	total: 85.1ms	remaining: 5.23s
16:	learn: 0.6785273	total: 90.6ms	remaining: 5.24s
17:	learn: 0.6778817	total: 95.7ms	remaining: 5.22s
18:	learn: 0.6773037	total: 101ms	remaining: 5

In [15]:
y_kaggle = classifier.predict(X_kaggle)

In [16]:
with open("data/kaggle.csv", mode="w") as out:
    out.write("id_solicitante,inadimplente")
    for i in range(len(y_kaggle)):
        out.write(f"\n{20001+i},{y_kaggle[i]}")