In [1]:
import pandas as pd
import numpy as np

In [2]:
# Training data: the last column is taken as the target, every column
# before it as a feature.
df_train = pd.read_csv("data/train.csv")
X, y = df_train.iloc[:, :-1].values, df_train.iloc[:, -1].values

# Kaggle scoring data: all columns are used as features.
df_kaggle = pd.read_csv("data/test.csv")
X_kaggle = df_kaggle.values

In [3]:
from sklearn.impute import SimpleImputer

# Column positions 24..29 are the ones that get mean-imputed. Naming the
# range once guarantees the training and Kaggle matrices use the same one.
IMPUTED_COLS = slice(24, 30)

# Learn the per-column means from the training data only.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, IMPUTED_COLS] = imputer.fit_transform(X[:, IMPUTED_COLS])

# Fill the Kaggle set with the *training* means. Fitting a second imputer on
# the Kaggle rows would fill them with different statistics than the ones the
# model sees during training, making the two feature matrices inconsistent.
X_kaggle[:, IMPUTED_COLS] = imputer.transform(X_kaggle[:, IMPUTED_COLS])

# Kept as an alias so any later reference to `imputer_kaggle` still resolves;
# it is intentionally the same training-fitted object.
imputer_kaggle = imputer

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the columns at positions 0, 1 and 2 and pass every other
# column through untouched.
#
# sparse_threshold=0 forces a dense result. OneHotEncoder emits a sparse
# matrix, and with the default threshold ColumnTransformer may return a SciPy
# sparse matrix for the whole output; np.array() would then wrap it in a 0-d
# object array and the later `[:, 33:]` slicing would fail.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0, 1, 2])],
    remainder="passthrough",
    sparse_threshold=0,
)

# Fit the encoder on the training matrix only, then apply the same fitted
# encoding to the Kaggle matrix so both end up with identical column layouts.
X = np.array(ct.fit_transform(X))
X_kaggle = np.array(ct.transform(X_kaggle))

In [5]:
# Hold-out split is currently disabled, so the model is trained on every row.
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, y_train = X, y  # plain aliases (no copy): X_train is the same array object as X

In [6]:
from sklearn.preprocessing import StandardScaler

# Only columns from this offset onward are standardised.
# NOTE(review): presumably columns 0..32 are the one-hot indicators produced
# by the encoder above; confirm against the encoder's actual output width.
FIRST_SCALED_COL = 33

# Learn mean and standard deviation from the training rows only.
sc = StandardScaler()
X_train[:, FIRST_SCALED_COL:] = sc.fit_transform(X_train[:, FIRST_SCALED_COL:])
# X_test[:, 33:] = sc.transform(X_test[:, 33:])

# Standardise the Kaggle rows with the *training* statistics. A scaler fitted
# separately on the Kaggle set would map the same raw value to a different
# scaled value than the one the classifier was trained on.
X_kaggle[:, FIRST_SCALED_COL:] = sc.transform(X_kaggle[:, FIRST_SCALED_COL:])

# Kept as an alias so any later reference to `sc_kaggle` still resolves;
# it is intentionally the same training-fitted object.
sc_kaggle = sc

In [7]:
from catboost import CatBoostClassifier

# Classifier constructed with no explicit hyper-parameters, trained on the
# full training matrix. The bare call is left as the cell's last expression
# so the notebook still displays the fitted estimator.
classifier = CatBoostClassifier()
classifier.fit(X=X_train, y=y_train)

Learning rate set to 0.037023
0:	learn: 0.6918852	total: 164ms	remaining: 2m 43s
1:	learn: 0.6906166	total: 170ms	remaining: 1m 24s
2:	learn: 0.6892780	total: 177ms	remaining: 58.9s
3:	learn: 0.6879965	total: 183ms	remaining: 45.7s
4:	learn: 0.6869887	total: 190ms	remaining: 37.7s
5:	learn: 0.6859906	total: 196ms	remaining: 32.5s
6:	learn: 0.6849580	total: 203ms	remaining: 28.8s
7:	learn: 0.6840129	total: 209ms	remaining: 25.9s
8:	learn: 0.6831169	total: 216ms	remaining: 23.7s
9:	learn: 0.6822953	total: 222ms	remaining: 22s
10:	learn: 0.6814923	total: 228ms	remaining: 20.5s
11:	learn: 0.6807507	total: 234ms	remaining: 19.3s
12:	learn: 0.6801006	total: 241ms	remaining: 18.3s
13:	learn: 0.6794740	total: 246ms	remaining: 17.4s
14:	learn: 0.6788750	total: 252ms	remaining: 16.6s
15:	learn: 0.6782257	total: 258ms	remaining: 15.9s
16:	learn: 0.6776733	total: 264ms	remaining: 15.3s
17:	learn: 0.6769892	total: 271ms	remaining: 14.8s
18:	learn: 0.6764250	total: 277ms	remaining: 14.3s
19:	learn: 

<catboost.core.CatBoostClassifier at 0x2d0a8ed17c0>

In [8]:
# Disabled hold-out evaluation (accuracy + confusion matrix).
# It needs X_test / y_test, which only exist when the commented-out
# train_test_split call and the commented-out X_test scaling line are
# re-enabled; until then this cell must stay commented out.
# from sklearn.metrics import accuracy_score, confusion_matrix

# y_pred = classifier.predict(X_test)
# acc_sc = accuracy_score(y_test, y_pred)
# cf_mx = confusion_matrix(y_test, y_pred)
# print("Accuracy Score:", acc_sc)
# print(cf_mx)

In [9]:
from sklearn.model_selection import cross_val_score
from datetime import datetime as dt

# Run stamp: written to the cross-validation log below and reused later to
# name the submission file.
timestamp = dt.today()

# 10-fold cross-validation of the classifier on the training data.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()
print(f"Accuracy: {mean_accuracy}")
print(f"StdDev: {std_accuracy}")

# Append one row per run to a CSV-style log. Mode "x" creates the file and
# fails if it already exists, so the header is written exactly once.
CV_LOG_PATH = "data/cross_val.log"
log_row = f"\n{timestamp},{mean_accuracy},{std_accuracy}"
try:
    with open(CV_LOG_PATH, mode="x") as cvlog:
        cvlog.write("timestamp,accuracy,stddev")
        cvlog.write(log_row)
except FileExistsError:
    # Only "the log already exists" is an expected condition. Any other
    # failure (missing directory, permissions, interrupt) must surface
    # instead of being swallowed by a bare `except`.
    with open(CV_LOG_PATH, mode="a") as cvlog:
        cvlog.write(log_row)

Learning rate set to 0.035394
0:	learn: 0.6919100	total: 6.84ms	remaining: 6.83s
1:	learn: 0.6905886	total: 12.1ms	remaining: 6.03s
2:	learn: 0.6892910	total: 17.9ms	remaining: 5.96s
3:	learn: 0.6881602	total: 23.9ms	remaining: 5.95s
4:	learn: 0.6870347	total: 29.3ms	remaining: 5.83s
5:	learn: 0.6858870	total: 36ms	remaining: 5.96s
6:	learn: 0.6849263	total: 41.5ms	remaining: 5.88s
7:	learn: 0.6840238	total: 46.7ms	remaining: 5.79s
8:	learn: 0.6832208	total: 52.3ms	remaining: 5.75s
9:	learn: 0.6824330	total: 57.4ms	remaining: 5.68s
10:	learn: 0.6816879	total: 62.6ms	remaining: 5.63s
11:	learn: 0.6808686	total: 68.3ms	remaining: 5.63s
12:	learn: 0.6801130	total: 73.8ms	remaining: 5.6s
13:	learn: 0.6795000	total: 79.2ms	remaining: 5.58s
14:	learn: 0.6789389	total: 85.4ms	remaining: 5.61s
15:	learn: 0.6783172	total: 91.5ms	remaining: 5.63s
16:	learn: 0.6777047	total: 97.1ms	remaining: 5.61s
17:	learn: 0.6771071	total: 102ms	remaining: 5.58s
18:	learn: 0.6765255	total: 108ms	remaining: 5.5

In [10]:
# Predict a label for every Kaggle row with `classifier`. X_kaggle must
# already have gone through the same imputation, encoding and scaling cells
# as X_train, otherwise its columns will not line up with the training matrix.
y_kaggle = classifier.predict(X_kaggle)

In [13]:
# The submission file is named after the run timestamp
# (day-month-year_hour"h"minute).
submission_path = f"data/kaggle/kaggle_{timestamp.strftime('%d-%m-%Y_%Hh%M')}.csv"
with open(submission_path, mode="w") as out:
    out.write("id_solicitante,inadimplente")
    # Ids are assigned sequentially from 20001, in prediction order. Each row
    # is prefixed with a newline, so the file has no trailing newline.
    out.writelines(
        f"\n{applicant_id},{prediction}"
        for applicant_id, prediction in enumerate(y_kaggle, start=20001)
    )