In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd

In [1]:
# Load the raw dataset from the repository-relative data directory.
df = pd.read_csv("../data/z.csv")

# Integer-encode every object-typed column. pd.factorize numbers the distinct
# values in order of first appearance and, by default, marks missing values
# with the sentinel code -1.
object_columns = [column for column in df.columns if str(df[column].dtype) == 'object']
for column in object_columns:
    df[column] = pd.factorize(df[column])[0]

NameError: name 'pd' is not defined

In [None]:
# Keep only columns that have non-null values in at least 80% of the rows,
# then remove the two columns that are not used as model features.
min_non_null = int(0.8 * df.shape[0])
df = (
    df.dropna(axis=1, thresh=min_non_null)
      .drop(["FLAG_MOBIL", "SK_ID_CURR"], axis=1)
)

In [None]:
# Impute missing feature values with the column's most frequent value,
# rounded to a whole number. TARGET is deliberately left untouched.
feature_columns = [name for name in df.columns if name != 'TARGET']
for column in feature_columns:
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(round(most_frequent))

In [None]:
# Discard every row that still contains at least one missing value.
# NOTE(review): after the imputation step this should only affect rows whose
# TARGET is missing - confirm.
df = df.dropna(axis=0, how="any")

In [None]:
# Accumulator for [train AUC, test AUC, params] entries filled in by the search cell.
result = []

# Feature matrix and label vector.
x = df.drop(["TARGET"], axis=1)
y = df["TARGET"]

random_state = 64
# Split the data into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=random_state)

# Count each class by its label value. Series.value_counts() orders its result
# by frequency (not by label), so unpacking it positionally would silently swap
# the counts whenever class 1 is the majority, and would fail outright unless
# exactly two classes are present.
count_class_0 = int((y_train == 0).sum())
count_class_1 = int((y_train == 1).sum())

# Ratio of negatives to positives, passed to XGBoost as scale_pos_weight.
scale_pos_weight = count_class_0 / count_class_1

In [None]:
# Search space sampled by RandomizedSearchCV.
param_grid = {
    'max_depth': range(1, 20),
    'n_estimators': range(10, 1000, 20),
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    # 0.0, 0.1, ..., 0.9 (loop variable renamed so it does not shadow the feature frame `x`)
    'gamma': [step / 10 for step in range(10)],
    'min_child_weight': [1, 5, 10, 15, 30, 100],
    'max_delta_step': range(10),
}

# Base estimator whose hyper-parameters are searched.
model = xgb.XGBClassifier(random_state=random_state,
                          scale_pos_weight=scale_pos_weight,
                          objective='binary:logistic')

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# Each iteration runs an independent randomized search seeded with `i`.
for i in range(1):
    grid_search = RandomizedSearchCV(estimator=model,
                                     param_distributions=param_grid,
                                     scoring='roc_auc',
                                     cv=cv,
                                     n_iter=1,
                                     random_state=i)
    grid_search.fit(x_train, y_train)
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Train a fresh model on the full training split with the best parameters found.
    best_model = xgb.XGBClassifier(random_state=random_state,
                                   scale_pos_weight=scale_pos_weight,
                                   objective='binary:logistic',
                                   **best_params)
    best_model.fit(x_train, y_train)

    # ROC AUC is a ranking metric and must be fed continuous scores. Passing the
    # hard 0/1 output of predict() collapses the curve to a single threshold and
    # misreports the AUC, so use the positive-class probability instead.
    train_auc = roc_auc_score(y_train, best_model.predict_proba(x_train)[:, 1])
    test_auc = roc_auc_score(y_test, best_model.predict_proba(x_test)[:, 1])

    result.append([train_auc, test_auc, best_params])
    print(i, test_auc)

# Show the entry with the highest test AUC.
max(result, key=lambda entry: entry[1])

In [None]:
# Hand-picked hyper-parameters for the final classifier.
final_params = dict(
    n_estimators=80,
    max_depth=4,
    learning_rate=0.0005,
    gamma=40,
    alpha=0.7,
)

# Rebinds `model` (previously the search's base estimator) to the final classifier.
model = xgb.XGBClassifier(
    random_state=random_state,
    scale_pos_weight=scale_pos_weight,
    **final_params,
)
model.fit(x_train, y_train)
print(1)

In [None]:
# Draw the ROC curve of `model` on the train and test splits side by side.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
title = ["train", "test"]
for i, data in enumerate([[x_train, y_train], [x_test, y_test]]):
    # Positive-class probability as a 1-D score vector.
    prediction, target_list = model.predict_proba(data[0])[:, 1], data[1]
    logit_roc_aut = roc_auc_score(target_list, prediction)
    fpr, tpr, thresholds = roc_curve(target_list, prediction)

    # The AUC goes into the legend label. Passing it as a third positional
    # argument to plot() makes matplotlib treat it as an extra data series.
    axs[i].plot(fpr, tpr, label=f"ROC (AUC = {logit_roc_aut:.4f})")
    axs[i].plot([0, 1], [0, 1], '--', label="chance")
    axs[i].set_title(f"{title[i]} - ({round(logit_roc_aut, 4)})")
    axs[i].set_xlabel("False positive rate")
    axs[i].set_ylabel("True positive rate")
    axs[i].legend(loc="lower right")
plt.show()

In [None]:
# Number of positive (label == 1) rows in the test split.
int((y_test == 1).sum())

In [None]:
# Load the scoring file and repeat the preprocessing used for training.
# NOTE(review): the categorical codes are derived from this file alone; they
# only match the training encoding if both files list the same values in the
# same order of first appearance - confirm.
df = pd.read_csv("../data/scoring_case.csv")

# Integer-encode object-typed columns.
object_columns = [column for column in df.columns if str(df[column].dtype) == 'object']
for column in object_columns:
    df[column] = pd.factorize(df[column])[0]

# Keep columns that are at least 80% populated, then drop the unused ones.
min_non_null = int(0.8 * df.shape[0])
df = (
    df.dropna(axis=1, thresh=min_non_null)
      .drop(["FLAG_MOBIL", "SK_ID_CURR"], axis=1)
)

# Fill missing feature values with the rounded column mode; TARGET is skipped.
for column in [name for name in df.columns if name != 'TARGET']:
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(round(most_frequent))

# Only the rows without a label are scored.
df = df[df["TARGET"].isna()]

x = df.drop(["TARGET"], axis=1)
y = df["TARGET"]

# Positive-class probability, kept as a single-column 2-D array.
prediction = model.predict_proba(x)[:, [1]]
prediction_df = pd.DataFrame(prediction, columns=["TARGET"])
# NOTE(review): the output holds the scores only, without a row identifier - confirm
# that consumers rely on row order.
prediction_df.to_csv("prediction.csv", index=False)