# Обучение модели
### [**scikit-learn**](https://scikit-learn.org/stable/auto_examples/index.html)
#### [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training table from the local CSV file.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
df_train = pd.read_csv('post_train.csv', low_memory=False)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Columns used as model inputs (the same list is reused for the test set).
features = [
    'msamd',
    'property_type',
    'purchaser_type',
    'hud_median_family_income',
    'population',
    'applicant_ethnicity',
    'co_applicant_ethnicity',
    'income',
]
# Alternative kept for reference: use every column except the target.
# features = df_train.columns.drop('target')

In [None]:
# Work on an independent copy restricted to the selected features plus the
# target, so later in-place edits never touch df_train.
df_notna = df_train.loc[:, [*features, 'target']].copy()
df_notna

In [None]:
# Smallest income value present in the working frame.
df_notna['income'].min()

In [None]:
# Non-missing value count per column (before the income gaps are filled).
df_notna.count()

In [None]:
# Median income of the training frame; reused later to fill gaps in both the
# training and the test data.
income_median = df_notna['income'].median()
income_median

In [None]:
# Replace missing incomes with the median computed above.
df_notna['income'] = df_notna['income'].fillna(value=income_median)
df_notna.head(n=2)

In [None]:
# Non-missing value count per column, re-checked after the income fill.
df_notna.count()

In [None]:
# Split the working frame into the feature matrix and the target vector.
X = df_notna.loc[:, features]
y = df_notna['target']

In [None]:
# Hold out 40% of the rows for evaluation.
# A fixed random_state makes the shuffle (and therefore every metric computed
# below) reproducible between notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=42,
)

In [None]:
# Report how many rows ended up in each part of the split.
print('Размер тренировочной выборки:', X_train.shape[0])

# When the row count is queried often, taking the length of the index is the
# cheaper way to get it.
print('Размер тестовой выборки:', len(X_test.index))

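#### [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)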

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

In [None]:
# Depth-limited decision tree trained on the training split.
model = DecisionTreeClassifier(max_depth=11)
model.fit(X_train, y_train)

# Keep only column 1 of predict_proba: the probability of the positive class.
y_pred = model.predict_proba(X_test)[:, 1]
y_pred[:10]


In [None]:
# Impurity-based importance of every feature, taken from the fitted tree model.
# The variable name is kept unchanged in case other cells refer to it.
feature_importances_entropy = model.feature_importances_

# Label the output with the split criterion the model was actually built with:
# the tree above is created without `criterion=...`, so it is not entropy-based
# and a hard-coded "entropy" label would be misleading.
criterion = getattr(model, 'criterion', 'impurity')

# Print the importance of each feature.
for feature, importance in zip(features, feature_importances_entropy):
    print(f'Importance of {feature} based on {criterion}: {importance}')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier



In [None]:
# Gradient boosting over shallow trees with a small learning rate.
# NOTE(review): per the scikit-learn docs, `validation_fraction` only takes
# effect when `n_iter_no_change` is set; it is kept so the hyper-parameters stay
# exactly as they were.
model = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.05,
    validation_fraction=0.01,
    max_depth=4,
)
model.fit(X_train, y_train)

# Positive-class probabilities (column 1) for the test split, in line with the
# other model cells: the ROC cell below needs ranked scores, and hard class
# labels from `predict` would collapse the curve to a single operating point.
y_pred = model.predict_proba(X_test)[:, 1]
y_pred[:10]

#### [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, trained on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

# Probability of the positive class for every test row.
y_pred = model.predict_proba(X_test)[:, 1]
y_pred[:10]

### [ROC AUC](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)

In [None]:
from sklearn.metrics import auc, roc_auc_score, roc_curve
import plotly.express as px

In [None]:
# ROC points for the current predictions:
#   fpr - False Positive Rate, the share of false positive answers
#   tpr - True Positive Rate, the share of true positive answers
fpr, tpr, _ = roc_curve(y_test, y_pred)

# Filled area under the ROC curve; the AUC value goes into the title.
fig = px.area(
    x=fpr,
    y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},  # axis names
    width=700,
    height=500,
)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
fig.add_shape(
    type='line',
    line={'dash': 'dash'},
    x0=0, y0=0,
    x1=1, y1=1,
)

# Fixed 1:1 axis ratio, see https://plotly.com/python/axes/#fixed-ratio-axes
fig.update_yaxes(scaleanchor='x', scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.show()

## Прогноз

In [None]:
# Load the test table; the first CSV column becomes the DataFrame index.
df_test = pd.read_csv('test.csv', index_col=0)
df_test.head(2)

In [None]:
# Prepare the test-set income column in three steps:
#   1. take the natural logarithm,
#   2. fill missing values with the training median,
#   3. map -inf (the logarithm of zero) to 0.
# NOTE(review): filling with `income_median` after the log presumes that
# `income` in the training file is already log-scaled - confirm.
#
# The result is assigned back explicitly. The previous
# `df_test.income.replace(..., inplace=True)` was a chained in-place call on a
# column accessor, which pandas warns about and which leaves `df_test`
# untouched when Copy-on-Write is enabled.
df_test['income'] = (
    np.log(df_test['income'])
    .fillna(income_median)
    .replace(-np.inf, 0)
)
df_test['income'].head(2)

In [None]:
# Non-missing value count for each model feature in the test set.
df_test[features].count()

In [None]:
# (rows, columns) of the test table, to compare against the counts above.
df_test.shape

In [None]:
# Score the test rows with the current model and store the positive-class
# probability (column 1 of predict_proba) as the `target` column.
df_test['target'] = model.predict_proba(df_test.loc[:, features])[:, 1]
df_test['target'].head(n=2)

In [None]:
# Write the predicted column, together with the index, to the submission file.
df_test['target'].to_csv('submission.csv')

In [None]:
# Pack the submission with the standard library instead of the `zip` shell
# command, so the cell also works where no `zip` binary is installed and when
# the code runs outside IPython.
# Mode 'w' always creates a fresh archive (the shell command would update an
# existing one); either way the archive ends up holding submission.csv.
import zipfile

with zipfile.ZipFile('submission.zip', 'w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write('submission.csv')