# Обучение модели
### [**scikit-learn**](https://scikit-learn.org/stable/auto_examples/index.html)
#### [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [55]:
import zipfile

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from feature import Features

In [56]:
# Fixed random seed; passed as random_state to train_test_split so the split is reproducible.
SEED = 42

In [57]:
# Load the training set. low_memory=False makes pandas parse the file in one
# pass rather than in chunks, which avoids chunk-wise mixed-dtype inference.
# NOTE(review): the loaded frame shows 'Unnamed: 0.1' and 'Unnamed: 0' columns —
# presumably row indices written out by earlier to_csv calls; confirm and
# consider dropping them at load time.
df_train = pd.read_csv('train.csv', low_memory=False)

In [58]:
# Preview the loaded frame (pandas abbreviates the repr of a frame this large).
df_train

Unnamed: 0.1,Unnamed: 0,applicant_ethnicity,income,applicant_race_1,applicant_race_2,applicant_race_3,applicant_race_4,applicant_race_5,applicant_sex,co_applicant_ethnicity,...,preapproval,property_type,purchaser_type,hud_median_family_income,loan_amount,number_of_1_to_4_family_units,number_of_owner_occupied_units,minority_population,population,target
0,0,2,54.0,5.0,,,,,2,4,...,2,3,0,79500,205000.0,1309,568,45.83,2878,True
1,1,2,168.0,5.0,,,,,1,4,...,2,3,71,67500,215000.0,2919,1296,76.18,6071,True
2,2,2,179.0,5.0,,,,,1,2,...,2,3,1,83200,505000.0,826,594,7.54,1339,True
3,3,2,68.0,5.0,,,,,1,2,...,2,3,2,90400,235000.0,1519,920,22.16,3650,True
4,4,3,49.0,5.0,,,,,1,4,...,2,3,1,73500,245000.0,2201,1736,27.89,6797,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1107606,1107606,2,218.0,5.0,,,,,2,2,...,2,3,0,86100,255000.0,1670,1335,36.88,5019,False
1107607,1107607,2,90.0,5.0,,,,,2,4,...,2,3,0,82100,335000.0,2573,1570,57.53,7344,False
1107608,1107608,2,117.0,5.0,,,,,1,2,...,2,3,0,80100,225000.0,3310,496,59.35,7565,False
1107609,1107609,2,37.0,5.0,,,,,1,4,...,2,3,0,90300,215000.0,922,358,81.23,5536,False


In [59]:
# Columns used as model inputs; the names come from the `.value` of the
# corresponding members of the project's Features definition.
features = [
    Features.APPLICANT_ETHNICITY.value,
    Features.PURCHASER_TYPE.value,
]

In [60]:
# Work on an independent copy that holds only the model inputs and the label.
df = df_train[features + ['target']].copy()
df.head(2)

Unnamed: 0,applicant_ethnicity,purchaser_type,target
0,2,0,True
1,2,71,True


In [61]:
# Separate the working frame into the feature matrix and the label vector.
X, y = df[features], df['target']

In [62]:
# Keep 80% of the rows for training and hold out the rest for evaluation;
# SEED fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=SEED,
)

In [63]:
# When row counts are needed often, taking the length of the index is the
# cheaper way to get the number of rows of a DataFrame.
print('Размер тренировочной выборки:', len(X_train.index))
print('Размер тестовой выборки:', len(X_test.index))

Размер тренировочной выборки: 886088
Размер тестовой выборки: 221523


#### [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [64]:
from sklearn.linear_model import LogisticRegression

In [65]:
# Fit a logistic regression with default settings on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

# Keep only column 1 of predict_proba: the probability of the second entry of
# model.classes_ (the True class for this boolean target).
y_pred = model.predict_proba(X_test)[:, 1]
y_pred[:10]

array([0.56044798, 0.42439854, 0.42439854, 0.42439854, 0.56044798,
       0.42439854, 0.1900346 , 0.99999682, 0.42439854, 0.42439854])

### [ROC AUC](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)

In [66]:
from sklearn.metrics import auc, roc_auc_score, roc_curve
import plotly.express as px

In [67]:
# ROC points over the decision thresholds (the thresholds themselves are unused):
#   fpr - False Positive Rate, the share of false-positive answers
#   tpr - True Positive Rate, the share of true-positive answers
fpr, tpr, _ = roc_curve(y_test, y_pred)

# Filled area under the ROC curve; the AUC value is reported in the title.
fig = px.area(
    x=fpr,
    y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},
    width=700,
    height=500,
)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
fig.add_shape(type='line', line={'dash': 'dash'}, x0=0, y0=0, x1=1, y1=1)

# Fixed 1:1 axis ratio, see https://plotly.com/python/axes/#fixed-ratio-axes
fig.update_xaxes(constrain='domain')
fig.update_yaxes(scaleanchor='x', scaleratio=1)

fig.show()

## Прогноз

In [69]:
# Load the set to be scored; index_col=0 uses the first CSV column as the row index.
df_test = pd.read_csv('test.csv', index_col=0)
df_test.head(2)

Unnamed: 0,applicant_ethnicity,income,applicant_race_1,applicant_race_2,applicant_race_3,applicant_race_4,applicant_race_5,applicant_sex,co_applicant_ethnicity,co_applicant_race_1,...,msamd,preapproval,property_type,purchaser_type,hud_median_family_income,loan_amount,number_of_1_to_4_family_units,number_of_owner_occupied_units,minority_population,population
0,2,90.0,5.0,,,,,2,4,8.0,...,33124,2,3,0,68300,55000.0,889,866,94.42,3282
1,2,77.0,5.0,,,,,2,2,5.0,...,39460,2,3,0,76000,15000.0,2428,1828,19.71,6031


In [70]:
# Non-null counts of the model features. Comparing them with the row count of
# df_test shows whether these columns contain missing values before scoring.
df_test[features].count()

applicant_ethnicity    276903
purchaser_type         276903
dtype: int64

In [71]:
# (rows, columns) of the scoring set, taken before the prediction column is added.
df_test.shape

(276903, 27)

In [73]:
# Score the test set and store column 1 of predict_proba as the 'target' column.
test_proba = model.predict_proba(df_test[features])
df_test['target'] = test_proba[:, 1]
df_test['target'].head(2)

0    0.424399
1    0.424399
Name: target, dtype: float64

In [None]:
# Write the predicted probabilities, together with the frame's index, to the submission file.
df_test['target'].to_csv('submission.csv')

In [None]:
# Pack the submission with the standard library instead of shelling out to `zip`:
# the external binary is not available on every system, and it appends to an
# existing archive rather than rebuilding it. Mode 'w' recreates submission.zip
# on every run, so the archive always contains exactly the current submission.csv.
with zipfile.ZipFile('submission.zip', 'w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write('submission.csv')