# Обучение модели
### [**scikit-learn**](https://scikit-learn.org/stable/auto_examples/index.html)
#### [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [586]:
import pandas as pd
import numpy as np

In [587]:
df_train = pd.read_csv('post_train.csv', low_memory=False)

In [588]:
from sklearn.model_selection import train_test_split

In [589]:
features = ['msamd','property_type','purchaser_type','hud_median_family_income','population','applicant_ethnicity','co_applicant_ethnicity','income','applicant_sex']


In [590]:
df_notna = df_train[[*features, 'target']].copy()
df_notna

Unnamed: 0,msamd,property_type,purchaser_type,hud_median_family_income,population,applicant_ethnicity,co_applicant_ethnicity,income,applicant_sex,target
0,37860,3,0,79500,2878,2,4,54.0,2,1
1,29460,3,71,67500,6071,2,4,168.0,1,1
2,15980,3,1,83200,1339,2,2,179.0,1,1
3,35840,3,2,90400,3650,2,2,68.0,1,1
4,19660,3,1,73500,6797,3,4,49.0,1,1
...,...,...,...,...,...,...,...,...,...,...
1107606,27260,3,0,86100,5019,2,2,218.0,2,0
1107607,45300,3,0,82100,7344,2,4,90.0,2,0
1107608,36740,3,0,80100,7565,2,2,117.0,1,0
1107609,48424,3,0,90300,5536,2,4,37.0,1,0


In [591]:
df_notna.income.min()

-6296.0

In [592]:
df_notna.income.replace(-np.inf, 0, inplace=True)

In [593]:
df_notna.count()

msamd                       1107611
property_type               1107611
purchaser_type              1107611
hud_median_family_income    1107611
population                  1107611
applicant_ethnicity         1107611
co_applicant_ethnicity      1107611
income                      1107611
applicant_sex               1107611
target                      1107611
dtype: int64

In [565]:
#income_median = df_notna.income.median()
#income_median

83.0

In [566]:
#df_notna['income'] = df_notna.income.fillna(income_median)
#df_notna.head(2)

Unnamed: 0,msamd,property_type,purchaser_type,hud_median_family_income,population,applicant_ethnicity,co_applicant_ethnicity,income,applicant_sex,target
0,37860,3,0,79500,2878,2,4,54.0,2,1
1,29460,3,71,67500,6071,2,4,168.0,1,1


In [578]:
#df_notna.count()

msamd                       1107611
property_type               1107611
purchaser_type              1107611
hud_median_family_income    1107611
population                  1107611
applicant_ethnicity         1107611
co_applicant_ethnicity      1107611
income                      1107611
applicant_sex               1107611
target                      1107611
dtype: int64

In [594]:
X = df_notna[features]
y = df_notna.target

In [595]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [596]:
print('Размер тренировочной выборки:', len(X_train))

# при частом определении кол-ва строк в датафрейме эффективнее вычислять длину от индекса
print('Размер тестовой выборки:', len(X_test.index))

Размер тренировочной выборки: 775327
Размер тестовой выборки: 332284


В этом файле реализовано 3 модели которые мы пытались использовать в нашем коде, самый лучший скоре показала TreeClassifier. Каждую из моделей можно запустить и посмотреть какая кривая ROG AUC получается в каждом случае

#### [TreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [597]:
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

In [598]:
model = DecisionTreeClassifier(max_depth=11).fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
y_pred[:10]


array([1.95648957e-01, 2.50078430e-01, 1.48148148e-04, 1.27659574e-01,
       1.71681675e-01, 9.36525612e-01, 4.27561837e-01, 9.27000880e-01,
       8.63591996e-01, 9.41578149e-01])

In [599]:
feature_importances_entropy = model.feature_importances_

# Выведите важность каждого признака
for i, feature in enumerate(features):
    print(f'Importance of {feature} based on entropy: {feature_importances_entropy[i]}')

Importance of msamd based on entropy: 0.001871949628748222
Importance of property_type based on entropy: 0.008007097986441433
Importance of purchaser_type based on entropy: 0.6100021587444228
Importance of hud_median_family_income based on entropy: 0.005904653968584888
Importance of population based on entropy: 0.006349548149198496
Importance of applicant_ethnicity based on entropy: 0.00786294613775917
Importance of co_applicant_ethnicity based on entropy: 0.005812546562560835
Importance of income based on entropy: 0.044410673403294375
Importance of applicant_sex based on entropy: 0.30977842541898976


In [452]:
from sklearn.ensemble import GradientBoostingClassifier



In [453]:
model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.05,validation_fraction = 0.01, max_depth=4)
model.fit(X_train, y_train)

# Предскажите классы для тестового набора
y_pred = model.predict(X_test)
y_pred[:10]

array([1, 1, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

#### [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [458]:
from sklearn.linear_model import LogisticRegression

In [459]:
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
y_pred[:10]

array([0.50422237, 0.4895337 , 0.42118946, 0.40005299, 0.99998954,
       0.40787522, 0.42367083, 0.41468785, 0.43848288, 0.4117548 ])

### [ROC AUC](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)

In [600]:
from sklearn.metrics import auc, roc_auc_score, roc_curve
import plotly.express as px

In [601]:
fpr, tpr, _ = roc_curve(y_test, y_pred)
# False Positive Rate - доля ложно положительных ответов
# True Positive Rate - доля истинно положительных ответов

# закрашенная область

fig = px.area(
    x=fpr,  # значения оси x

    y=tpr,  # значения оси y
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',  # название графика
    labels=dict(x='False Positive Rate', y='True Positive Rate'),  # названия осей
    width=700,  # ширина
    height=500,  # высота
)

# пунктирная линия
fig.add_shape(
    type='line',  # тип объекта
    line=dict(dash='dash'),  # характеристики линии
    x0=0, x1=1, y0=0, y1=1,  # координаты - откуда и куда рисовать
)

# https://plotly.com/python/axes/#fixed-ratio-axes
fig.update_yaxes(scaleanchor='x', scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.show()

## Прогноз

In [429]:
df_test = pd.read_csv('test.csv', index_col=0)
df_test.head(2)

Unnamed: 0,applicant_ethnicity,income,applicant_race_1,applicant_race_2,applicant_race_3,applicant_race_4,applicant_race_5,applicant_sex,co_applicant_ethnicity,co_applicant_race_1,...,msamd,preapproval,property_type,purchaser_type,hud_median_family_income,loan_amount,number_of_1_to_4_family_units,number_of_owner_occupied_units,minority_population,population
0,2,90.0,5.0,,,,,2,4,8.0,...,33124,2,3,0,68300,55000.0,889,866,94.42,3282
1,2,77.0,5.0,,,,,2,2,5.0,...,39460,2,3,0,76000,15000.0,2428,1828,19.71,6031


In [430]:
df_test['income'] = np.log(df_test.income)
df_test['income'] = df_test.income.fillna(income_median)
df_test.income.replace(-np.inf, 0, inplace=True)
df_test.income.head(2)


divide by zero encountered in log


invalid value encountered in log



0    4.499810
1    4.343805
Name: income, dtype: float64

In [431]:
df_test[features].count()

msamd                       276903
property_type               276903
purchaser_type              276903
hud_median_family_income    276903
population                  276903
applicant_ethnicity         276903
co_applicant_ethnicity      276903
income                      276903
applicant_sex               276903
dtype: int64

In [432]:
df_test.shape

(276903, 27)

In [433]:
df_test['target'] = model.predict_proba(df_test[features])[:, 1]
df_test.target.head(2)

0    0.131433
1    0.049123
Name: target, dtype: float64

In [434]:
df_test.target.to_csv('submission.csv')

In [435]:
!zip submission.zip submission.csv

"zip" �� ���� ����७��� ��� ���譥�
��������, �ᯮ��塞�� �ணࠬ��� ��� ������ 䠩���.
