In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# Read the training data (it carries the `target` column used later on);
# the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [8]:
# Replace missing entries of column '9' with the constant 5; other columns are untouched.
# NOTE(review): whether 5 is an existing category or a dedicated "missing" bucket
# is not visible from this notebook — confirm against the data description.
data = data.fillna(value={'9': 5})

In [9]:
# Feature columns are labelled '0'..'8'; the imputation below groups rows by
# the values 0..5 of column '9'.
cols = list(map(str, range(9)))
col_values = list(range(6))

# Fill the gaps of every feature with the median of the rows that share the
# same value of column '9'. Column '9' itself is never modified here, so the
# group mask can be computed once per group.
for col_value in col_values:
    in_group = data['9'] == col_value
    for col in cols:
        group_values = data.loc[in_group, col]
        data.loc[in_group, col] = group_values.fillna(group_values.median())

In [10]:
# Load the test set and give column '9' the same constant fill as the training frame.
tdf = pd.read_csv('test.csv')
tdf['9'] = tdf['9'].fillna(5)

# Impute test features with per-group medians taken from the training frame
# instead of from the test rows themselves: preprocessing statistics should be
# learned on training data only, so both frames are filled with the same values.
# It also means a group whose column is entirely missing in the test file still
# receives a value.
train_group_medians = data.groupby('9')[cols].median()
for col_value in col_values:
    in_group = tdf['9'] == col_value
    if not in_group.any():
        # No test rows in this group: nothing to fill.
        continue
    seen_in_train = col_value in train_group_medians.index
    for col in cols:
        group_values = tdf.loc[in_group, col]
        fill_value = train_group_medians.loc[col_value, col] if seen_in_train else float('nan')
        if pd.isna(fill_value):
            # Group (or column within it) unavailable in training data:
            # fall back to the median of the test rows of that group.
            fill_value = group_values.median()
        tdf.loc[in_group, col] = group_values.fillna(fill_value)

In [13]:
# Separate the label column from the model inputs.
target = data['target']
features = data.drop(columns='target')

In [14]:
# Treat column '9' as categorical: stringify its values, then one-hot encode
# it, dropping the first level as the baseline.
# Not idempotent: after this cell column '9' no longer exists in `features`.
features = pd.get_dummies(
    features.assign(**{'9': features['9'].apply(str)}),
    columns=['9'],
    drop_first=True,
)

In [21]:
# One-hot encode column '9' of the test frame so that it ends up with exactly
# the columns (and column order) of the already-encoded training `features`.
#
# Encoding the two frames independently with drop_first=True is fragile:
#   * the two CSVs are parsed separately, so column '9' may be int in one and
#     float in the other, giving dummy names like '9_5' vs '9_5.0';
#   * a category absent from the test file changes which level is treated as
#     "first" and dropped, and removes columns the model was trained on.
# Casting to the training dtype fixes the naming; encoding all levels and
# reindexing to the training layout fixes the rest.
tdf['9'] = tdf['9'].astype(data['9'].dtype).apply(lambda x: str(x))
tdf = pd.get_dummies(tdf, columns=['9'])

# Only dummy columns may legitimately be absent from the test frame; a missing
# raw feature means the frames do not match and must not be zero-filled silently.
missing_features = [
    name for name in features.columns
    if name not in tdf.columns and not str(name).startswith('9_')
]
if missing_features:
    raise KeyError(f'test frame lacks training columns: {missing_features}')

# Keep the training layout: the baseline level dropped during training is
# dropped here as well, and dummy columns unseen in the test file become all-zero.
tdf = tdf.reindex(columns=features.columns, fill_value=0)

In [16]:
# Fixed seed shared by the split and by every model below.
random_state = 12345

# Hold out 25% of the rows for validation.
split_parts = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=random_state,
)
features_train, features_valid, target_train, target_valid = split_parts

In [17]:
# Search tree depths 1..49 and keep the depth with the highest macro-F1 on the
# validation split. Ties keep the earliest (shallowest) depth.
best_depth, best_f1 = 0, 0
for depth in range(1, 50):
    model = DecisionTreeClassifier(
        max_depth=depth,
        class_weight='balanced',
        random_state=random_state,
    )
    predictions_valid = model.fit(features_train, target_train).predict(features_valid)
    f1 = f1_score(target_valid, predictions_valid, average='macro')
    if not f1 > best_f1:
        continue
    best_depth, best_f1 = depth, f1
print(f'Лучшее дерево с глубиной {best_depth} : F1 = {best_f1}')

Лучшее дерево с глубиной 10 : F1 = 0.32096235163181713


In [18]:
# Search forest sizes 1, 4, 7, ... 397 at the tree depth selected above and
# keep the size with the highest macro-F1 on the validation split.
# Ties keep the earliest (smallest) forest.
best_f1, best_nestim = 0, 0
for n_estim in range(1, 400, 3):
    # Progress trace: candidate size, then the best score and size so far.
    print(n_estim, best_f1, best_nestim)
    model = RandomForestClassifier(
        n_estimators=n_estim,
        max_depth=best_depth,
        class_weight='balanced',
        random_state=random_state,
    )
    predictions_valid = model.fit(features_train, target_train).predict(features_valid)
    f1 = f1_score(target_valid, predictions_valid, average='macro')
    if not f1 > best_f1:
        continue
    best_f1, best_nestim, best_model = f1, n_estim, model
print(f'Лучший лес с n_estimators {best_nestim} : F1 = {best_f1}')

1 0 0
4 0.28075637297939254 1
7 0.34162036013101255 4
10 0.35472789198609644 7
13 0.36527160352185206 10
16 0.38007305616334713 13
19 0.3810692508898589 16
22 0.3810692508898589 16
25 0.3810692508898589 16
28 0.3810692508898589 16
31 0.38139242993875866 28
34 0.3823997117647349 31
37 0.3823997117647349 31
40 0.3832273177092536 37
43 0.38376297908337226 40
46 0.38448145002116985 43
49 0.38816853956103853 46
52 0.38816853956103853 46
55 0.38816853956103853 46
58 0.38858954308402854 55
61 0.38858954308402854 55
64 0.3895372889103768 61
67 0.3895372889103768 61
70 0.390278365961816 67
73 0.39222944798502846 70
76 0.39320488354604327 73
79 0.39320488354604327 73
82 0.39320488354604327 73
85 0.3939930330336835 82
88 0.39429886129822467 85
91 0.39429886129822467 85
94 0.39429886129822467 85
97 0.39429886129822467 85
100 0.3957883603714582 97
103 0.3957883603714582 97
106 0.3957883603714582 97
109 0.3957883603714582 97
112 0.3957883603714582 97
115 0.3957883603714582 97
118 0.39747531012040344

In [19]:
# Refit a forest with the selected depth and size on the full training data
# (train + validation rows) for the final predictions.
final_forest_params = {
    'class_weight': 'balanced',
    'max_depth': best_depth,
    'random_state': random_state,
    'n_estimators': best_nestim,
}
for_model = RandomForestClassifier(**final_forest_params)
# Left as the trailing expression so the cell displays the fitted estimator.
for_model.fit(features, target)

RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=247,
                       random_state=12345)

In [22]:
# Build the submission from the sample file, which supplies the expected
# columns and row order.
sub = pd.read_csv('sample-submission.csv')
predictions_test = for_model.predict(tdf)

# Assign the raw prediction array rather than a pd.Series: a Series is aligned
# on index and would hide a row-count mismatch as NaN or truncation, whereas
# the array assignment raises if the lengths differ.
sub['target'] = predictions_test

# index=False: the frame was read without an index column, so writing the
# RangeIndex would add a spurious unnamed first column to the submission.
sub.to_csv('sub.csv', index=False)