# Решение соревнования

## Импорт библиотек

In [67]:
import pandas as pd
import numpy as np

from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

## Загрузка данных

In [6]:
path2file = '../datasets/train_preprocessed.csv'
df = pd.read_csv(path2file, index_col='Unnamed: 0')

In [21]:
df = df.drop(labels=['Product_ID'], axis=1)

In [22]:
df

Unnamed: 0,Gender,Age,City_Category,Stay_In_Current_City_Years,Marital_Status,Purchase,Occupation_0,Occupation_1,Occupation_2,Occupation_3,...,Category_11,Category_12,Category_13,Category_14,Category_15,Category_16,Category_17,Category_18,Category_19,Category_20
0,1,1,1,1,0,8370,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,15200,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,1422,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,1,1,1,0,1057,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,7969,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550063,0,1,1,1,1,368,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
550064,1,1,0,1,0,371,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
550065,1,1,1,0,1,137,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
550066,1,0,0,1,0,365,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


## Разбиение данных

In [23]:
df = df.sample(frac=0.7).reset_index(drop=True)

In [48]:
X = df.iloc[:, list(range(0, 5)) + list(range(6, 47))]
y = df.iloc[:, 5]

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 

## Первый вариант – ElasticNet

In [68]:
params = {
    'l1_ratio': [0.4, 0.5, 0.6, 0.8, 0.9],
    'max_iter': [800, 900, 1000, 1300]
}

en = ElasticNet()

en_cv = GridSearchCV(en, params, verbose=True)

In [69]:
%%time
en_cv.fit(X_train, y_train)
en_cv.best_params_
y_pred = en_cv.predict(X_test)
print('Root Mean Square Error =', np.sqrt(mean_squared_error(y_test, y_pred)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Root Mean Square Error = 3935.3078199735373
Wall time: 35 s


## Второй вариант – DecisionTreeRegressor

In [70]:
tree = DecisionTreeRegressor()
params = {
    'min_samples_split': [1, 2, 3, 4],
    'max_depth': [5, 25, 50]
}

In [73]:
%%time
tree_cv = GridSearchCV(tree, params, verbose=True)
tree_cv.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Wall time: 1min 6s


GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [5, 25, 50],
                         'min_samples_split': [1, 2, 3, 4]},
             verbose=True)

In [74]:
tree_cv.best_params_
y_pred = tree_cv.predict(X_test)
print('Root Mean Square Error =', np.sqrt(mean_squared_error(y_test, y_pred)))

Root Mean Square Error = 3011.1036085049964


**Вывод**: по метрике RMSE лучше оказалась модель DecisionTreeRegressor

## Сохранение ответа

In [75]:
test_df = pd.read_csv('../datasets/test_preprocessed.csv', index_col='Unnamed: 0')
sample_sub = pd.read_csv('../datasets/sample_submission.csv')

In [76]:
test_df = test_df.drop(labels=['Product_ID'], axis=1)

In [77]:
predictions = tree_cv.predict(test_df)

In [78]:
sample_sub['Purchase'] = predictions

In [79]:
path2file = '../datasets/submission.csv'
sample_sub.to_csv(path2file)