In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, ARDRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, roc_auc_score, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from xgboost import XGBRegressor

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Raw train / test tables, read from Excel workbooks located next to the notebook.
TRAIN_PATH, TEST_PATH = 'train.xlsx', 'test.xlsx'
df_train = pd.read_excel(TRAIN_PATH)
df_test = pd.read_excel(TEST_PATH)

## Preprocess

In [None]:
# Columns removed from both splits before any numeric preprocessing.
# NOTE(review): the reason for excluding exactly these columns is not visible here
# (presumably they are mostly empty) — confirm.
cols_to_drop = [
    'Choline_Tot_ (mg)', 'Alpha_Carot_(µg)', 'Beta_Carot_(µg)',
    'Beta_Crypt_(µg)', 'Lycopene_(µg)', 'Lut+Zea_ (µg)', 'Vit_E_(mg)',
    'Vit_D_µg', 'Vit_D_IU', 'Vit_K_(µg)', 'GmWt_2', 'GmWt_Desc2',
]

# The same list is applied to test and train so both frames keep matching columns.
df_test_drp = df_test.drop(columns=cols_to_drop)
df_train_drp = df_train.drop(columns=cols_to_drop)

In [None]:
def _fill_with_column_mean(column):
    """Return ``column`` with its missing values replaced by that column's own mean."""
    return column.fillna(column.mean())


# Keep only numeric columns, then impute every column independently.
# NOTE(review): each split is imputed with its *own* column means (the test frame
# does not reuse the training means) — confirm this is intended.
train_numeric = df_train_drp.select_dtypes(include=['number'])
test_numeric = df_test_drp.select_dtypes(include=['number'])
df_train_fixed = train_numeric.apply(_fill_with_column_mean)
df_test_fixed = test_numeric.apply(_fill_with_column_mean)

In [None]:
# Alternative imputation: numeric columns only, every missing value replaced by 0.
test_numeric_0 = df_test_drp.select_dtypes(include=['number'])
train_numeric_0 = df_train_drp.select_dtypes(include=['number'])
df_test_fixed_0 = test_numeric_0.fillna(value=0)
df_train_fixed_0 = train_numeric_0.fillna(value=0)

## Дополнительная задача №2 (Регрессия)
Построить модель машинного обучения для предсказания калорийности продукта (Kcal), используя только:
* белки
* жиры
* углеводы
* названия продукта (признаки на основе поля `Shrt_Desc`)

In [None]:
# Columns allowed by the task: three macronutrients plus the product description.
macro_cols = ['Protein_(g)', 'Lipid_Tot_(g)', 'Carbohydrt_(g)']
task2_inputs = macro_cols + ['Shrt_Desc']

# The training frame additionally carries the regression target, Energ_Kcal.
df_train_2 = df_train[task2_inputs + ['Energ_Kcal']]
df_test_2 = df_test[task2_inputs]

In [None]:
# Training design matrix: only the three macronutrient columns remain after the drop.
# NOTE(review): the task statement lists name-based features (Shrt_Desc) as allowed
# inputs, but the description column is discarded here and no text features are
# built — confirm this is intended.
X = df_train_2.drop(columns=['Energ_Kcal', 'Shrt_Desc'])
y = df_train_2['Energ_Kcal']  # regression target

# This line reassigns df_test_2 from itself, so a plain drop would raise KeyError
# the second time the cell is executed. errors='ignore' keeps the cell re-runnable
# while producing the same frame on the first run.
df_test_2 = df_test_2.drop(columns=['Shrt_Desc'], errors='ignore')

In [None]:
# Final model: fit on the full training set, predict the test set, export to CSV.
final_params = {
    'n_estimators': 900,
    'min_child_weight': 1,
    'max_depth': 15,
    'learning_rate': 0.1,
    'booster': 'gbtree',
    'base_score': 1,
    'n_jobs': -1,
}
xgb_clf = XGBRegressor(**final_params)
xgb_clf.fit(X, y)

y_pred = xgb_clf.predict(df_test_2)
# One column named Pred_kcal; the default integer index is written to the file as well.
predictions = pd.Series(y_pred).to_frame(name='Pred_kcal')
predictions.to_csv('Pred_extra_2.csv')

## Experiments 2

In [None]:
# 90/10 hold-out split used by the experiment cells that follow.
# A fixed random_state makes the split — and every metric computed from it —
# identical across kernel restarts; 42 matches the seed used for the randomized search.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Baseline: ordinary least squares on the macronutrient features.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises TypeError on current releases. It is dropped here; for
# unregularised least squares, feature scaling does not alter the predictions.
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)

# Score on the held-out rows (not on the rows the model was fitted on) so the MAE
# is directly comparable with the XGBoost hold-out MAE computed in the next cells.
y_pred = lr_clf.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
# Gradient-boosted trees evaluated on the hold-out split.
holdout_params = dict(
    n_estimators=900,
    min_child_weight=1,
    max_depth=15,
    learning_rate=0.1,
    booster='gbtree',
    base_score=1,
    n_jobs=-1,
)
xgb_clf = XGBRegressor(**holdout_params)

# fit() returns the fitted estimator, so fitting and predicting can be chained.
y_pred = xgb_clf.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Refit the existing xgb_clf on the current split and report its hold-out MAE again.
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
holdout_mae = mean_absolute_error(y_test, y_pred)
holdout_mae

In [None]:
# Repeated hold-out: average the MAE over several independent 90/10 splits to get
# a less noisy estimate than a single split provides.
N_REPEATS = 10

xgb_clf = XGBRegressor(n_estimators=900, min_child_weight=1, max_depth=15, learning_rate=0.1,
                       booster='gbtree', base_score=1, n_jobs=-1)

split_maes = []
for seed in range(N_REPEATS):
    # A distinct but fixed seed per repetition: the splits differ from each other,
    # yet the whole experiment is reproducible after a kernel restart.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)
    xgb_clf.fit(X_train, y_train)
    y_pred = xgb_clf.predict(X_test)
    m = mean_absolute_error(y_test, y_pred)
    split_maes.append(m)
    print(m)
    print()

# `s` keeps its meaning (sum of the per-split MAEs); the mean divides by the number
# of collected scores instead of a second hard-coded copy of the repeat count.
s = sum(split_maes)
print('s=', s / len(split_maes))

In [None]:
# Candidate values for each XGBoost hyperparameter explored by the search.
n_estimators, max_depth = [100, 500, 900, 1100, 1500], [2, 3, 5, 10, 15]
booster = ['gbtree', 'gblinear']
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]
base_score = [0.25, 0.5, 0.75, 1]

# Search space: estimator parameter name -> list of values to sample from.
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
    base_score=base_score,
)

In [None]:
# Randomized hyperparameter search over hyperparameter_grid:
# 50 sampled configurations, 5-fold cross-validation, scored by negated MAE.
xgb_clf = XGBRegressor()
search_options = {
    'cv': 5,
    'n_iter': 50,
    'scoring': 'neg_mean_absolute_error',
    'n_jobs': -1,
    'verbose': 5,
    'return_train_score': True,
    'random_state': 42,  # fixes which configurations get sampled
}
random_cv = RandomizedSearchCV(xgb_clf, hyperparameter_grid, **search_options)

In [None]:
# Run the randomized search on the full training data (50 sampled configurations,
# each cross-validated on 5 folds) — expect a long-running cell.
random_cv.fit(X, y)

In [None]:
# Display the sampled configuration with the best mean cross-validated score
# (scoring is negated MAE, so "best" means the lowest MAE).
random_cv.best_params_