In [None]:
!pip install sklearn_relief

In [None]:
!apt install -y build-essential swig curl
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
!pip install auto-sklearn

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn_relief as sr
import os

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import power_transform
from scipy.stats import skew, skewtest
from scipy.stats import kurtosis, kurtosistest
from autosklearn.regression import AutoSklearnRegressor
from sklearn.model_selection import train_test_split
from pandas.plotting import autocorrelation_plot

from autosklearn.metrics import mean_absolute_error as auto_mean_absolute_error
from easymetrics import r2_all
from easymetrics import mae_all
from easymetrics import rmsle_all

In [None]:
def plot_r2_mae_rmsle(data):
    """Draw a grouped bar chart of R2, MAE and RMSLE for three result series.

    Parameters
    ----------
    data : sequence of three sequences
        ``data[0]``, ``data[1]`` and ``data[2]`` each hold three values in the
        order (R2, MAE, RMSLE). They are drawn as the series labelled
        'Train', 'Test' and 'Valid' respectively.

    Returns
    -------
    tuple
        ``(fig, ax)`` — the created figure and the axes holding the bars.
    """
    bar_width = 0.25
    series = (('Train', 'b'), ('Test', 'r'), ('Valid', 'g'))

    X = np.arange(3)
    fig = plt.figure(figsize=(12, 6))
    ax = fig.add_axes([0, 0, 1, 1])

    # One bar per series inside each metric group, shifted by one bar width each.
    for position, (label, color) in enumerate(series):
        ax.bar(X + position * bar_width, data[position],
               color=color, width=bar_width, label=label)

    ax.set_ylabel("Values")
    ax.set_title("Metrics")
    # Bars are centred at X, X + 0.25 and X + 0.50, so the middle of each group is
    # X + bar_width; placing the tick there centres the metric name under its group.
    ax.set_xticks(X + bar_width)
    ax.set_xticklabels(('R2', 'MAE', 'RMSLE'))
    # Labels come from the bars themselves, so legend entries cannot drift out of order.
    ax.legend()
    return fig, ax

In [None]:
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Read the processed survey CSV and keep only the columns used for modelling.
nutrition_csv = '../input/nutrition-research-in-schools/banco-TODOS-processado-ajustado.csv'
model_columns = [
    'school_id', 'class_year', 'shift_id', 'class',
    'age_months', 'sex_id', 'weight', 'height', 'bmi', 'bazw',
    'income', 'sportsclassfrequency', 'sportsclass acceptance',
    'meals_done', 'schoolfoodacceptance',
    'dayseatingatschool',
]

dfNutrition = pd.read_csv(nutrition_csv, sep=',')
dfModel = dfNutrition[model_columns]

# Preview the first ten rows.
dfModel.head(10)

In [None]:
# Show column dtypes and non-null counts for the selected columns.
dfModel.info()

In [None]:
# Show descriptive statistics of the selected columns.
dfModel.describe()

In [None]:
# Count how many rows each school contributes and print the resulting table.
school_counts = dfModel.groupby(['school_id']).size().reset_index(name='counts')
print(school_counts)

In [None]:
# Keep only the rows whose school_id equals 8.
is_school_8 = dfModel["school_id"] == 8
dfModel = dfModel.loc[is_school_8]

In [None]:
# Replace missing values with the mean of their column.
# numeric_only=True restricts the means to numeric columns: on pandas >= 2.0 a plain
# mean() raises TypeError as soon as one selected column is non-numeric, while older
# versions silently skipped such columns. For an all-numeric frame the result is unchanged.
columns_means = dfModel.mean(numeric_only=True)
dfModel = dfModel.fillna(columns_means)

# Remaining missing values per column (a column with no values at all has no mean
# and therefore stays entirely NaN).
dfModel.isnull().sum()

In [None]:
# Discard every column in which all values are missing, then re-check the summary.
# Removing a column changes the column positions of the frame.
dfModel = dfModel.dropna(axis=1, how='all')
dfModel.describe()

In [None]:
# Pairwise correlation of the modelling columns, drawn as an annotated heatmap.
corrMatrix = dfModel.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(data=corrMatrix, ax=ax, annot=True)
plt.show()

In [None]:
# Yeo-Johnson power transform of every column, followed by min-max scaling.
# The transformed array gets its own name (dfModelPT) so that `dfModel` stays the
# cleaned DataFrame: re-running this cell no longer applies the power transform a
# second time, and earlier cells that call DataFrame methods on `dfModel` keep working.
# NOTE(review): both transforms are fitted on the full data (features and target,
# before any train/test split) — confirm this is acceptable for the evaluation.
scaler = MinMaxScaler()

dfModelPT = power_transform(dfModel, method='yeo-johnson')
dfModel0 = scaler.fit_transform(dfModelPT)

# Positional split: the first 15 columns are the features, column 15 is the target.
# This relies on all 16 selected columns still being present at this point.
X = dfModel0[:,0:15]
y = dfModel0[:,15]

# Distribution of the scaled features (left) and of the scaled target (right).
fig, ax=plt.subplots(1,2,figsize=(30, 6))
sns.histplot(X, ax=ax[0], legend = True)
sns.histplot(y, ax=ax[1], legend = True)

# Mean skewness / kurtosis over the power-transformed (not yet min-max scaled) columns.
print(f"Skewness: {round(np.mean(skew(dfModelPT, axis = 0)),2)}")
print(f"Kurtosis: {round(np.mean(kurtosis(dfModelPT, axis = 0)),2)}")

In [None]:
automodel = AutoSklearnRegressor(time_left_for_this_task=5*60, per_run_time_limit=30, n_jobs=8, metric=auto_mean_absolute_error)
r2_valid_best = 0
nof_best = 0
for n in range(1,15):
    r = sr.Relief(n_features = n) 
    Xt = r.fit_transform(X,y)
    X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.33, random_state=1)
    automodel = AutoSklearnRegressor(time_left_for_this_task=5*60, per_run_time_limit=30, n_jobs=8)
    %time automodel.fit(X_train, y_train)
    y_pred = automodel.predict(X_test)
    print(f"Feature: {n}")
    r2_train, r2_test, r2_valid = r2_all(automodel, X_train, y_train, X_test, y_test, y_pred)
    if r2_valid > r2_valid_best:
        r2_valid_best = r2_valid
        automodelm0 = automodel
        nof_best = n
        X_best = Xt
        
print(f"Best Number of Features: {nof_best}")

In [None]:
# Split the best feature subset again (random_state=40 here, whereas the feature-count
# search used random_state=1) and fit the selected estimator on the new training part.
X_train, X_test, y_train, y_test = train_test_split(X_best, y, test_size=0.33, random_state=40)
# perform the search
%time automodelm0.fit(X_train, y_train)
# Print auto-sklearn's textual run statistics, then predict on the held-out part.
print(automodelm0.sprint_statistics())
y_pred = automodelm0.predict(X_test)

In [None]:
# Score the refitted model with the three easymetrics helpers; each returns a
# (train, test, valid) triple for its metric.
metric_args = (automodelm0, X_train, y_train, X_test, y_test, y_pred)
r2_train, r2_test, r2_valid = r2_all(*metric_args)
mae_train, mae_test, mae_valid = mae_all(*metric_args)
rmsle_train, rmsle_test, rmsle_valid = rmsle_all(*metric_args)

# Regroup per series: one row for train, one for test, one for valid, each holding
# (R2, MAE, RMSLE) in that order, as expected by plot_r2_mae_rmsle.
data = [
    list(series)
    for series in zip(
        (r2_train, r2_test, r2_valid),
        (mae_train, mae_test, mae_valid),
        (rmsle_train, rmsle_test, rmsle_valid),
    )
]

fig, ax = plot_r2_mae_rmsle(data)

In [None]:
# Put the held-out targets next to the model's predictions for a side-by-side look.
dfModelResults = pd.DataFrame().assign(Test=y_test, Prediction=y_pred)

# Preview the first ten rows.
dfModelResults.head(10)