In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor,VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [3]:
url_train = 'https://github.com/palaciosleo/diplodatos-famaf/raw/master/mentoria/models/psp_train_{k}.pkl'
url_eval = 'https://github.com/palaciosleo/diplodatos-famaf/raw/master/mentoria/models/psp_eval.pkl'

In [4]:
model_scores = dict()

regressor = DecisionTreeRegressor()

for k in range(0, 15):
    model = pd.read_pickle(url_train.format(k=k), compression='zip')
    model.drop(columns=['um_cc', 'um_gr', 'um_kg', 'um_lt', 'um_ml', 'um_mt', 'um_pack', 'um_un'], inplace=True)
    model.drop(columns=['fecha_20200412', 'fecha_20200419', 'fecha_20200426', 'fecha_20200502', 'fecha_20200518'], inplace=True)
    model.drop(columns=['suctipo_autoservicio', 'suctipo_hipermercado', 'suctipo_minorista', 'suctipo_supermercado'], inplace=True)

    y = model['precio_relativo']
    X = model.drop(columns=['precio_relativo'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    regressor.fit(X_train, y_train)
    
    x_pred = regressor.predict(X_train)
    y_pred = regressor.predict(X_test)
    
    train_score = round(r2_score(y_train, x_pred), 3)
    test_score = round(r2_score(y_test, y_pred), 3)
    
    print(str(k), ' - train:', train_score, ' - test:', test_score)

0  - train: 0.764  - test: 0.127
1  - train: 0.725  - test: -0.094
2  - train: 0.774  - test: -0.45
3  - train: 0.761  - test: 0.017
4  - train: 0.802  - test: 0.146
5  - train: 0.715  - test: 0.041
6  - train: 0.474  - test: 0.255
7  - train: 0.731  - test: -0.189
8  - train: 0.842  - test: 0.215
9  - train: 0.783  - test: 0.028
10  - train: 0.786  - test: -1.489
11  - train: 0.846  - test: 0.221
12  - train: 0.78  - test: -0.381
13  - train: 0.71  - test: -0.057
14  - train: 0.812  - test: 0.173


In [5]:
eval_model = pd.read_pickle(url_eval, compression='zip')
eval_model.drop(columns=['um_cc', 'um_gr', 'um_kg', 'um_lt', 'um_ml', 'um_mt', 'um_pack', 'um_un'], inplace=True)
eval_model.drop(columns=['fecha_20200412', 'fecha_20200419', 'fecha_20200426', 'fecha_20200502', 'fecha_20200518'], inplace=True)
eval_model.drop(columns=['suctipo_autoservicio', 'suctipo_hipermercado', 'suctipo_minorista', 'suctipo_supermercado'], inplace=True)

y_eval = eval_model['precio_relativo']
X_eval = eval_model.drop(columns=['precio_relativo'])

y_pred = regressor.predict(X_eval)
print('test:', round(r2_score(y_eval, y_pred), 3))

test: -0.069
