In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor,VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor

In [None]:
url_train = 'https://github.com/palaciosleo/diplodatos-famaf/raw/master/mentoria/models/psp_train_{k}.pkl'
url_eval = 'https://github.com/palaciosleo/diplodatos-famaf/raw/master/mentoria/models/psp_eval.pkl'

In [None]:
model_scores = dict()

#regressor = DecisionTreeRegressor(max_depth=7, max_features='auto',min_samples_split=5,min_samples_leaf=5)
#regressor = XGBRegressor(learning_rate=0.1, max_depth=7)
regressor = KNeighborsRegressor(n_neighbors=3, weights='distance')
for k in range(0, 15):
    model = pd.read_pickle(url_train.format(k=k), compression='zip')
    model.drop(columns=['um_cc', 'um_gr', 'um_kg', 'um_lt', 'um_ml', 'um_mt', 'um_pack', 'um_un'], inplace=True)
    model.drop(columns=['fecha_20200412', 'fecha_20200419', 'fecha_20200426', 'fecha_20200502', 'fecha_20200518'], inplace=True)
    model.drop(columns=['suctipo_autoservicio', 'suctipo_hipermercado', 'suctipo_minorista', 'suctipo_supermercado'], inplace=True)

    y = model['precio_relativo']
    X = model.drop(columns=['precio_relativo'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    regressor.fit(X_train, y_train)
    
    x_pred = regressor.predict(X_train)
    y_pred = regressor.predict(X_test)
    
    train_score = round(r2_score(y_train, x_pred), 3)
    test_score = round(r2_score(y_test, y_pred), 3)
    
    print(str(k), ' - train:', train_score, ' - test:', test_score)

In [None]:
eval_model = pd.read_pickle(url_eval, compression='zip')
eval_model.drop(columns=['um_cc', 'um_gr', 'um_kg', 'um_lt', 'um_ml', 'um_mt', 'um_pack', 'um_un'], inplace=True)
eval_model.drop(columns=['fecha_20200412', 'fecha_20200419', 'fecha_20200426', 'fecha_20200502', 'fecha_20200518'], inplace=True)
eval_model.drop(columns=['suctipo_autoservicio', 'suctipo_hipermercado', 'suctipo_minorista', 'suctipo_supermercado'], inplace=True)

y_eval = eval_model['precio_relativo']
X_eval = eval_model.drop(columns=['precio_relativo'])

y_pred = regressor.predict(X_eval)
print('test:', round(r2_score(y_eval, y_pred), 3))

In [9]:
url = '../models/precio_sucursal_producto_400.pkl'
dataset = pd.read_pickle(url, compression='zip')

In [None]:
def get_dataset_ready(dataset, muestras=10000):
    _ds_shuff = shuffle(dataset)
    X_full = _ds_shuff.iloc[:,:-1]
    y_full = _ds_shuff.iloc[:,-1]
    del _ds_shuff
    X_full['precio_producto_mean'] = MinMaxScaler().fit_transform(X_full[['precio_producto_mean']])
    y_full['precio_producto_mean'] = MinMaxScaler().fit_transform(y_full[['precio_producto_mean']])
    X = X_full.iloc[:muestras,:].to_numpy()
    y = y_full.iloc[:muestras].to_numpy()

    del X_full
    del y_full 

    return train_test_split(X, y, test_size=0.20, random_state=42)

In [10]:
def get_dataset_ready_2(dataset, muestras=10000):
    _ds_shuff = shuffle(dataset)
    X_full = _ds_shuff.iloc[:,:-1]
    y_full = _ds_shuff.iloc[:,-1]
    del _ds_shuff
    
    X = X_full.iloc[:muestras,:]#.to_numpy()
    y = y_full.iloc[:muestras]#.to_numpy()

    del X_full
    del y_full 

    return train_test_split(X, y, test_size=0.20, random_state=42)


In [11]:
def get_scores(labels, predictions):
    mae = mean_absolute_error(labels, predictions)
    rmse = np.sqrt(mean_squared_error(labels, predictions))
    r2 = r2_score(labels, predictions)
    
    return {'mae':mae, 'rmse':rmse, 'r2':r2}

In [None]:
# Seleccion de Columnas
## DROP UM
#dataset.drop(columns=['um_cc'], inplace=True) # Dummy trap!
dataset.drop(columns=['um_cc', 'um_gr', 'um_kg', 'um_lt', 'um_ml', 'um_mt', 'um_pack', 'um_un'], inplace=True)


## DROP FECHA
#dataset.drop(columns=['fecha_20200412'], inplace=True) # Dummy trap!
dataset.drop(columns=['fecha_20200412', 'fecha_20200419', 'fecha_20200426', 'fecha_20200502', 'fecha_20200518'], inplace=True)


## DROP SUCURSAL TIPO
#dataset.drop(columns=['suctipo_autoservicio'], inplace=True) # Dummy trap!
dataset.drop(columns=['suctipo_autoservicio', 'suctipo_hipermercado','suctipo_minorista', 'suctipo_supermercado'], inplace=True)


## DROP BANDERA DESCRIPCION
#dataset.drop(columns=['banddesc_axion_energy', 'banddesc_changomas', 'banddesc_cooperativa_obrera_limitada_de_consumo_y_vivienda', 'banddesc_coto_cicsa', 'banddesc_deheza_saicf_e_i', 'banddesc_disco',
#       'banddesc_express', 'banddesc_hipermercado_carrefour', 'banddesc_la_anonima', 'banddesc_market', 'banddesc_otras_bandDesc', 'banddesc_simplicity', 'banddesc_supermercados_cordiez',
#       'banddesc_supermercados_dia', 'banddesc_vea', 'banddesc_walmart_supercenter'], inplace=True)

In [12]:
X_train, X_test, y_train, y_test = get_dataset_ready_2(dataset, 1500000)
del dataset

In [13]:
X_train.drop(columns=['um_cc', 'um_gr', 'um_kg', 'um_lt', 'um_ml', 'um_mt', 'um_pack', 'um_un'], inplace=True)
X_train.drop(columns=['fecha_20200412', 'fecha_20200419', 'fecha_20200426', 'fecha_20200502', 'fecha_20200518'], inplace=True)
X_train.drop(columns=['suctipo_autoservicio', 'suctipo_hipermercado','suctipo_minorista', 'suctipo_supermercado'], inplace=True)

X_test.drop(columns=['um_cc', 'um_gr', 'um_kg', 'um_lt', 'um_ml', 'um_mt', 'um_pack', 'um_un'], inplace=True)
X_test.drop(columns=['fecha_20200412', 'fecha_20200419', 'fecha_20200426', 'fecha_20200502', 'fecha_20200518'], inplace=True)
X_test.drop(columns=['suctipo_autoservicio', 'suctipo_hipermercado','suctipo_minorista', 'suctipo_supermercado'], inplace=True)

In [14]:
X_train['precio_producto_mean'] = MinMaxScaler().fit_transform(X_train[['precio_producto_mean']])

In [15]:
X_test['precio_producto_mean'] = MinMaxScaler().fit_transform(X_test[['precio_producto_mean']])

In [21]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [22]:
y_train_pred = lr.predict(X_train)
y_pred = lr.predict(X_test)

print(get_scores(y_train, y_train_pred))
print(get_scores(y_test, y_pred))


{'mae': 3.890351685921863, 'rmse': 14.773176944426236, 'r2': 0.20119784551421804}
{'mae': 3.8911945763137634, 'rmse': 13.66521170226754, 'r2': 0.2286650880223392}


In [16]:
from sklearn.linear_model import SGDRegressor

In [17]:
sgd = SGDRegressor()
sgd.fit(X_train, y_train)

SGDRegressor()

In [18]:
sgd.n_iter_

146

In [19]:
sgd.t_

175200001.0

In [20]:
y_train_pred = sgd.predict(X_train)
y_pred = sgd.predict(X_test)

print(get_scores(y_train, y_train_pred))
print(get_scores(y_test, y_pred))

{'mae': 3.8664559849242237, 'rmse': 14.834219183269473, 'r2': 0.19458296358831595}
{'mae': 3.86918480021104, 'rmse': 13.728301435376174, 'r2': 0.2215264282609981}


In [23]:
from sklearn.linear_model import Ridge
ridge = Ridge()

ridge.fit(X_train, y_train)

Ridge()

In [24]:
y_train_pred = ridge.predict(X_train)
y_pred = ridge.predict(X_test)

print(get_scores(y_train, y_train_pred))
print(get_scores(y_test, y_pred))

{'mae': 3.8885707186705813, 'rmse': 14.773192494317383, 'r2': 0.20119616391341755}
{'mae': 3.8893517211816904, 'rmse': 13.664968929315009, 'r2': 0.22869249448989337}


In [25]:
from sklearn.linear_model import Lasso
lasso = Lasso()

lasso.fit(X_train, y_train)

Lasso()

In [26]:
y_train_pred = lasso.predict(X_train)
y_pred = lasso.predict(X_test)

print(get_scores(y_train, y_train_pred))
print(get_scores(y_test, y_pred))

{'mae': 4.489235450165141, 'rmse': 16.529293293732223, 'r2': 0.0}
{'mae': 4.493877840133395, 'rmse': 15.559472588105974, 'r2': -9.246133636509057e-09}
