In [1]:
import joblib
import json

from IPython.display import display
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.tree import plot_tree

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
with open('../constants.json') as CONSTANTS_FILE:
    CONSTANTS = json.load(CONSTANTS_FILE)
    
DB_ENGINE = create_engine(f'postgresql+psycopg2://{CONSTANTS["DB_USER"]}:{CONSTANTS["DB_PASSWORD"]}@{CONSTANTS["DB_IP"]}:{CONSTANTS["DB_PORT"]}/{CONSTANTS["DB_NAME"]}', future=True)
CONN = DB_ENGINE.connect()

In [3]:
DATA = pd.read_sql(text('SELECT * FROM wines'), CONN)
Y_COLUMN_NAME = 'price'

In [4]:
DATA

Unnamed: 0,id,name,is_red,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,price
0,0,White Claret,0,7.0,0.17,0.74,12.80,0.045,24.0,126.0,0.99420,3.26,0.38,12.2,8,629.79
1,1,Red Boal or Bual,1,7.7,0.64,0.21,2.20,0.077,32.0,133.0,0.99560,3.27,0.45,9.9,5,665.47
2,2,White Fumé Blanc,0,6.8,0.39,0.34,7.40,0.020,38.0,133.0,0.99212,3.18,0.44,12.0,7,483.00
3,3,White Trebbiano,0,6.3,0.28,0.47,11.20,0.040,61.0,183.0,0.99592,3.12,0.51,9.5,6,440.72
4,4,White Pinot Blanc,0,7.4,0.35,0.20,13.90,0.054,63.0,229.0,0.99888,3.11,0.50,8.9,6,458.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6492,White Sherry,0,7.6,0.32,0.58,16.75,0.050,43.0,163.0,0.99990,3.15,0.54,9.2,5,394.45
6493,6493,White Sauterns,0,5.6,0.28,0.27,3.90,0.043,52.0,158.0,0.99202,3.35,0.44,10.7,7,480.52
6494,6494,White Tokay,0,6.4,0.37,0.20,5.60,0.117,61.0,183.0,0.99459,3.24,0.43,9.5,5,339.10
6495,6495,White Müller-Thurgau,0,6.5,0.26,0.50,8.00,0.051,46.0,197.0,0.99536,3.18,0.47,9.5,5,397.43


In [5]:
RAW_NUMERICAL_COLUMN_NAMES = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol']
RAW_CATEGORICAL_COLUMN_NAMES = ['is_red']

In [6]:
%%time
print("Correlacao de cada coluna numerica com Y\n")
to_use_numerical_columns = []

if True:
    TRANFORMER = Normalizer()
    NUM_COLUMNS = TRANFORMER.fit_transform(DATA[RAW_NUMERICAL_COLUMN_NAMES])
    CORR = pd.concat([pd.DataFrame(NUM_COLUMNS, columns=RAW_NUMERICAL_COLUMN_NAMES), DATA[Y_COLUMN_NAME]], axis=1).corr()[Y_COLUMN_NAME].abs().sort_values(ascending=False)
    print(CORR[CORR>0.05])
    to_use_numerical_columns = CORR[CORR>0.05].index.to_list()
    to_use_numerical_columns.remove(Y_COLUMN_NAME)
    print()
    print(f'Total = {len(to_use_numerical_columns)} colunas numericas')
    print()

Correlacao de cada coluna numerica com Y

price                   1.000000
sulphates               0.677846
alcohol                 0.630777
fixed_acidity           0.627671
ph                      0.622581
density                 0.619801
total_sulfur_dioxide    0.591939
chlorides               0.559533
volatile_acidity        0.536301
citric_acid             0.466180
free_sulfur_dioxide     0.339917
residual_sugar          0.184567
Name: price, dtype: float64

Total = 11 colunas numericas

CPU times: total: 15.6 ms
Wall time: 14 ms


In [7]:
%%time
print("Correlacao de cada coluna categorica com Y\n")
to_use_categorical_columns = []

if True:
    TRANFORMER = OneHotEncoder(drop='first', sparse=False)
    CAT_COLUMNS = TRANFORMER.fit_transform(DATA[RAW_CATEGORICAL_COLUMN_NAMES])
    CORR = pd.concat([pd.DataFrame(CAT_COLUMNS, columns=RAW_CATEGORICAL_COLUMN_NAMES), DATA[Y_COLUMN_NAME]], axis=1).corr()[Y_COLUMN_NAME].abs().sort_values(ascending=False)
    print(CORR[CORR>0.05])
    to_use_categorical_columns = CORR[CORR>0.05].index.to_list()
    to_use_categorical_columns.remove(Y_COLUMN_NAME)
    print()
    print(f'Total = {len(to_use_categorical_columns)} colunas categoricas')
    print()

Correlacao de cada coluna categorica com Y

price     1.000000
is_red    0.806658
Name: price, dtype: float64

Total = 1 colunas categoricas

CPU times: total: 15.6 ms
Wall time: 9.01 ms


In [8]:
to_use_numerical_column_numbers = []
for COLUMN_NAME in to_use_numerical_columns:
    to_use_numerical_column_numbers.append(DATA.columns.to_list().index(COLUMN_NAME))

In [9]:
to_use_categorical_column_numbers = []
for COLUMN_NAME in to_use_categorical_columns:
    to_use_categorical_column_numbers.append(DATA.columns.to_list().index(COLUMN_NAME))

In [10]:
PRE_PROCESSOR =  ColumnTransformer([('normalize', Normalizer(), to_use_numerical_column_numbers),
                                 ('pass', OneHotEncoder(), to_use_categorical_column_numbers)])

In [11]:
DATA_X = PRE_PROCESSOR.fit_transform(DATA.to_numpy())
DATA_Y = DATA['price'].to_numpy()

In [17]:
%%time
if True:
    HP_TUNNER = GridSearchCV(LinearRegression(), {'fit_intercept':[False, True], 'normalize':[False, True]}, n_jobs=-1)
    HP_TUNNER.fit(DATA_X, DATA_Y)
    
    print('---LinearRegression---')
    print(f'Best params = {HP_TUNNER.best_params_}')
    print()
    display(pd.DataFrame(HP_TUNNER.cv_results_).sort_values(by='rank_test_score', ignore_index=True).drop(columns=['params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']).head())
    print()

---LinearRegression---
Best params = {'fit_intercept': True, 'normalize': False}





Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_fit_intercept,param_normalize,mean_test_score,std_test_score,rank_test_score
0,0.011197,0.004962,0.001196,0.000403,True,False,0.727514,0.015702,1
1,0.007998,0.001265,0.0012,0.000402,False,False,0.727511,0.015711,2
2,0.011001,0.003348,0.001601,0.000797,False,True,0.727511,0.015711,2
3,0.011196,0.002135,0.0006,0.00049,True,True,0.727511,0.015711,2



CPU times: total: 62.5 ms
Wall time: 107 ms


In [25]:
%%time
if True:
    HP_TUNNER = GridSearchCV(DecisionTreeRegressor(random_state=RANDOM_SEED), {'max_depth':[4, 5, 6], 'criterion':['squared_error', 'friedman_mse', 'absolute_error'], 'splitter':['best']}, n_jobs=-1)
    HP_TUNNER.fit(DATA_X, DATA_Y)
    
    print('---DecisionTreeRegressor---')
    print(f'Best params = {HP_TUNNER.best_params_}')
    print()
    display(pd.DataFrame(HP_TUNNER.cv_results_).sort_values(by='rank_test_score', ignore_index=True).drop(columns=['params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']).head())
    print()

---DecisionTreeRegressor---
Best params = {'criterion': 'absolute_error', 'max_depth': 5, 'splitter': 'best'}



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,mean_test_score,std_test_score,rank_test_score
0,3.091803,0.157551,0.001402,0.0004907421,absolute_error,5,best,0.702476,0.014813,1
1,0.060597,0.004225,0.001002,6.743496e-07,friedman_mse,6,best,0.699496,0.013494,2
2,0.066199,0.009705,0.001201,0.000401407,squared_error,6,best,0.698897,0.013909,3
3,0.053204,0.005599,0.001194,0.000395548,squared_error,5,best,0.698697,0.014187,4
4,0.051395,0.002245,0.001401,0.001357366,friedman_mse,5,best,0.698697,0.014187,4



CPU times: total: 2.8 s
Wall time: 9.64 s


In [26]:
%%time
if True:
    HP_TUNNER = GridSearchCV(RandomForestRegressor(random_state=RANDOM_SEED), {'max_depth':[45, 50, 55], 'criterion' : ['squared_error'], 'n_estimators' : [200, 300]}, n_jobs=-1)
    HP_TUNNER.fit(DATA_X, DATA_Y)
    
    print('---RandomForestRegressor---')
    print(f'Best params = {HP_TUNNER.best_params_}')
    print()
    display(pd.DataFrame(HP_TUNNER.cv_results_).sort_values(by='rank_test_score', ignore_index=True).drop(columns=['params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']).head())
    print()

---RandomForestRegressor---
Best params = {'criterion': 'squared_error', 'max_depth': 45, 'n_estimators': 300}



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,mean_test_score,std_test_score,rank_test_score
0,28.403236,0.576931,0.208801,0.016724,squared_error,45,300,0.784392,0.010587,1
1,28.376033,0.838584,0.201202,0.019944,squared_error,50,300,0.784391,0.010585,2
2,24.295032,0.327797,0.160212,0.017821,squared_error,55,300,0.784391,0.010585,2
3,19.63342,0.287985,0.127,0.012648,squared_error,45,200,0.784083,0.010237,4
4,18.29802,0.314856,0.1288,0.011702,squared_error,50,200,0.78408,0.010235,5



CPU times: total: 20.4 s
Wall time: 1min 52s


In [27]:
%%time
if True:
    HP_TUNNER = GridSearchCV(MLPRegressor(random_state=RANDOM_SEED, max_iter=500), {'solver':['lbfgs'], 'alpha': 10.0 ** -np.arange(1, 7)}, n_jobs=-1)
    HP_TUNNER.fit(DATA_X, DATA_Y)
    
    print('---MLPRegressor---')
    print(f'Best params = {HP_TUNNER.best_params_}')
    print()
    display(pd.DataFrame(HP_TUNNER.cv_results_).sort_values(by='rank_test_score', ignore_index=True).drop(columns=['params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']).head())
    print()

---MLPRegressor---
Best params = {'alpha': 1e-05, 'solver': 'lbfgs'}



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_solver,mean_test_score,std_test_score,rank_test_score
0,16.864425,1.844705,0.004201,0.001471,1e-05,lbfgs,0.738651,0.014421,1
1,18.310026,0.735989,0.004801,0.000983,0.1,lbfgs,0.737149,0.014575,2
2,19.174032,1.055785,0.008199,0.006937,0.0001,lbfgs,0.737064,0.012999,3
3,18.020229,1.147559,0.006596,0.004271,0.001,lbfgs,0.736882,0.013051,4
4,14.157227,0.611223,0.003195,0.00075,1e-06,lbfgs,0.736043,0.014261,5



CPU times: total: 20.6 s
Wall time: 1min 20s


In [12]:
%%time
if True:
    PIPE = Pipeline([
    ('transform', ColumnTransformer([('normalize', Normalizer(), to_use_numerical_column_numbers),
                                     ('pass', OneHotEncoder(), to_use_categorical_column_numbers)])),
    ('estimator', RandomForestRegressor(criterion='squared_error', max_depth=45, n_estimators=300))
    ])
    PIPE.fit(DATA.to_numpy(), DATA[Y_COLUMN_NAME].to_numpy())
    joblib.dump(PIPE, '../models/price_model.joblib', compress=3)

CPU times: total: 22.1 s
Wall time: 22.2 s


In [21]:
CONN.close()
DB_ENGINE.dispose()