In [31]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV

pd.set_option("max_columns", 500)
pd.set_option('display.float_format', lambda x: "{:,.4f}".format(x))

In [32]:
df = pd.read_csv("../data/winequality-red.csv", sep=";")

In [33]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.3196,0.5278,0.271,2.5388,0.0875,15.8749,46.4678,0.9967,3.3111,0.6581,10.423,5.636
std,1.7411,0.1791,0.1948,1.4099,0.0471,10.4602,32.8953,0.0019,0.1544,0.1695,1.0657,0.8076
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.9901,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.9968,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.9978,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.0037,4.01,2.0,14.9,8.0


In [34]:
X = df[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 
        'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',]]
y = df[["quality"]]

### Escalamiento

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [36]:
mms_x = MinMaxScaler()
mms_y = MinMaxScaler()
Xs = mms_x.fit_transform(X_train)
ys = mms_y.fit_transform(y_train)

In [37]:
pd.DataFrame(ys)

Unnamed: 0,0
0,0.6000
1,0.6000
2,0.8000
3,0.4000
4,0.6000
...,...
1194,0.4000
1195,0.6000
1196,1.0000
1197,0.6000


## Modelado 

### Regresión

#### Sin escalamiento

#### Con escalamiento

In [38]:
sgdr = SGDRegressor()
sgdr.fit(X_train, y_train)
ls_medias = cross_val_score(estimator=sgdr, X=X_train, y=y_train, cv = 4, n_jobs=-1, scoring="r2")
np.mean(ls_medias), np.std(ls_medias)

  return f(**kwargs)


(-7.78973210842277e+24, 1.2400196547363334e+25)

In [39]:
sgdr = SGDRegressor()
sgdr.fit(Xs, ys)
ls_medias = cross_val_score(estimator=sgdr, X=Xs, y=ys, cv = 4, n_jobs=-1, scoring="r2")
np.mean(ls_medias), np.std(ls_medias)


  return f(**kwargs)


(0.15491112525953346, 0.034876122310143705)

In [40]:
SGDRegressor?

In [11]:
param_grid = {"loss": ["squared_loss", "huber"],
              "penalty": ["l2", "l1", "elasticnet"],
              "alpha": [x/100 for x in range(100)],
              "epsilon": [x/10 for x in range(10)],
              "learning_rate": ["constant", "invscaling", "adaptive"],
             }

In [41]:
n_hyper = np.prod(list(map(len, param_grid.values())))

In [42]:
n_hyper

60000

In [14]:
model = SGDRegressor(alpha=0.01)
hyper = GridSearchCV(estimator=model, n_jobs=-1, scoring="r2", param_grid=param_grid, error_score=-1000, verbose=5)
hyper.fit(Xs, ys)
hyper.best_score_

Fitting 5 folds for each of 18000 candidates, totalling 90000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 4208 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 12272 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 22640 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 35312 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 50288 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 67568 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 87152 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 89985 out of 90000 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 90000 out of 90000 | elapsed:  1.3min finished
  return f(**kwargs)


0.30613457345939493

In [15]:
hyper.best_estimator_

SGDRegressor(alpha=0.0, epsilon=0.3, learning_rate='adaptive')

In [16]:
mms_y.inverse_transform([hyper.best_estimator_.intercept_])

array([[4.83896736]])

#### Hyperparametrización

##### Sin escalamiento

##### Con escalamiento

In [17]:
%%time
model = SGDRegressor()
hyper = RandomizedSearchCV(estimator=model, n_jobs=-1, scoring="r2", param_distributions=param_grid, error_score=-1000, verbose=1, n_iter=int(n_hyper*.25), random_state=777)
hyper.fit(X_train, y_train)
hyper.best_score_

Fitting 5 folds for each of 4500 candidates, totalling 22500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 3200 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 6000 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 9600 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 14000 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done 19200 tasks      | elapsed:   53.3s


CPU times: user 32.1 s, sys: 420 ms, total: 32.5 s
Wall time: 1min 3s


[Parallel(n_jobs=-1)]: Done 22500 out of 22500 | elapsed:  1.1min finished
  return f(**kwargs)


0.29311536550460254

In [18]:
%%time
model = SGDRegressor()
hyper = RandomizedSearchCV(estimator=model, n_jobs=-1, scoring="r2", param_distributions=param_grid, error_score=-1000, verbose=1, n_iter=int(n_hyper*.25), random_state=777)
hyper.fit(Xs, ys)
hyper.best_score_

Fitting 5 folds for each of 4500 candidates, totalling 22500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 3824 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 11824 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 22328 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 22500 out of 22500 | elapsed:   23.5s finished


CPU times: user 20.8 s, sys: 200 ms, total: 21 s
Wall time: 23.7 s


  return f(**kwargs)


0.3060466499533462

In [19]:
X_train.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [20]:
hyper.best_estimator_.coef_

array([ 0.11462112, -0.18014553,  0.05665043, -0.01131382, -0.08015929,
        0.02781335, -0.10575609, -0.03543216,  0.07956842,  0.19808386,
        0.38841165])

### Clasificación 

In [43]:
y_train_c = (y_train >= y_train.mean())*1

In [44]:
y_train_c

Unnamed: 0,quality
1530,1
1528,1
366,1
1453,0
1510,1
...,...
417,0
696,1
1403,1
915,1


In [45]:
sgdc = SGDClassifier()
sgdc.fit(X_train, y_train_c)
ls_medias = cross_val_score(estimator=sgdc, X=X_train, y = y_train_c, cv = 4, n_jobs=-1, scoring="roc_auc")
np.mean(ls_medias), np.std(ls_medias)

  return f(**kwargs)


(0.7249725793610968, 0.04721818607003906)

In [46]:
SGDClassifier?

In [47]:
param_grid = {"loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
              "penalty": ["l2", "l1", "elasticnet"],
              "alpha": [x/100+0.01 for x in range(100)],
              "l1_ratio": [x/10 for x in range(10)],
              "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
              "eta0": [0.001]
             }

In [48]:
n_hyper = np.prod(list(map(len, param_grid.values())))

In [49]:
n_hyper

60000

#### Hyperparametrización

##### Sin escalamiento

##### Con escalamiento

In [28]:
%%time
model = SGDClassifier()
hyper = RandomizedSearchCV(estimator=model, n_jobs=-1, scoring="roc_auc", param_distributions=param_grid, error_score=-1000, verbose=1, n_iter=int(n_hyper*.1), random_state=777, cv = 4)
hyper.fit(X_train, y_train_c)
hyper.best_score_

Fitting 4 folds for each of 6000 candidates, totalling 24000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 3200 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 6000 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 9600 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 14000 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done 19200 tasks      | elapsed:  1.0min


CPU times: user 37.1 s, sys: 460 ms, total: 37.5 s
Wall time: 1min 18s


[Parallel(n_jobs=-1)]: Done 24000 out of 24000 | elapsed:  1.3min finished
  return f(**kwargs)


0.7964244456446724

In [29]:
%%time
model = SGDClassifier()
hyper = RandomizedSearchCV(estimator=model, n_jobs=-1, scoring="roc_auc", param_distributions=param_grid, error_score=-1000, verbose=1, n_iter=int(n_hyper*.1), random_state=777, cv=4)
hyper.fit(Xs, y_train_c.values.ravel())
hyper.best_score_

Fitting 4 folds for each of 6000 candidates, totalling 24000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 3824 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 11824 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 23024 tasks      | elapsed:   27.2s


CPU times: user 22.3 s, sys: 156 ms, total: 22.4 s
Wall time: 28.5 s


[Parallel(n_jobs=-1)]: Done 24000 out of 24000 | elapsed:   28.2s finished


0.8099734088365678

In [30]:
hyper.best_estimator_.coef_

array([[ 2.03269870e-04, -4.42109362e-04, -2.73914303e-04,
        -4.03829674e-05, -6.91858681e-05,  1.74164104e-05,
        -4.11463910e-04,  4.29279048e-05,  1.88599587e-04,
         4.01651554e-04,  5.00144777e-04]])