In [47]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pickle

In [39]:
train = pd.read_csv('../data/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5188 entries, 0 to 5187
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         5188 non-null   float64
 1   volatile acidity      5188 non-null   float64
 2   citric acid           5188 non-null   float64
 3   residual sugar        5188 non-null   float64
 4   chlorides             5188 non-null   float64
 5   free sulfur dioxide   5188 non-null   float64
 6   total sulfur dioxide  5188 non-null   float64
 7   density               5188 non-null   float64
 8   pH                    5188 non-null   float64
 9   sulphates             5188 non-null   float64
 10  alcohol               5188 non-null   float64
 11  quality               5188 non-null   int64  
 12  type_num              5188 non-null   int64  
 13  alc-sug               5188 non-null   float64
 14  qua-sug               5188 non-null   float64
 15  aci-type             

In [40]:
X = train.drop(columns=['quality'])  
y = train['quality']  

oversampler = RandomOverSampler(random_state=42)

X_resampled, y_resampled = oversampler.fit_resample(X, y)

df_balanced = pd.concat([X_resampled, y_resampled], axis=1)

In [41]:
X=df_balanced.drop(columns=['quality'])
y=df_balanced['quality']

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

# 1. Decision Tree Regressor

### Realizamos el GridSearch con DecisionTreeRegressor para buscar nuestros mejores parametros

In [43]:
dt = DecisionTreeRegressor(random_state=42)

param = {
    "max_depth": [3, 4, 5],
    "min_samples_split": [3, 4, 6],
    "min_samples_leaf": [2, 4, 6],
    "criterion": ["squared_error", "absolute_error"]
}

dt_gs = GridSearchCV(dt, param, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

dt_gs.fit(X_train, y_train)

best_model = dt_gs.best_estimator_

In [44]:
best_model

In [45]:
y_pred = best_model.predict(X_test)

In [46]:
mae = mean_absolute_error(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Error (MAE): 0.6736976669862575
Mean Absolute Percentage Error (MAPE): 13.510934575172351%
Mean Squared Error (MSE): 1.2304250559284116
Root Mean Squared Error (RMSE): 1.1092452641000599


### Guardamos nuestro modelo en un .pkt

In [48]:
with open('../models/trained_model_reg_1DTR.pkl', 'wb') as archivo_salida :
    pickle.dump(best_model, archivo_salida)

# 2. Gradient Boosting Regressor

### Generamos el Pipeline con StandardScaler, GradientBoostingRegressor y sus parametros de busqueda

In [49]:
from sklearn.ensemble import GradientBoostingRegressor
pipe2 = Pipeline([
    ('scaler', StandardScaler()),  
    ('gb', GradientBoostingRegressor())  
])

param2 = {
    'gb__n_estimators': [50, 100, 200],
    'gb__learning_rate': [ 0.2, 0.5, 1],
    'gb__max_depth': [3, 4, 5,6]
}

gb_gs = GridSearchCV(pipe2, param2, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)

gb_gs.fit(X_train, y_train)

best_model2 = gb_gs.best_estimator_

In [50]:
best_model2

In [51]:
y_pred2 = best_model2.predict(X_test)

In [52]:
mae = mean_absolute_error(y_test, y_pred2)
mape = np.mean(np.abs((y_test - y_pred2) / y_test)) * 100
mse = mean_squared_error(y_test, y_pred2)
rmse = np.sqrt(mse)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Error (MAE): 0.1332615908345914
Mean Absolute Percentage Error (MAPE): 2.346389360199186%
Mean Squared Error (MSE): 0.1267921426218284
Root Mean Squared Error (RMSE): 0.35607884326624684


### Guardamos nuestro modelo en un .pkt

In [53]:
with open('../models/trained_model_reg_2GBR.pkl', 'wb') as archivo_salida :
    pickle.dump(best_model2, archivo_salida)

# 3. Random Forest Regressor

### Generamos el Pipeline con StandardScaler, RandomForestRegressor y sus parametros de busqueda

In [54]:
from sklearn.ensemble import RandomForestRegressor
pipe3 = Pipeline([
    ('scaler', StandardScaler()),  
    ('rf', RandomForestRegressor())  
])

param3 = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [ 5, 10, 15],
    'rf__min_samples_split': [2, 5, 8],
    'rf__min_samples_leaf': [1, 2]
}

rf_gs = GridSearchCV(pipe3, param3, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

rf_gs.fit(X_train, y_train)
best_model3 = rf_gs.best_estimator_

In [55]:
best_model3

In [56]:
y_pred3 = best_model3.predict(X_test)

In [57]:
mae = mean_absolute_error(y_test, y_pred3)
mape = np.mean(np.abs((y_test - y_pred3) / y_test)) * 100
mse = mean_squared_error(y_test, y_pred3)
rmse = np.sqrt(mse)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Error (MAE): 0.1507450072649242
Mean Absolute Percentage Error (MAPE): 2.6522345928028135%
Mean Squared Error (MSE): 0.09365463752010811
Root Mean Squared Error (RMSE): 0.3060304519489982


### Guardamos nuestro modelo en un .pkt

In [65]:
with open('../models/trained_model_reg_3RFR.pkl', 'wb') as archivo_salida :
    pickle.dump(best_model3, archivo_salida)

# 4. Lasso

### Generamos el Pipeline con StandardScaler, Lasso y sus parametros de busqueda

In [59]:
from sklearn.linear_model import Lasso
pipe4 = Pipeline([
    ('scaler', StandardScaler()),  
    ('lasso', Lasso())  
])

param4 = {
    'lasso__alpha': [0.01, 0.1, 1.0, 10.0]
}

lasso_gs = GridSearchCV(pipe4, param4, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

lasso_gs.fit(X_train, y_train)

best_model4 = lasso_gs.best_estimator_

In [60]:
best_model4

In [61]:
y_pred4 = best_model4.predict(X_test)

In [62]:
mae = mean_absolute_error(y_test, y_pred4)
mape = np.mean(np.abs((y_test - y_pred4) / y_test)) * 100
mse = mean_squared_error(y_test, y_pred4)
rmse = np.sqrt(mse)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Error (MAE): 1.018178965122772
Mean Absolute Percentage Error (MAPE): 20.29344340423987%
Mean Squared Error (MSE): 1.6123115186279986
Root Mean Squared Error (RMSE): 1.269768293283463


### Guardamos nuestro modelo en un .pkt

In [63]:
with open('../models/trained_model_reg_4LASSO.pkl', 'wb') as archivo_salida :
    pickle.dump(best_model4, archivo_salida)

# 5. SVR

### Generamos el Pipeline con StandardScaler, SVR y sus parametros de busqueda

In [64]:
from sklearn.svm import SVR
pipe5 = Pipeline([
    ('scaler', StandardScaler()),  
    ('svr', SVR())  
])

param5 = {
    'svr__kernel': ['linear', 'rbf'],
    'svr__C': [0.1, 1, 10],
    'svr__epsilon': [0.1, 0.2, 0.5]
}

svr_gs = GridSearchCV(pipe5, param5, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)

svr_gs.fit(X_train, y_train)
best_model5 = svr_gs.best_estimator_

In [66]:
best_model5

In [67]:
y_pred5 = best_model5.predict(X_test)

In [68]:
mae = mean_absolute_error(y_test, y_pred5)
mape = np.mean(np.abs((y_test - y_pred5) / y_test)) * 100
mse = mean_squared_error(y_test, y_pred5)
rmse = np.sqrt(mse)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Error (MAE): 0.3412567094709752
Mean Absolute Percentage Error (MAPE): 6.150589605620771%
Mean Squared Error (MSE): 0.3267562982696311
Root Mean Squared Error (RMSE): 0.5716260125900772


### Guardamos nuestro modelo en un .pkt

In [69]:
with open('../models/trained_model_reg_5SVC.pkl', 'wb') as archivo_salida :
    pickle.dump(best_model5, archivo_salida)

# No supervisado PCA

### Generamos el Pipeline con StandardScaler, PCA, RandomForestRegressor y sus parametros de busqueda

In [79]:
from sklearn.decomposition import PCA
pipe6 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),  
    ('rf', RandomForestRegressor())
])
param6 = {
    'pca__n_components': [5, 10],
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [5, 10],
    'rf__min_samples_split': [2, 5, 8],
    }
pca_gs = GridSearchCV(pipe6,param6, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)

pca_gs.fit(X_train, y_train)
best_model6 = pca_gs.best_estimator_

In [80]:
best_model6

In [81]:
y_pred6 = best_model6.predict(X_test)

In [82]:
mae = mean_absolute_error(y_test, y_pred6)
mape = np.mean(np.abs((y_test - y_pred6) / y_test)) * 100
mse = mean_squared_error(y_test, y_pred6)
rmse = np.sqrt(mse)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Error (MAE): 0.27678770851493845
Mean Absolute Percentage Error (MAPE): 4.827637270731422%
Mean Squared Error (MSE): 0.1991390790667187
Root Mean Squared Error (RMSE): 0.4462500185621494


### Guardamos nuestro modelo en un .pkt

In [83]:
with open('../models/trained_model_reg_6PCA.pkl', 'wb') as archivo_salida :
    pickle.dump(best_model6, archivo_salida)

# 7. PCA + Gradient Boosting

### Generamos el Pipeline con StandardScaler, PCA, GradientBoostingRegressor y sus parametros de busqueda

In [75]:
from sklearn.ensemble import GradientBoostingRegressor
pipe7 = Pipeline([
    ('scaler', StandardScaler()), 
    ('pca', PCA()), 
    ('gb', GradientBoostingRegressor())  
])

param7 = {
    'scaler': [None, StandardScaler()],
    'pca__n_components': [5, 10],
    'gb__n_estimators': [50, 100, 200],
    'gb__learning_rate': [ 0.2, 0.5, 1],
    'gb__max_depth': [3, 4, 5,6]
}

pcagb_gs = GridSearchCV(pipe7, param7, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)

pcagb_gs.fit(X_train, y_train)

best_model7 = pcagb_gs.best_estimator_

In [84]:
best_model7

In [85]:
y_pred7 = best_model7.predict(X_test)

In [86]:
mae = mean_absolute_error(y_test, y_pred7)
mape = np.mean(np.abs((y_test - y_pred7) / y_test)) * 100
mse = mean_squared_error(y_test, y_pred7)
rmse = np.sqrt(mse)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Error (MAE): 0.14337316029362934
Mean Absolute Percentage Error (MAPE): 2.5243042169970646%
Mean Squared Error (MSE): 0.1368079274889028
Root Mean Squared Error (RMSE): 0.3698755567605175


### Guardamos nuestro modelo en un .pkt

In [87]:
with open('../models/trained_model_reg_7PCAGB.pkl', 'wb') as archivo_salida :
    pickle.dump(best_model7, archivo_salida)