In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
# validacao cruzada
from sklearn.model_selection import KFold, StratifiedKFold
# regressao penalizada
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler


# Load the wine-quality CSV into a DataFrame, report its dimensions and
# preview the first five rows.
CSV_PATH = '/content/drive/MyDrive/FIAP/Deep Learning/MLP/winequalityN.csv'
df = pd.read_csv(CSV_PATH)
print(df.shape)
df.head()

(6497, 13)


Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [None]:
# Predictor columns and response column used by the regularized-regression
# (L1 / L2) cross-validation examples in this notebook.
features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
target = ['quality']

# Work on a separate frame with every row containing a missing value removed;
# the frame loaded from disk (`df`) is left untouched.
df1 = df.dropna()

# Separate predictors (x) from the response (y). `target` is a list, so y is a
# single-column DataFrame rather than a Series.
x = df1[features]
y = df1[target]
x_columns = list(x.columns)  # kept because scaling below returns a bare ndarray

# Standardize every predictor to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test-set statistics influence the
# scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Sanity check: per-column count of remaining missing values.
df1.isnull().sum()

Unnamed: 0,0
type,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0


In [None]:
# Hold out 20% of the rows as a test split (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Illustration of how K-fold cross-validation partitions the training split:
# for each of the 5 shuffled folds, print the row positions used for fitting
# and the row positions held out for validation.
crossKfold = KFold(n_splits=5, shuffle=True, random_state=42)
for k, (train, val) in enumerate(crossKfold.split(x_train, y_train), start=1):
  print(f'----Kfold: {k}', '--Treino', train, '--Validacao', val, sep='\n')

----Kfold: 1
--Treino
[   0    1    2 ... 5167 5168 5169]
--Validacao
[   8   12   17 ... 5158 5165 5166]
----Kfold: 2
--Treino
[   0    1    2 ... 5165 5166 5167]
--Validacao
[   6   14   19 ... 5163 5168 5169]
----Kfold: 3
--Treino
[   1    2    3 ... 5166 5168 5169]
--Validacao
[   0    7   22 ... 5152 5159 5167]
----Kfold: 4
--Treino
[   0    3    4 ... 5167 5168 5169]
--Validacao
[   1    2   10 ... 5153 5157 5161]
----Kfold: 5
--Treino
[   0    1    2 ... 5167 5168 5169]
--Validacao
[   3    4    5 ... 5155 5162 5164]


In [None]:
# Sweep a range of Ridge penalties: fit on the training split, then report
# R2, MSE and RMSE measured on the test split for each alpha.
alphas = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1, 10]
for a in alphas:
  model = Ridge(alpha=a)
  model.fit(x_train, y_train)
  pred_y = model.predict(x_test)
  score = model.score(x_test, y_test)  # R2 on the test split
  mse = mean_squared_error(y_test, pred_y)
  rmse = np.sqrt(mse)
  print("Alpha:{0:.6f}, R2:{1:.4f}, MSE:{2:.2f}, RMSE:{3:.2f}".format(a, score, mse, rmse))


Alpha:0.000001, R2:0.3461, MSE:0.52, RMSE:0.72
Alpha:0.000010, R2:0.3461, MSE:0.52, RMSE:0.72
Alpha:0.000100, R2:0.3461, MSE:0.52, RMSE:0.72
Alpha:0.001000, R2:0.3461, MSE:0.52, RMSE:0.72
Alpha:0.010000, R2:0.3461, MSE:0.52, RMSE:0.72
Alpha:0.100000, R2:0.3461, MSE:0.52, RMSE:0.72
Alpha:0.500000, R2:0.3461, MSE:0.52, RMSE:0.72
Alpha:1.000000, R2:0.3461, MSE:0.52, RMSE:0.72
Alpha:10.000000, R2:0.3459, MSE:0.52, RMSE:0.72


In [None]:
# Cross-validation scheme RidgeCV uses to pick the penalty:
# 10 shuffled folds with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Candidate penalties on a log scale from 1e-3 to 1e3. A narrow linear grid
# such as 0.1..0.9 can only ever return a value in that range, so a selected
# alpha equal to the largest candidate means the search was cut off at the
# boundary; spanning several orders of magnitude lets the optimum fall inside
# the searched range.
ridge_alphas = np.logspace(-3, 3, 25)
model = RidgeCV(alphas=ridge_alphas, cv=cv, scoring='neg_mean_squared_error')
model.fit(x_train, y_train)

# Penalty selected by cross-validation, then R2 on the held-out test split.
print(model.alpha_)
print(model.score(x_test, y_test))

# Coefficient table: intercept first, then one row per predictor.
# np.append flattens intercept_ and coef_ into a single 1-D vector.
coef_ridge = pd.DataFrame(np.append(model.intercept_, model.coef_), ['intercepto'] + x_columns, columns=['Coeficientes'])

# Display the fitted Ridge coefficients.
coef_ridge

0.9
0.34611795698924097


Unnamed: 0,Coeficientes
intercepto,5.821612
fixed acidity,0.077273
volatile acidity,-0.22362
citric acid,-0.022243
residual sugar,0.190095
chlorides,-0.02108
free sulfur dioxide,0.107385
total sulfur dioxide,-0.129402
density,-0.136622
pH,0.064426


In [None]:
# Cross-validation scheme LassoCV uses to pick the penalty:
# 3 shuffled folds with a fixed seed.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# The solver tolerance is left at the library default: an explicit tol=1 is
# orders of magnitude looser and lets coordinate descent stop before the
# coefficients have settled, which distorts both the coefficient table and the
# test score. max_iter is raised so the near-zero alpha in the grid has room
# to converge.
model = LassoCV(alphas=np.arange(0.000001, 1, 0.1), cv=cv, max_iter=10000)

# LassoCV expects a 1-D target, hence the ravel on the single-column frame.
model.fit(x_train, y_train.values.ravel())

# Penalty selected by cross-validation, then R2 on the held-out test split.
print(model.alpha_)
print(model.score(x_test, y_test))

# Coefficient table: intercept first, then one row per predictor. Stored under
# its own name so it does not overwrite the Ridge coefficient table.
coef_lasso = pd.DataFrame(np.append(model.intercept_, model.coef_), ['intercepto'] + x_columns, columns=['Coeficientes'])
coef_lasso

0.0
0.3341960122298021


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Coeficientes
intercepto,5.821019
fixed acidity,-0.018269
volatile acidity,-0.261113
citric acid,0.025788
residual sugar,0.122467
chlorides,-0.046044
free sulfur dioxide,0.041773
total sulfur dioxide,-0.155099
density,-0.107198
pH,0.04741


In [None]:
# Plain linear regression with no regularization, fitted on the training split
# and scored (R2) on the test split.
model = LinearRegression().fit(x_train, y_train)
print(model.score(x_test, y_test))

# Coefficient table: intercept first, then one row per predictor.
lr_coef = pd.DataFrame(
    {'Coeficientes': np.concatenate([np.ravel(model.intercept_), np.ravel(model.coef_)])},
    index=['intercepto', *x_columns],
)
lr_coef

0.34613659277743813


Unnamed: 0,Coeficientes
intercepto,5.821612
fixed acidity,0.077497
volatile acidity,-0.223648
citric acid,-0.022288
residual sugar,0.190421
chlorides,-0.021027
free sulfur dioxide,0.107436
total sulfur dioxide,-0.129503
density,-0.13702
pH,0.064563
