# Wine Quality Dataset
## l1 , l2 (regularization) with k fold (cross validation)
![image](https://raw.githubusercontent.com/Masterx-AI/Project_Wine_Quality_Investigation/main/wq.jpg)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset
data = pd.read_csv('/content/WineQT.csv')

In [None]:
# Head of the dataframe
print(data.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  Id  
0      9.4        5   0  
1      9.8        5   1  
2      9

In [None]:
# Tail of the dataframe
print(data.tail())

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
1138            6.3             0.510         0.13             2.3      0.076   
1139            6.8             0.620         0.08             1.9      0.068   
1140            6.2             0.600         0.08             2.0      0.090   
1141            5.9             0.550         0.10             2.2      0.062   
1142            5.9             0.645         0.12             2.0      0.075   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
1138                 29.0                  40.0  0.99574  3.42       0.75   
1139                 28.0                  38.0  0.99651  3.42       0.82   
1140                 32.0                  44.0  0.99490  3.45       0.58   
1141                 39.0                  51.0  0.99512  3.52       0.76   
1142                 32.0                  44.0  0.99547  3.57       0.71   

      alcohol  quality    Id  
1138     11.0      

In [None]:
# Descriptive statistics of the dataframe
print(data.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1143.000000       1143.000000  1143.000000     1143.000000   
mean        8.311111          0.531339     0.268364        2.532152   
std         1.747595          0.179633     0.196686        1.355917   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.392500     0.090000        1.900000   
50%         7.900000          0.520000     0.250000        2.200000   
75%         9.100000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1143.000000          1143.000000           1143.000000  1143.000000   
mean      0.086933            15.615486             45.914698     0.996730   
std       0.047267            10.250486             32.782130     0.001925   
min       0.012000             1.000000         

In [None]:
# Descriptive statistics of the dataframe
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
 12  Id                    1143 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 116.2 KB
None


In [None]:
# Convert 'Id' column to float
data['Id'] = data['Id'].astype(float)

# Convert 'quality' column to float
data['quality'] = data['quality'].astype(float)

print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   float64
 12  Id                    1143 non-null   float64
dtypes: float64(13)
memory usage: 116.2 KB
None


In [None]:
# Separate features (X) and target (y)
X = data.drop('quality', axis=1)
y = data['quality']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (standardization)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 0.3824283521291972
R-squared: 0.3127638539508183


In [None]:
from sklearn.model_selection import KFold

# Initialize KFold with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a list to store cross-validated scores
cv_scores = []

# Iterate through the folds
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the model on the current fold
    model = LinearRegression()
    model.fit(X_train_fold, y_train_fold)

    # Make predictions on the validation set
    y_pred_fold = model.predict(X_val_fold)

    # Calculate the score (e.g., R-squared) for the current fold
    r2_fold = r2_score(y_val_fold, y_pred_fold)
    cv_scores.append(r2_fold)

# Calculate the average cross-validated score
average_cv_score = sum(cv_scores) / len(cv_scores)

print("Cross-Validated R-squared Scores:", cv_scores)
print("Average Cross-Validated R-squared:", average_cv_score)

Cross-Validated R-squared Scores: [0.3412693271855066, 0.36488133065648265, 0.3625937104370105, 0.27035546793445075, 0.3960319241887754]
Average Cross-Validated R-squared: 0.3470263520804452


In [None]:
from sklearn.linear_model import Ridge, Lasso

# Initialize Ridge (L2) and Lasso (L1) models
ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=1.0)

# Perform cross-validation for Ridge
ridge_cv_scores = []
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    ridge_model.fit(X_train_fold, y_train_fold)
    y_pred_fold = ridge_model.predict(X_val_fold)
    r2_fold = r2_score(y_val_fold, y_pred_fold)
    ridge_cv_scores.append(r2_fold)

average_ridge_cv_score = sum(ridge_cv_scores) / len(ridge_cv_scores)
print("Ridge Cross-Validated R-squared Scores:", ridge_cv_scores)
print("Average Ridge Cross-Validated R-squared:", average_ridge_cv_score)

Ridge Cross-Validated R-squared Scores: [0.3420590914526599, 0.3651868038215017, 0.36260933047291477, 0.27061704715443824, 0.39581313352948466]
Average Ridge Cross-Validated R-squared: 0.3472570812861998


In [None]:
# Perform cross-validation for Lasso
lasso_cv_scores = []
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    lasso_model.fit(X_train_fold, y_train_fold)
    y_pred_fold = lasso_model.predict(X_val_fold)
    r2_fold = r2_score(y_val_fold, y_pred_fold)
    lasso_cv_scores.append(r2_fold)

average_lasso_cv_score = sum(lasso_cv_scores) / len(lasso_cv_scores)
print("Lasso Cross-Validated R-squared Scores:", lasso_cv_scores)
print("Average Lasso Cross-Validated R-squared:", average_lasso_cv_score)

Lasso Cross-Validated R-squared Scores: [-0.021506896129041664, -0.0002557072840272401, -0.005409748215411847, -0.0007465746671997397, -0.005629016744555182]
Average Lasso Cross-Validated R-squared: -0.006709588608047134


In [None]:
# Compare the performance of the models
print("\nModel Performance Comparison:")
print("Linear Regression Average Cross-Validated R-squared:", average_cv_score)
print("Ridge Regression Average Cross-Validated R-squared:", average_ridge_cv_score)
print("Lasso Regression Average Cross-Validated R-squared:", average_lasso_cv_score)


Model Performance Comparison:
Linear Regression Average Cross-Validated R-squared: 0.3470263520804452
Ridge Regression Average Cross-Validated R-squared: 0.3472570812861998
Lasso Regression Average Cross-Validated R-squared: -0.006709588608047134


In [None]:
from sklearn.linear_model import ElasticNet

# Initialize ElasticNet model
elasticnet_model = ElasticNet(alpha=1.0, l1_ratio=0.1)

# Perform cross-validation for ElasticNet
elasticnet_cv_scores = []
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    elasticnet_model.fit(X_train_fold, y_train_fold)
    y_pred_fold = elasticnet_model.predict(X_val_fold)
    r2_fold = r2_score(y_val_fold, y_pred_fold)
    elasticnet_cv_scores.append(r2_fold)

average_elasticnet_cv_score = sum(elasticnet_cv_scores) / len(elasticnet_cv_scores)
print("ElasticNet Cross-Validated R-squared Scores:", elasticnet_cv_scores)
print("Average ElasticNet Cross-Validated R-squared:", average_elasticnet_cv_score)

ElasticNet Cross-Validated R-squared Scores: [0.22109716354024556, 0.24428875573304643, 0.23520334534363585, 0.22095382840099298, 0.23136201497638254]
Average ElasticNet Cross-Validated R-squared: 0.23058102159886068


In [None]:
# Compare the performance of the models
print("\nModel Performance Comparison:")
print("Linear Regression Average Cross-Validated R-squared:", average_cv_score)
print("Ridge Regression Average Cross-Validated R-squared:", average_ridge_cv_score)
print("Lasso Regression Average Cross-Validated R-squared:", average_lasso_cv_score)
print("ElasticNet Regression Average Cross-Validated R-squared:", average_elasticnet_cv_score)


Model Performance Comparison:
Linear Regression Average Cross-Validated R-squared: 0.3470263520804452
Ridge Regression Average Cross-Validated R-squared: 0.3472570812861998
Lasso Regression Average Cross-Validated R-squared: -0.006709588608047134
ElasticNet Regression Average Cross-Validated R-squared: 0.23058102159886068


In [None]:
# Analyze the results
print("\nAnalysis:")
if average_ridge_cv_score > average_cv_score:
  print("Ridge Regression outperforms Linear Regression, indicating that L2 regularization helps reduce overfitting.")
if average_lasso_cv_score > average_cv_score:
  print("Lasso Regression outperforms Linear Regression, suggesting that L1 regularization can also improve model performance by reducing variance and potentially selecting important features.")
if average_elasticnet_cv_score > average_cv_score:
    print("ElasticNet Regression outperforms Linear Regression, indicating that a combination of L1 and L2 regularization is effective in improving model performance and controlling overfitting.")


# Analyze how regularization impacts model performance
if average_ridge_cv_score > average_cv_score or average_lasso_cv_score > average_cv_score or average_elasticnet_cv_score > average_cv_score:
    print("Regularization techniques (Ridge, Lasso, ElasticNet) generally improve model performance by reducing variance and controlling overfitting, leading to better generalization to unseen data.")
else:
    print("Regularization did not significantly improve model performance in this case. The model might not be experiencing significant overfitting.")


Analysis:
Ridge Regression outperforms Linear Regression, indicating that L2 regularization helps reduce overfitting.
Regularization techniques (Ridge, Lasso, ElasticNet) generally improve model performance by reducing variance and controlling overfitting, leading to better generalization to unseen data.


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Ridge, Lasso, and ElasticNet
param_grid_ridge = {'alpha': [0.1, 1.0, 10.0]}
param_grid_lasso = {'alpha': [0.1, 1.0, 10.0]}
param_grid_elasticnet = {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.1, 0.5, 0.9]}

# Create GridSearchCV objects for each model
grid_search_ridge = GridSearchCV(Ridge(), param_grid_ridge, cv=5, scoring='r2')
grid_search_lasso = GridSearchCV(Lasso(), param_grid_lasso, cv=5, scoring='r2')
grid_search_elasticnet = GridSearchCV(ElasticNet(), param_grid_elasticnet, cv=5, scoring='r2')

# Fit the GridSearchCV objects to the training data
grid_search_ridge.fit(X_train, y_train)
grid_search_lasso.fit(X_train, y_train)
grid_search_elasticnet.fit(X_train, y_train)

# Print the best parameters and scores for each model
print("Ridge Best Parameters:", grid_search_ridge.best_params_)
print("Ridge Best Score:", grid_search_ridge.best_score_)

print("Lasso Best Parameters:", grid_search_lasso.best_params_)
print("Lasso Best Score:", grid_search_lasso.best_score_)

print("ElasticNet Best Parameters:", grid_search_elasticnet.best_params_)
print("ElasticNet Best Score:", grid_search_elasticnet.best_score_)

Ridge Best Parameters: {'alpha': 10.0}
Ridge Best Score: 0.3549123956155578
Lasso Best Parameters: {'alpha': 0.1}
Lasso Best Score: 0.31544780926112004
ElasticNet Best Parameters: {'alpha': 0.1, 'l1_ratio': 0.1}
ElasticNet Best Score: 0.355497599623415
