In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Step 1: Read the dataset into a DataFrame
df = pd.read_csv("winequality-white.csv", delimiter=';')
df


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [9]:
# Step 2: Find missing values and drop them
if df.isnull().values.any():
    df.dropna(inplace=True)

In [17]:
# Step 3: Check data types for all features
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [18]:
# Step 4: Extract dependent and independent variables
y = df['alcohol']
X = df.drop(columns=['alcohol'])

In [19]:
# Step 5: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Step 6: Create a function to calculate RMSE, MAPE, and RMSLE
def evaluate_model(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    rmsle = np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true))**2))
    return rmse, mape, rmsle

In [21]:
# Step 7: Build and evaluate models
models = {'Linear Regression': LinearRegression(),
          'SVR': SVR(),
          'Ridge': Ridge(),
          'Lasso': Lasso(),
          'Decision Tree': DecisionTreeRegressor()}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse, mape, rmsle = evaluate_model(y_test, y_pred)
    results.append({'Model': name, 'RMSE': rmse, 'MAPE': mape, 'RMSLE': rmsle})

In [22]:
# Step 8: Store the results in a DataFrame for comparison
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,RMSE,MAPE,RMSLE
0,Linear Regression,0.384668,2.838162,0.033498
1,SVR,1.017035,7.410665,0.085891
2,Ridge,0.895389,6.731005,0.077448
3,Lasso,1.082089,8.20404,0.092155
4,Decision Tree,0.487597,2.768998,0.042159
