<a href="https://colab.research.google.com/github/m1nc/CS4375-linear-regression-gradient-descent/blob/main/part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
import numpy as np #numbers
import pandas as pd # data
import matplotlib.pyplot as plt #plots

from sklearn.model_selection import train_test_split #data split
from sklearn.preprocessing import StandardScaler #standardizer
from sklearn.linear_model import SGDRegressor #linear regression w/ gradient descent
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score



In [65]:
#loading data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

df = pd.read_csv(url, sep=";")

df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [66]:
print("Shape:", df.shape)
print("\nMissing values:\n", df.isnull().sum())
print("\nDuplicate rows:", df.duplicated().sum())


Shape: (1599, 12)

Missing values:
 fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

Duplicate rows: 240


In [67]:
#removing duplicates only since there are no missing values
df = df.drop_duplicates()


In [68]:
#defining features and targets

X = df.drop("quality", axis=1)
y = df["quality"]


In [69]:

#standardizes features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [70]:
#Splits between training and testing data
#80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

#print("# of training samples:", X_train.shape[0])
#print("# of Test samples:", X_test.shape[0])


In [71]:
# tuning parameters
learning_rates = [0.0001, 0.001, 0.01, 0.1]
iterations = [500, 1000, 2000]

results = []

best_test_mse = float("inf")
best_params = None

with open("part2_log.txt", "w") as f:

    for lr in learning_rates:
        for it in iterations:

            model = SGDRegressor(
                learning_rate='constant',
                eta0=lr,
                max_iter=it,
                random_state=42
            )

            model.fit(X_train, y_train)

            y_train_pred = model.predict(X_train)#makes predictions
            y_test_pred = model.predict(X_test)

            #compute errors
            train_mse = mean_squared_error(y_train, y_train_pred)
            test_mse = mean_squared_error(y_test, y_test_pred)
            r2 = r2_score(y_test, y_test_pred)
            evs = explained_variance_score(y_test, y_test_pred)

            results.append([lr, it, train_mse, test_mse])

            # writing each trial
            f.write(
                f"LearningRate={lr}, Iterations={it}, "
                f"Train MSE={train_mse:.4f}, "
                f"Test MSE={test_mse:.4f}, "
                f"R2={r2:.4f}, "
                f"EVS={evs:.4f}\n"
            )

            # checks if model is the best
            if test_mse < best_test_mse:
                best_test_mse = test_mse
                best_params = (lr, it)

    # Write best results at end
    f.write("\nBest Parameters:\n")
    f.write(f"Learning Rate: {best_params[0]}\n")
    f.write(f"Iterations: {best_params[1]}\n")
    f.write(f"Best Test MSE: {best_test_mse:.4f}\n")


In [72]:

results_df = pd.DataFrame(results, columns=["Learning Rate", "Iterations", "Train MSE", "Test MSE"])
results_df.sort_values("Test MSE")


Unnamed: 0,Learning Rate,Iterations,Train MSE,Test MSE
0,0.0001,500,0.4388765,0.4164974
1,0.0001,1000,0.4388765,0.4164974
2,0.0001,2000,0.4388765,0.4164974
3,0.001,500,0.4352237,0.4223824
4,0.001,1000,0.4352237,0.4223824
5,0.001,2000,0.4352237,0.4223824
6,0.01,500,0.4404064,0.4238593
7,0.01,1000,0.4404064,0.4238593
8,0.01,2000,0.4404064,0.4238593
9,0.1,500,9589847000.0,9679613000.0


In [73]:
#building final model

#extracts best learning rate
#best_params = (best_learning_rate, best_iterations)
best_lr = best_params[0]
best_iter = best_params[1]

final_model = SGDRegressor(
    learning_rate='constant',
    eta0=best_lr,
    max_iter=best_iter,
    random_state=42
)

final_model.fit(X_train, y_train) #trains final model


In [74]:
#make predictions
y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

#calculate training MSE
train_mse = mean_squared_error(y_train, y_train_pred)
#calculate test MSE
test_mse = mean_squared_error(y_test, y_test_pred)

#calculate R^2 Score
r2 = r2_score(y_test, y_test_pred)

#calculate Explained Variance
evs = explained_variance_score(y_test, y_test_pred)

#Print results
print("Train MSE:", train_mse)
print("Test MSE:", test_mse)
print("R2 Score:", r2)
print("Explained Variance:", evs)


Train MSE: 0.43887653936636833
Test MSE: 0.41649735689201794
R2 Score: 0.41202243111990666
Explained Variance: 0.4122181565158416


In [75]:
#Remove this in final code
#testing model with data outside dataset
new_wine_df = pd.DataFrame(
    [[7.6, 0.68, 0.04, 2.1, 0.078,
      13, 38, 0.9982, 3.48,
      0.58, 9.6]],
    columns=X.columns
)

# Scale
new_wine_scaled = scaler.transform(new_wine_df)

# Predict
prediction = final_model.predict(new_wine_scaled)

print("Predicted Quality:", prediction[0])


Predicted Quality: 5.066691874822032


In [76]:
logging.basicConfig(
    filename="part2_log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    filemode="w"  # overwrite each run
)




/content
part2_log.txt  sample_data  test_file.txt
total 24
drwxr-xr-x 1 root root 4096 Feb 16 04:02 .
drwxr-xr-x 1 root root 4096 Feb 16 03:16 ..
drwxr-xr-x 4 root root 4096 Jan 16 14:24 .config
-rw-r--r-- 1 root root 1316 Feb 16 04:02 part2_log.txt
drwxr-xr-x 1 root root 4096 Jan 16 14:24 sample_data
-rw-r--r-- 1 root root    5 Feb 16 04:02 test_file.txt
