In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# Folder expected to contain train.csv and validation.csv.
# NOTE(review): this is a machine-specific absolute path; prefer a relative
# path or an environment variable so the notebook runs on other machines.
DATA_DIR = r'D:\FCDS\Regularization'

# os.chdir must be *called*. The previous `os.chdir = (...)` assignment replaced
# the function with a string (and the path had a stray leading ':'), so the
# working directory was never changed and os.chdir became unusable afterwards.
# If the folder is absent, stay in the current directory so the relative
# read_csv calls below keep resolving exactly as they did before.
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)

In [3]:
# Load the training split; the file name is resolved against the current working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")

In [4]:
# Load the validation split; the file name is resolved against the current working directory.
validate_df = pd.read_csv(filepath_or_buffer="validation.csv")

In [5]:
# Remove the 'Units_sold>1000' column from both frames before modelling.
# presumably this is a flag derived from the 'Units_sold' target and would leak
# the label into the features — TODO confirm against the data dictionary.
#
# errors='ignore' keeps the cell re-runnable: the frames are reassigned to the
# same names, so a second execution would otherwise raise KeyError because the
# column has already been removed.
train_df = train_df.drop(columns=['Units_sold>1000'], errors='ignore')
validate_df = validate_df.drop(columns=['Units_sold>1000'], errors='ignore')

In [6]:
# One-hot encode 'Segment' on the training frame. drop_first=True omits the
# first category level so it acts as the baseline for the regression.
train_df_encoded = pd.get_dummies(train_df, columns=['Segment'], drop_first=True)

# Encode the validation frame with *all* of its levels, then reindex it onto the
# training columns. Encoding both frames independently with drop_first=True is
# only correct when they contain exactly the same 'Segment' levels: otherwise a
# different level gets dropped, or columns are missing/extra, and the fitted
# model receives misaligned features. After the reindex:
#   - the training baseline level is removed from validation as well,
#   - levels seen only in validation are discarded (rows fall back to baseline),
#   - levels seen only in training are added as all-zero columns.
# NOTE(review): reindex also zero-fills any non-'Segment' column that exists in
# train but not in validation — confirm both CSVs share the same schema.
validate_df_encoded = (
    pd.get_dummies(validate_df, columns=['Segment'])
    .reindex(columns=train_df_encoded.columns, fill_value=0)
)

In [7]:
# Feature matrices: every encoded column except the 'Units_sold' target.
X_train = train_df_encoded.drop(columns=['Units_sold'])
X_validate = validate_df_encoded.drop(columns=['Units_sold'])

# Target vectors: the 'Units_sold' column of each encoded frame.
y_train = train_df_encoded['Units_sold']
y_validate = validate_df_encoded['Units_sold']

In [8]:
# Unregularized ordinary least-squares regressor; fit_intercept=True is the
# library default and is spelled out here only for readability.
model = LinearRegression(fit_intercept=True)


In [9]:
# Fit the regressor on the training split only; the validation split stays unseen.
# fit() returns the estimator, which is what the cell displays.
model.fit(X=X_train, y=y_train)

LinearRegression()

In [10]:
# Score both splits with the fitted model (training first, then validation).
train_predictions, validate_predictions = (
    model.predict(features) for features in (X_train, X_validate)
)

The R-squared metric, also known as the coefficient of determination, is a statistical measure used in machine learning to evaluate the performance of regression models. It quantifies how well the independent variables explain the variance of the dependent variable in a dataset. In machine learning, particularly in regression analysis, R-squared is used to assess how well a model fits data and predicts future outcomes. It helps in comparing different models to determine which one best explains the variability in the data.

R-squared only measures how well variables predict each other and does not imply causation between them. R-squared is a measure of goodness of fit, not a measure of the quality of the model.

Note: Causation refers to a relationship between two variables where a change in one variable directly results in a change in another variable. This implies a cause-and-effect relationship, meaning that one event (the cause) is responsible for producing another event (the effect).

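Range: For a least-squares model with an intercept, evaluated on the data it was fitted to, the R-squared value ranges from 0 to 1. A value closer to 1 indicates that a larger proportion of variance is explained by the model, suggesting a better fit. Conversely, a value closer to 0 indicates that the model does not explain much of the variance in the dependent variable. On held-out data (such as a validation set) R-squared can also be negative, which means the model predicts worse than simply predicting the mean of the observed values.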

A high R-squared value does not necessarily indicate a good model. It can sometimes be misleading, especially if overfitting occurs, where the model captures noise rather than underlying patterns.

Interpretation: An R-squared value of 1 means that the model perfectly explains all the variability of the response data around its mean. An R-squared value of 0 means that the model does not explain any of the variability. For example, an R-squared value of 0.7 implies that 70% of the variance in the dependent variable is predictable from the independent variables.


$$
R^2 = 1 - \frac{SS_{\text{res}}}{SS_{\text{tot}}}
$$

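where $SS_{\text{res}} = \sum_i (y_i - \hat{y}_i)^2$ is the sum of the squared residuals (the differences between observed and predicted values) and $SS_{\text{tot}} = \sum_i (y_i - \bar{y})^2$ is the total sum of squares (the squared differences between the observed values and their mean, which is proportional to the variance of the observed data).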

In [11]:
# Coefficient of determination on each split: observed targets vs. model predictions.
train_r2 = r2_score(y_true=y_train, y_pred=train_predictions)
validate_r2 = r2_score(y_true=y_validate, y_pred=validate_predictions)


In [12]:
# Report both scores, one per line, rounded to four decimal places.
print(
    "Training R^2: {:.4f}".format(train_r2),
    "Validation R^2: {:.4f}".format(validate_r2),
    sep="\n",
)


Training R^2: 0.4607
Validation R^2: 0.4630
