In [14]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate

# Load the semicolon-delimited dataset.
df = pd.read_csv('data.csv', sep=';')

# Target is the 'Output' column; the features are every remaining column
# except 'Class' and 'Output'.
y = df['Output']
X = df.drop(columns=['Class', 'Output'])

In [15]:
# Linear regression baseline, fitted on the full dataset
# (no hold-out split is made in this cell).
linear_model = LinearRegression().fit(X, y)

# In-sample predictions: the metrics below are scored on the same rows
# that were used for fitting.
y_pred_train = linear_model.predict(X)

# In-sample goodness of fit: R^2, and RMSE as the square root of the MSE.
train_r2 = r2_score(y, y_pred_train)
train_mse = mean_squared_error(y, y_pred_train)
train_rmse = np.sqrt(train_mse)

print(f"Training R² Score: {train_r2:.4f}")
print(f"Training RMSE: {train_rmse:.4f}")

Training R² Score: 0.6080
Training RMSE: 2.2302


In [20]:

# 5-fold cross-validation; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Score both metrics in a single cross-validation pass, so the model is fitted
# once per fold rather than once per fold for each metric.
cv_results = cross_validate(
    linear_model, X, y,
    cv=kf,
    scoring=['r2', 'neg_mean_squared_error'],
)
cv_r2_scores = cv_results['test_r2']
cv_neg_mse_scores = cv_results['test_neg_mean_squared_error']

# The 'neg_mean_squared_error' scorer reports MSE with its sign flipped;
# negate it back before taking the square root to get a per-fold RMSE.
cv_mse_scores = -cv_neg_mse_scores
cv_rmse_scores = np.sqrt(cv_mse_scores)

# Average each metric across the folds.
cv_r2_mean = cv_r2_scores.mean()
cv_rmse_mean = cv_rmse_scores.mean()

# Differences are (training - CV): a positive R² gap or a negative RMSE gap
# means the model scores better on the data it was fitted on.
print("Performance Comparaison:")
print(f"Training R²: {train_r2:.4f}, Training RMSE: {train_rmse:.4f}")
print(f"Mean CV R²: {cv_r2_mean:.4f}, Mean CV RMSE: {cv_rmse_mean:.4f}")
print(f"Difference in R²: {train_r2 - cv_r2_mean:.4f}, Difference in RMSE: {train_rmse - cv_rmse_mean:.4f}")

Performance Comparaison:
Training R²: 0.6080, Training RMSE: 2.2302
Mean CV R²: 0.3750, Mean CV RMSE: 2.8024
Difference in R²: 0.2330, Difference in RMSE: -0.5722


### Comment

There are large differences between the training and cross-validation scores: $R^2$ drops from 0.6080 to 0.3750 and RMSE rises from 2.2302 to 2.8024. This strongly suggests overfitting and that the model generalizes poorly beyond the training data.