In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:


# Read the tab-separated crime dataset into a DataFrame
crime_data = pd.read_csv("uscrime.txt", sep="\t")

# Two-stage split with a fixed seed: hold out 40% of the rows first, then cut
# that hold-out in half, giving roughly 60% train / 20% validation / 20% test.
train_data, temp_data = train_test_split(
    crime_data,
    test_size=0.4,
    random_state=123,
)
valid_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=123,
)

# lm1: linear regression of 'Crime' on every other column of the training set
y_train = train_data['Crime']
X_train = train_data.drop('Crime', axis=1)
lm1 = LinearRegression().fit(X_train, y_train)

# Report the fitted parameters of lm1
print(f"Intercept: {lm1.intercept_}")
print(f"Coefficients: {lm1.coef_}")

# lm2: same target, restricted to a fixed subset of the predictors
selected_features = ['Ed', 'Po1', 'Ineq', 'Prob']
X_train_selected = train_data[selected_features]
lm2 = LinearRegression().fit(X_train_selected, y_train)

# Report the fitted parameters of lm2
print(f"Intercept: {lm2.intercept_}")
print(f"Coefficients: {lm2.coef_}")

# Score both fitted models on the validation split.
# Each model must see the same columns it was trained on.
y_valid = valid_data['Crime']
X_valid = valid_data.drop(columns=['Crime'])
X_valid_selected = valid_data[selected_features]

# Predictions
y_valid_pred_lm1 = lm1.predict(X_valid)
y_valid_pred_lm2 = lm2.predict(X_valid_selected)

# Mean squared error, and its square root to get back to the target's units
mse_lm1 = mean_squared_error(y_valid, y_valid_pred_lm1)
mse_lm2 = mean_squared_error(y_valid, y_valid_pred_lm2)
rmse_lm1, rmse_lm2 = np.sqrt(mse_lm1), np.sqrt(mse_lm2)

print(f"Model lm1 - Validation MSE: {mse_lm1}, Validation RMSE: {rmse_lm1}")
print(f"Model lm2 - Validation MSE: {mse_lm2}, Validation RMSE: {rmse_lm2}")

# 5-fold cross-validation on the training set, same settings for both models.
# The scorer reports *negated* MSE, so the sign is flipped before the square root.
cv_options = {'cv': 5, 'scoring': 'neg_mean_squared_error'}

# Full-predictor model (lm1)
cv_scores_lm1 = cross_val_score(LinearRegression(), X_train, y_train, **cv_options)
cv_rmse_lm1 = np.sqrt(np.negative(cv_scores_lm1))

# Selected-predictor model (lm2)
cv_scores_lm2 = cross_val_score(LinearRegression(), X_train_selected, y_train, **cv_options)
cv_rmse_lm2 = np.sqrt(np.negative(cv_scores_lm2))

# Summarise each model by the mean of its per-fold RMSE values
print(f"Model lm1 - Cross-Validation RMSE: {cv_rmse_lm1.mean()}")
print(f"Model lm2 - Cross-Validation RMSE: {cv_rmse_lm2.mean()}")

# Pick the model with the lower mean cross-validation RMSE.
# The comparison is strict, so an exact tie selects lm2.
best_model = lm1 if cv_rmse_lm1.mean() < cv_rmse_lm2.mean() else lm2

# Evaluate only the selected model on the held-out test set
X_test = test_data.drop(columns=['Crime'])
y_test = test_data['Crime']

# Use identity (`is`), not `==`: the question is which fitted object was
# selected above, not whether two estimators compare equal.
if best_model is lm1:
    y_test_pred = best_model.predict(X_test)
else:
    # lm2 was fitted on the reduced feature set, so it must be given the same columns
    X_test_selected = test_data[selected_features]
    y_test_pred = best_model.predict(X_test_selected)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)

print(f"Best Model - Test MSE: {mse_test}, Test RMSE: {rmse_test}")

# Prepare a summary table with one row per model.
# Only the model selected by cross-validation was scored on the test set, so
# the test metrics go on that model's row and the other row stays NaN.
# (Previously the lm1 row was always NaN, which dropped the test results
# from the table whenever lm1 was the selected model.)
is_selected = [best_model is lm1, best_model is lm2]

summary_table = pd.DataFrame({
    'Model': ['lm1', 'lm2'],
    'Validation_MSE': [mse_lm1, mse_lm2],
    'Validation_RMSE': [rmse_lm1, rmse_lm2],
    'CV_RMSE': [cv_rmse_lm1.mean(), cv_rmse_lm2.mean()],
    'Test_MSE': [mse_test if chosen else np.nan for chosen in is_selected],
    'Test_RMSE': [rmse_test if chosen else np.nan for chosen in is_selected],
})

print(summary_table)
