# In [None]:
# Experiment 4: Linear Regression
# Aim: Predict housing prices using the Boston dataset

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch

try:
    from sklearn.datasets import load_boston
except ImportError:
    # load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so
    # the import itself fails on current releases. Fall back to reading the
    # original data file (see _load_boston_dataset below).
    load_boston = None

# Original StatLib copy of the dataset; this is the source the scikit-learn
# deprecation notice points to as a replacement for load_boston().
_BOSTON_DATA_URL = "http://lib.stat.cmu.edu/datasets/boston"

# Column names in the order load_boston() used for `feature_names`.
_BOSTON_FEATURE_NAMES = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
]


def _load_boston_dataset():
    """Return the Boston housing data as a Bunch with data/target/feature_names.

    Uses ``sklearn.datasets.load_boston`` when the installed scikit-learn
    still provides it. Otherwise the raw file is downloaded from
    ``_BOSTON_DATA_URL`` (network access required) and reshaped into the
    same 13-feature layout.
    """
    if load_boston is not None:
        return load_boston()

    # The raw file has 22 lines of description, then every record is split
    # across two physical lines: 11 values on the first, and the remaining
    # two features (B, LSTAT) plus the target (MEDV) on the second.
    raw = pd.read_csv(_BOSTON_DATA_URL, sep=r"\s+", skiprows=22, header=None)
    first_lines = raw.iloc[::2].reset_index(drop=True)
    second_lines = raw.iloc[1::2].reset_index(drop=True)
    features = pd.concat(
        [first_lines, second_lines.iloc[:, :2]], axis=1, ignore_index=True
    )
    return Bunch(
        data=features.to_numpy(),
        target=second_lines.iloc[:, 2].to_numpy(),
        feature_names=_BOSTON_FEATURE_NAMES,
    )


# Load the Boston Housing Dataset (works with or without load_boston)
boston = _load_boston_dataset()

# Create DataFrame from data and add target column 'PRICE'
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['PRICE'] = boston.target

# Split the dataset into features (X) and target (y)
X = df.drop('PRICE', axis=1)  # All columns except 'PRICE'
y = df['PRICE']               # Target variable

# Split the data into training (80%) and testing (20%) sets.
# random_state is fixed so the split (and the reported scores) are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict prices for the held-out test set
y_pred = model.predict(X_test)

# Evaluate the model on data it has not seen during training
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

# -------------------------------
# 📘 Working and Viva Questions
# -------------------------------

# Q1: What is Linear Regression?
# A1: It is a supervised learning algorithm that models the relationship between independent variables (features) and a continuous dependent variable (target).

# Q2: What does fit() do?
# A2: It trains the model on the training data to learn the relationship between features and the target variable.

# Q3: What is the use of train_test_split?
# A3: It splits the dataset into training and testing sets to check how well the model generalizes to unseen data.

# Q4: What is y_pred?
# A4: It contains the predicted values of housing prices generated by the trained model for the test data.

# Q5: What is Mean Squared Error (MSE)?
# A5: MSE is the average of the squared differences between actual and predicted values. Lower MSE indicates better performance.

# Q6: What is R2 Score?
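# A6: Also called the coefficient of determination, it measures how well the model's predictions fit the actual data. R2 = 1 is perfect prediction; R2 = 0 means the model does no better than always predicting the mean of the target, and R2 can be negative when the model does worse than that.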

# Q7: Why is Linear Regression used in this experiment?
# A7: To predict continuous values (house prices) based on numerical features like number of rooms, crime rate, etc.

# ✅ Done!
