# Lab-4: Linear Regression on the Diabetes Dataset

**Objective:**

Build a multiple linear regression model to predict disease progression one year after baseline using the medical features in the Diabetes dataset. Perform data exploration and visualization, then evaluate model performance using MSE, MAE, and R².

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
# NOTE: the "ignore" action with no category or module filter silences every
# warning for the rest of the session, not only the noisy plotting ones.
warnings.filterwarnings("ignore")


In [None]:
# Fetch the diabetes dataset object (feature matrix, target, description).
diabetes = load_diabetes()

# Assemble a single table: the feature matrix under its column names, with
# the regression target appended as the final 'target' column.
df = pd.DataFrame(
    data=diabetes.data,
    columns=diabetes.feature_names,
).assign(target=diabetes.target)

# Print the description text that ships with the dataset.
print(diabetes.DESCR)


In [None]:
# Preview the top of the table (first 5 rows).
df.head(n=5)

In [None]:
# Print a structural summary of the frame: each column's dtype and
# non-null count, which doubles as a first check for missing values.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# numeric column, including the 'target' column.
df.describe()

In [None]:
# Count missing entries per column.
df.isna().sum()

In [None]:
# One histogram per column of df, drawn together on a single grid.
hist_figsize = (12, 10)
df.hist(figsize=hist_figsize)

# Tighten the subplot spacing before rendering.
plt.tight_layout()
plt.show()

In [None]:
# Draw one small scatter plot per feature, each against the target column.
for feature in diabetes.feature_names:
    plt.figure(figsize=(5, 3))
    # Columns are looked up by name in df rather than passed as Series.
    sns.scatterplot(data=df, x=feature, y='target')
    plt.title(f'{feature} vs Target')
    plt.show()

In [None]:
# Pairwise correlations between all columns (features and target).
corr_matrix = df.corr()

# Render the matrix as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Separate predictors (every column except 'target') from the response.
X, y = df.drop(columns='target'), df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the resulting shapes of the training and test feature sets.
(X_train.shape, X_test.shape)

In [None]:
# Fit a linear regression on the training split; fit() returns the
# estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Generate predictions for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# Score the test-set predictions against the true target values.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report each metric rounded to two decimal places.
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R-squared Score: {r2:.2f}")

In [None]:
# Pair each feature name with its fitted coefficient.
coefficients = pd.DataFrame(
    {
        'Feature': X.columns,
        'Coefficient': model.coef_,
    }
)

# Show the fitted intercept first, then the per-feature coefficient table.
print("Intercept:", model.intercept_)
print(coefficients)