In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
df = pd.read_csv('Boston.csv')
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Ensure df is a fresh copy
df = df.copy()

# Handle Missing Values
df.fillna(df.mean(numeric_only=True), inplace=True)  # Fill missing values with mean

#  Handle Outliers using IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Replace outliers with median instead of removing them (better than dropping data)
for col in df.select_dtypes(include=[np.number]).columns:  # Apply only to numerical columns
    df[col] = np.where((df[col] < lower_bound[col]) | (df[col] > upper_bound[col]),
                        df[col].median(), df[col])

# Encode Categorical Data
categorical_features = df.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_features:
    df[col] = le.fit_transform(df[col])

#  Normalize/Standardize Data
scaler = MinMaxScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])

# Display dataset after preprocessing
print("Data Preprocessing Completed!\n")
print(" Missing Values after Handling:\n", df.isnull().sum())  # Check if any missing values remain
print("\n Data Overview:\n", df.head())  # Display the first 5 rows
print("\n Data Shape:", df.shape)  # Show number of rows and columns



Data Preprocessing Completed!

🔹 Missing Values after Handling:
 crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
black      0
lstat      0
price      0
dtype: int64

🔹 Data Overview:
        crim   zn     indus  chas       nox        rm       age       dis  \
0  0.000000  0.6  0.067815   0.0  0.314815  0.602988  0.641607  0.365784   
1  0.002338  0.0  0.242302   0.0  0.172840  0.548203  0.782698  0.474158   
2  0.002336  0.0  0.242302   0.0  0.172840  0.819993  0.599382  0.474158   
3  0.002902  0.0  0.063050   0.0  0.150206  0.753469  0.441813  0.609467   
4  0.006988  0.0  0.063050   0.0  0.150206  0.806475  0.528321  0.609467   

        rad       tax   ptratio     black     lstat     price  
0  0.000000  0.208015  0.202381  1.000000  0.111761  0.595469  
1  0.043478  0.104962  0.500000  1.000000  0.254814  0.517799  
2  0.043478  0.104962  0.500000  0.921716  0.079092  0.941748  
3  0.086

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairplot to visualize feature relationships
sns.pairplot(df)
plt.show()

# Correlation Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Assuming 'price' is the target variable
X = df.drop(columns=["price"])  # Features
y = df["price"]  # Target

# Split: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data Splitting Done!")
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")


In [None]:
from sklearn.linear_model import LinearRegression

# Initialize and train model
model = LinearRegression()
model.fit(X_train, y_train)

print(" Model Training Completed!")


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predictions
y_pred = model.predict(X_test)

# Evaluation Metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f" R² Score: {r2:.2f}")


In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.7, color="blue")
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color="red", linestyle="--")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs Predicted Prices")
plt.show()
