Importing libraries

In [65]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


Loading the dataset and renaming its columns

In [None]:
# Mapping from the dataset's short column codes to descriptive names
rename_dict = {
    "CRIM": "Crime_Rate",                      # Per capita crime rate by town
    "ZN": "Residential_Land_Zone",             # Proportion of residential land zoned for large lots
    "INDUS": "Non_Retail_Land_Proportion",     # Proportion of non-retail business acres per town
    "CHAS": "Proximity_to_Charles_River",      # Charles River dummy variable (1 if tract bounds river, 0 otherwise)
    "NOX": "Nitric_Oxide_Concentration",       # Nitric oxide concentration (parts per 10 million)
    "RM": "Avg_Number_of_Rooms",               # Average number of rooms per dwelling
    "AGE": "Older_Homes_Proportion",           # Proportion of owner-occupied units built before 1940
    "DIS": "Distance_to_Employment_Centers",   # Weighted distances to Boston employment centers
    "RAD": "Highway_Accessibility_Index",      # Accessibility to radial highways
    "TAX": "Property_Tax_Rate",                # Full-value property tax rate per $10,000
    "PTRATIO": "Pupil_Teacher_Ratio",          # Pupil-teacher ratio by town
    "B": "Black_Proportion",                   # 1000(Bk - 0.63)^2 where Bk is the proportion of Black residents
    "LSTAT": "Lower_Status_Population",        # Percentage of lower status of the population
    "MEDV": "Median_House_Price",              # Median value of owner-occupied homes in $1000s
}

# Read the CSV and apply the descriptive names in a single chained step
df = pd.read_csv('boston_data.csv').rename(columns=rename_dict)

# Show the column labels after renaming
print("Updated column names:")
print(df.columns)

In [None]:
# Preview only the first rows instead of rendering the whole frame
df.head()

1. Exploratory Data Analysis (EDA)


In [None]:
# Dimensions of the frame as a (rows, columns) tuple
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Number of missing values per column. The bare isna() mask is one True/False
# per cell and does not show at a glance whether anything is missing.
df.isna().sum()

In [None]:
# Number of fully duplicated rows. The bare duplicated() mask is one flag per
# row and has to be scanned by eye; the sum answers the question directly.
df.duplicated().sum()

VISUALIZING DATA  

In [None]:
# One histogram per numeric column (20 bins each) on a single 15x10-inch figure
df.hist(figsize=(15, 10), bins=20)
plt.show()


In [None]:
# Distribution of the target column: 30-bin histogram with a KDE overlay
price_ax = sns.histplot(df['Median_House_Price'], bins=30, kde=True)
price_ax.set_title('House Prices Distribution')
price_ax.set_xlabel('Price ($1000s)')
price_ax.set_ylabel('Number of Houses')
plt.show()

In [None]:
# Boxplot of the target column to make outliers visible
price_box_ax = sns.boxplot(x=df["Median_House_Price"])
price_box_ax.set_title("Boxplot of MEDV (House Prices)")
price_box_ax.set_xlabel("House Price")
plt.show()

In [None]:
# One boxplot per column, each drawn on its own 8x4-inch figure
for column_name in df.columns:
    box_fig, column_ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=df[column_name], ax=column_ax)
    column_ax.set_title(f"Boxplot of {column_name}")
    column_ax.set_xlabel(column_name)
    plt.show()

ANALYZING RELATIONSHIPS

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
corr = df.corr()
corr_fig, heat_ax = plt.subplots(figsize=(12, 10))  # 12x10-inch canvas
sns.heatmap(corr, ax=heat_ax, annot=True, fmt=".2f", cmap='coolwarm')
heat_ax.set_title('Correlation Matrix')
plt.show()


Feature selection

In [None]:
# Remove the Charles River dummy column from the working frame.
# Author's rationale: it has little effect on house prices and adds no value
# for training (NOTE(review): not verified in this notebook — confirm against the correlation matrix).
df = df.drop(columns=['Proximity_to_Charles_River'])


In [90]:
# Remove the residential-zoning column from the working frame.
# Author's rationale: the values are (nearly) all zeros and hurt training
# (NOTE(review): not verified in this notebook — confirm with df.describe()).
df = df.drop(columns=['Residential_Land_Zone'])


In [91]:
# Display the working frame after the two column drops above
df


Unnamed: 0,Crime_Rate,Non_Retail_Land_Proportion,Nitric_Oxide_Concentration,Avg_Number_of_Rooms,Older_Homes_Proportion,Distance_to_Employment_Centers,Highway_Accessibility_Index,Property_Tax_Rate,Pupil_Teacher_Ratio,Black_Proportion,Lower_Status_Population,Median_House_Price
0,0.006280,2.31,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.026587,7.07,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.026568,7.07,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.031360,2.18,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.064636,2.18,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.058973,11.93,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.043323,11.93,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.057311,11.93,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.098931,11.93,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [None]:
# Log transformation for `Crime_Rate` to reduce skewness.
# The assignment overwrites the column with a function of itself, so running
# this cell a second time would apply log1p twice. A marker stored in df.attrs
# makes the cell safe to re-run: the transform is applied at most once per
# loaded frame.
if not df.attrs.get('crime_rate_log1p_applied', False):
    df['Crime_Rate'] = np.log1p(df['Crime_Rate'])  # log(1 + x) to avoid log(0)
    df.attrs['crime_rate_log1p_applied'] = True

Standardizing the features (the log transformation of `Crime_Rate` is applied in the cell above)


In [92]:
# Standardize every feature column; the target column is left unscaled.
# NOTE(review): the scaler here is fitted on all rows of df, i.e. before any
# train/test split, so `scaled_data` is suitable for inspection but its
# statistics include rows that later end up in the test set.

# Separate features and target
X = df.drop(columns=['Median_House_Price'])
y = df['Median_House_Price']

# Fit the scaler and rebuild a DataFrame so the column names are kept
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Re-attach the (unscaled) target; its index is reset to line up with X_scaled
scaled_data = pd.concat([X_scaled, y.reset_index(drop=True)], axis=1)

# Display the first few rows of the scaled dataset
print(scaled_data.head())


   Crime_Rate  Non_Retail_Land_Proportion  Nitric_Oxide_Concentration  \
0   -0.952548                   -1.287909                   -0.144217   
1   -0.910465                   -0.593381                   -0.740262   
2   -0.910504                   -0.593381                   -0.740262   
3   -0.900573                   -1.306878                   -0.835284   
4   -0.831614                   -1.306878                   -0.835284   

   Avg_Number_of_Rooms  Older_Homes_Proportion  \
0             0.413672               -0.120013   
1             0.194274                0.367166   
2             1.282714               -0.265812   
3             1.016303               -0.809889   
4             1.228577               -0.511180   

   Distance_to_Employment_Centers  Highway_Accessibility_Index  \
0                        0.140214                    -0.982843   
1                        0.557160                    -0.867883   
2                        0.557160                    -0.867883

In [93]:
# Display df again; the scaling cell above writes its result to `scaled_data`
# and does not assign back into df
df

Unnamed: 0,Crime_Rate,Non_Retail_Land_Proportion,Nitric_Oxide_Concentration,Avg_Number_of_Rooms,Older_Homes_Proportion,Distance_to_Employment_Centers,Highway_Accessibility_Index,Property_Tax_Rate,Pupil_Teacher_Ratio,Black_Proportion,Lower_Status_Population,Median_House_Price
0,0.006280,2.31,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.026587,7.07,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.026568,7.07,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.031360,2.18,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.064636,2.18,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.058973,11.93,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.043323,11.93,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.057311,11.93,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.098931,11.93,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


TRAIN TEST SPLIT

In [94]:

# Separate features (X) and target variable (y) from the unscaled frame.
# Splitting happens BEFORE scaling so that the scaler's mean/std are learned
# from the training rows only; fitting it on the full dataset would leak
# test-set statistics into the features the models are trained on.
X = df.drop(columns=['Median_House_Price'])
y = df['Median_House_Price']

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training features, then apply the same transform to
# both splits; DataFrames are rebuilt so column names and row labels are kept.
train_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(train_scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(train_scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Check shapes to confirm the split
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (404, 11)
Testing set size: (102, 11)


TRAINING THE MODEL 

In [95]:
# Baseline: ordinary least-squares regression on the training split.
# LinearRegression, mean_squared_error and r2_score come from the notebook's
# top import cell.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict on the held-out test rows
y_pred = linear_model.predict(X_test)

# Evaluate: mean squared error and coefficient of determination on the test set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2 * 100:.2f}%")  # R² shown as a percentage

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


R² Score: 64.82%
Mean Squared Error: 25.802283997499792
R^2 Score: 0.6481528309934507


In [96]:
# Random forest with default hyper-parameters and a fixed seed for
# reproducibility. RandomForestRegressor and r2_score come from the
# notebook's top import cell.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Note: this rebinds y_pred, replacing the linear-regression predictions
y_pred = model.predict(X_test)
print(f"R² Score: {r2_score(y_test, y_pred) * 100:.2f}%")


R² Score: 89.11%


In [None]:
# Display all data in a specific column.
# 'Residential_Land_Zone' was dropped from df in the feature-selection step,
# so indexing it here raises KeyError when the notebook is run top to bottom;
# inspect a column that still exists instead.
print(df['Crime_Rate'])
