In [1]:
import pandas as pd  # Import pandas for data manipulation
import numpy as np  # Import numpy for numerical operations
from sklearn.model_selection import train_test_split  # Import train_test_split for splitting the data
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # Import preprocessing tools for scaling and encoding
from sklearn.compose import ColumnTransformer  # Import ColumnTransformer for applying different transformations to columns
from sklearn.pipeline import Pipeline  # Import Pipeline for chaining preprocessing and modeling steps
from sklearn.linear_model import LinearRegression  # Import LinearRegression for the regression model
from sklearn.impute import SimpleImputer  # Import SimpleImputer for handling missing values
from sklearn.metrics import mean_squared_error, r2_score  # Import metrics for model evaluation

In [2]:
# Pull the housing CSV directly from its raw GitHub location.
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
df = pd.read_csv(filepath_or_buffer=url)  # Parse the remote CSV into a DataFrame

In [3]:
# Explore the dataset
print(df.head())  # Preview the first five rows
# DataFrame.info() writes its report (dtypes, non-null counts) straight to
# stdout and returns None, so it is called bare; wrapping it in print()
# would append a stray "None" line after the report.
df.info()
print(df.describe())  # Summary statistics for the numerical columns

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639

In [4]:
# Separate the value being predicted from the columns used to predict it.
y = df["median_house_value"]  # Target: the median house value column
X = df.drop(columns=["median_house_value"])  # Features: every column except the target

In [5]:
# Group the feature columns by type so each group can be preprocessed differently.
numerical_features = list(X.select_dtypes(include="number").columns)  # Names of all numeric-dtype columns
categorical_features = ["ocean_proximity"]  # Explicitly listed categorical column

In [6]:
# Numeric branch: fill gaps first, then standardize.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),  # Missing entries take their column's median
    ("scaler", StandardScaler()),  # Rescale each column to zero mean and unit variance
])

In [7]:
# Categorical branch: fill gaps first, then one-hot encode.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),  # Missing entries take the most common category
    ("onehot", OneHotEncoder(handle_unknown="ignore")),  # One indicator column per category; unseen categories are ignored
])

In [8]:
# Route each column group through its matching preprocessing branch.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_features),  # Numeric columns -> impute + scale
    ("cat", categorical_transformer, categorical_features),  # Categorical columns -> impute + one-hot
])

In [9]:
# End-to-end estimator: preprocessing followed by ordinary linear regression.
model = Pipeline([
    ("preprocessor", preprocessor),  # Stage 1: column-wise preprocessing
    ("regressor", LinearRegression()),  # Stage 2: linear regression on the transformed features
])

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Fit the preprocessing steps and the regressor together on the training portion only.
model.fit(X_train, y_train)


In [12]:
# Run the fitted pipeline on the held-out rows to obtain predicted house values.
y_pred = model.predict(X_test)

In [13]:
# Score the predictions against the true test-set targets.
r2 = r2_score(y_test, y_pred)  # Coefficient of determination
mse = mean_squared_error(y_test, y_pred)  # Mean of the squared prediction errors
# Report both metrics, one per line.
print(f"Mean Squared Error: {mse}", f"R^2 Score: {r2}", sep="\n")

Mean Squared Error: 4908290571.346432
R^2 Score: 0.6254382675296266
