In [None]:
import pandas as pd
# Read the housing CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
csv_path = 'ca_housing.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairplot restricted to the first five columns so the grid stays readable.
sns.pairplot(df[df.columns[:5]])
plt.show()

# Correlation heatmap over the numeric columns only.
# DataFrame.corr() no longer silently drops non-numeric columns in
# pandas >= 2.0 and raises on string data instead. The frame is one-hot
# encoded later in the notebook, so it presumably contains at least one
# categorical column -- select the numeric ones explicitly.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between numeric columns')
plt.show()

In [None]:
# Count the null entries in each column (isna is the same check as isnull).
null_mask = df.isna()
missing_values = null_mask.sum()
missing_values

In [None]:
# Impute missing total_bedrooms values with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df['total_bedrooms']: that form operates on an
# intermediate Series (chained assignment), which emits a FutureWarning on
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
bedrooms_median = df['total_bedrooms'].median()
df['total_bedrooms'] = df['total_bedrooms'].fillna(bedrooms_median)

# Re-count nulls per column to confirm the imputation took effect.
missing_values = df.isnull().sum()
missing_values

In [None]:
# Handling outliers using the IQR method.
# Quantiles and the </> comparisons are only defined for numeric data, so the
# fences are computed on the numeric columns. Running them on the full frame
# raises on pandas >= 2.0 when a string column is present.
numeric_df = df.select_dtypes(include='number')
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# A cell is an outlier when it falls more than 1.5 * IQR outside [Q1, Q3]
# for its column.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = (numeric_df < lower_fence) | (numeric_df > upper_fence)

# Drop every row that has an outlier in any numeric column. The boolean mask
# shares df's index, so it can filter the full frame (all columns are kept).
# .copy() makes df_out an independent frame, so later column assignments on it
# do not trigger SettingWithCopyWarning or touch df.
df_out = df[~outliers.any(axis=1)].copy()

# Shape before and after outlier removal.
df.shape, df_out.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Work on an explicit copy so the column assignments below never write into a
# slice of the original frame (avoids SettingWithCopyWarning).
df_out = df_out.copy()

# Create the ratio features FIRST, from the raw values. Computing them after
# standardization would divide zero-mean z-scores by each other, giving
# near-zero denominators and ratios with no physical meaning.
# Assumes total_bedrooms and households are strictly positive -- TODO confirm;
# a zero would produce inf here and make the scaler below fail.
df_out['rooms_per_bedroom'] = df_out['total_rooms'] / df_out['total_bedrooms']
df_out['population_per_household'] = df_out['population'] / df_out['households']

# Standardize every numeric column, including the two new ratio features.
# NOTE(review): as in the original cell, this also scales median_house_value
# and fits the scaler on all rows before any cross-validation split.
numeric_cols = df_out.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
df_out[numeric_cols] = scaler.fit_transform(df_out[numeric_cols])

# One-hot encode the remaining categorical (non-numeric) columns.
df_out = pd.get_dummies(df_out)

df_out.head()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Split the prepared frame into the feature matrix (X) and the target (y).
X = df_out.drop('median_house_value', axis=1)
y = df_out['median_house_value']

# Fixed seed for the stochastic estimators so that re-running the notebook
# reproduces the same scores.
RANDOM_STATE = 42

# Candidate models. LinearRegression and SVR take no seed here; the tree-based
# models randomize their fitting and therefore get an explicit random_state.
models = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ('Random Forest', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('Support Vector Machine', SVR())
]

# Evaluate each model with 10-fold cross-validation. With no `scoring`
# argument, cross_val_score uses the estimator's own score method, which is
# R^2 for these regressors. Report the mean and a +/- 2 standard deviation
# band across the folds.
for name, model in models:
    cv_scores = cross_val_score(model, X, y, cv=10)
    print(f'{name}: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})')