In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the data
# The path is written as a raw string: it contains backslashes followed by
# characters that are not valid escape sequences, which a normal string
# literal reports as a SyntaxWarning on recent Python versions. The resulting
# path value is exactly the same as before.
# NOTE(review): this is an absolute, machine-specific path; consider making
# the data location configurable.
data = pd.read_csv(r"C:\Data home 2\housing.csv\housing.csv")

In [None]:
# Exploratory check: count the missing values in every column.
print("Question 1:\nThere's one feature with missing values. What is it?")
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Dataset preparation: keep only the two ocean_proximity categories of
# interest, then narrow the frame down to the columns used for modelling.
kept_proximities = ['<1H OCEAN', 'INLAND']
selected_columns = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]
in_kept_proximity = data['ocean_proximity'].isin(kept_proximities)
data = data.loc[in_kept_proximity, selected_columns]

In [None]:
# Question 2
print("\nQuestion 2:")
print("What's the median (50% percentile) for variable 'population'?")
print(data['population'].median())

# Prepare and split the dataset
np.random.seed(42)  # Set seed for reproducibility (sample() below is given no random_state of its own)
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle the rows, then renumber them from 0
X = data.drop('median_house_value', axis=1)
y = np.log1p(data['median_house_value'])  # Apply log transformation: target is log(1 + value)

# 60% / 20% / 20% train / validation / test split by row position.
# Plain positional slicing is used instead of np.split: on pandas objects
# np.split goes through the deprecated DataFrame/Series `swapaxes` method.
# The cut points are the same ones np.split was given.
train_end = int(0.6 * len(X))
val_end = int(0.8 * len(X))

X_train, X_val, X_test = X.iloc[:train_end], X.iloc[train_end:val_end], X.iloc[val_end:]
y_train, y_val, y_test = y.iloc[:train_end], y.iloc[train_end:val_end], y.iloc[val_end:]


In [None]:
# Question 3: compare two ways of filling missing values (0 vs. training mean)
print(
    "\nQuestion 3:\n"
    "We need to deal with missing values for the column from Q1.\n"
    "Try both options: Fill it with 0 or with the mean of this variable.\n"
    "For each, train a linear regression model without regularization.\n"
    "Use the validation dataset to evaluate the models and compare the RMSE of each option."
)

# The mean is taken from the training part only and reused for validation,
# so no validation information leaks into the fill value.
mean = X_train['total_bedrooms'].mean()
fill_value_by_strategy = {'zero': 0, 'mean': mean}

for strategy, fill_value in fill_value_by_strategy.items():
    X_train_fill = X_train.fillna(fill_value)
    X_val_fill = X_val.fillna(fill_value)

    # Plain least-squares fit, scored by RMSE on the validation part.
    model = LinearRegression().fit(X_train_fill, y_train)
    y_pred = model.predict(X_val_fill)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'RMSE with {strategy}: {round(rmse, 2)}')

In [None]:
# Question 4
print("\nQuestion 4:")
print("Now let's train a regularized linear regression.")
print("For this question, fill the NAs with 0.")
print("Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].")
print("Use RMSE to evaluate the model on the validation dataset.")

# The zero-filled frames do not depend on r, so build them once.
X_train_zero = X_train.fillna(0)
X_val_zero = X_val.fillna(0)

for r in [0, 1e-6, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10]:
    # Previously every iteration fitted an unregularized LinearRegression and
    # `r` was never used, so all printed RMSEs were identical. Ridge's `alpha`
    # is the L2 penalty strength and takes the role of `r` here.
    # r == 0 means "no regularization"; scikit-learn advises against
    # Ridge(alpha=0), so that case uses LinearRegression directly.
    # NOTE(review): Ridge does not penalize the intercept; confirm that this
    # matches the intended definition of `r`.
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train_zero, y_train)
    y_pred = model.predict(X_val_zero)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'RMSE with r={r}: {round(rmse, 2)}')

In [None]:
# Question 5
print("\nQuestion 5:")
print("We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.")
print("Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].")
print("For each seed, do the train/validation/test split with 60%/20%/20% distribution.")
print("Fill the missing values with 0 and train a model without regularization.")
seed_scores = []
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Cut points of the 60/20/20 split; they do not depend on the seed.
train_end = int(0.6 * len(X))
val_end = int(0.8 * len(X))

for seed in seeds:
    # Shuffle X once and align y to it by index, instead of shuffling both
    # independently and relying on the same seed producing the same order.
    X_shuffled = X.sample(frac=1, random_state=seed)
    y_shuffled = y.loc[X_shuffled.index]

    # Positional slicing replaces np.split, which on pandas objects goes
    # through the deprecated `swapaxes` method.
    # Note: this rebinds the notebook-level X_train / X_val / X_test and
    # y_train / y_val / y_test on every iteration.
    X_train = X_shuffled.iloc[:train_end]
    X_val = X_shuffled.iloc[train_end:val_end]
    X_test = X_shuffled.iloc[val_end:]
    y_train = y_shuffled.iloc[:train_end]
    y_val = y_shuffled.iloc[train_end:val_end]
    y_test = y_shuffled.iloc[val_end:]

    model = LinearRegression()
    model.fit(X_train.fillna(0), y_train)
    y_pred = model.predict(X_val.fillna(0))
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    seed_scores.append(rmse)

# Spread of the validation RMSE across the ten seeds.
std_deviation = np.std(seed_scores)
print(f'Standard Deviation of RMSE scores: {round(std_deviation, 3)}')

In [None]:
# Question 6
print("\nQuestion 6:")
print("Split the dataset like previously, use seed 9.")
print("Combine train and validation datasets.")
print("Fill the missing values with 0 and train a model with r=0.001.")

# Shuffle with seed 9 and keep y aligned to X by index.
X_shuffled = X.sample(frac=1, random_state=9)
y_shuffled = y.loc[X_shuffled.index]

# 60/20/20 split by position. Slicing replaces np.split, which on pandas
# objects goes through the deprecated `swapaxes` method.
train_end = int(0.6 * len(X))
val_end = int(0.8 * len(X))
X_train, X_val, X_test = (
    X_shuffled.iloc[:train_end],
    X_shuffled.iloc[train_end:val_end],
    X_shuffled.iloc[val_end:],
)
y_train, y_val, y_test = (
    y_shuffled.iloc[:train_end],
    y_shuffled.iloc[train_end:val_end],
    y_shuffled.iloc[val_end:],
)

# Train on train + validation together (the first 80% of the shuffled rows).
X_train = pd.concat([X_train, X_val])
y_train = pd.concat([y_train, y_val])

# The task asks for r=0.001; the model used to be an unregularized
# LinearRegression. Ridge's `alpha` is the L2 penalty strength.
# NOTE(review): Ridge does not penalize the intercept; confirm that this
# matches the intended definition of `r`.
model = Ridge(alpha=0.001)
model.fit(X_train.fillna(0), y_train)
y_pred_test = model.predict(X_test.fillna(0))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f'RMSE on the test dataset: {round(rmse_test, 2)}')