# PS1 - DS 5220: Supervised Machine Learning and Learning Theory

## Omer Seyfeddin Koc

The Python code used to answer the questions about the California house price data is as follows:

In [154]:
# Common imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline


# data visualization
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Fix NumPy's global seed so randomized steps repeat from run to run
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [155]:
# Read the housing CSV (relative path, resolved against the working directory)
# into a pandas DataFrame
housing_csv = 'housing.csv'
df = pd.read_csv(housing_csv)

In [156]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [157]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
raw_summary = df.describe()
raw_summary

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [158]:
# List each column's dtype and non-null count to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [159]:
# Pairwise correlation of the numeric features, then the target's column sorted
# from strongest positive to strongest negative.
# Numeric columns are selected explicitly: `ocean_proximity` is still a string
# column at this point, and pandas >= 2.0 raises on non-numeric data in
# DataFrame.corr() instead of silently skipping it.
correlations = df.select_dtypes(include="number").corr()
correlations["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [160]:
# Compute the correlation matrix over the numeric columns only.
# `ocean_proximity` is still a string column here; pandas >= 2.0 raises on
# non-numeric data in DataFrame.corr(), so it is excluded explicitly.
corr_matrix = df.select_dtypes(include="number").corr()

# Display the full correlation matrix
corr_matrix

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.924664,-0.108197,0.044568,0.069608,0.099773,0.05531,-0.015176,-0.045967
latitude,-0.924664,1.0,0.011173,-0.0361,-0.066983,-0.108785,-0.071035,-0.079809,-0.14416
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.320451,-0.296244,-0.302916,-0.119034,0.105623
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.93038,0.857126,0.918484,0.19805,0.134153
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.049686
population,0.099773,-0.108785,-0.296244,0.857126,0.877747,1.0,0.907222,0.004834,-0.02465
households,0.05531,-0.071035,-0.302916,0.918484,0.979728,0.907222,1.0,0.013033,0.065843
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007723,0.004834,0.013033,1.0,0.688075
median_house_value,-0.045967,-0.14416,0.105623,0.134153,0.049686,-0.02465,0.065843,0.688075,1.0


# Cleaning Data

In [161]:
# Sample of the data set before cleaning (the same preview is repeated after cleaning)
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [162]:
# Count missing values per column
missing_per_column = df.isna().sum()
missing_per_column

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [163]:
# (rows, columns) before any rows are dropped
df.shape

(20640, 10)

In [164]:
# Discard every row that has at least one missing value, then confirm the new size
df = df.dropna(axis=0, how="any")
df.shape

(20433, 10)

# Categorical Cleaning

In [165]:
# Number of rows in each ocean_proximity category
proximity_counts = df['ocean_proximity'].value_counts()
proximity_counts

<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: ocean_proximity, dtype: int64

In [166]:
# Remove the rows labelled 'ISLAND' and report the resulting shape
is_island = df['ocean_proximity'] == 'ISLAND'
island_rows = df.loc[is_island].index
df = df.drop(index=island_rows)
df.shape

(20428, 10)

In [167]:
# One-hot encode the categorical `ocean_proximity` column.
# dtype=int pins the indicator columns to 0/1 integers. The library default
# differs across pandas versions (uint8 before 2.0, bool from 2.0), and bool
# columns are left out of describe()'s default numeric summary.
df = pd.get_dummies(df, columns=['ocean_proximity'], dtype=int)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0


In [168]:
# Summary statistics after dropping rows and one-hot encoding
cleaned_summary = df.describe()
cleaned_summary

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
count,20428.0,20428.0,20428.0,20428.0,20428.0,20428.0,20428.0,20428.0,20428.0,20428.0,20428.0,20428.0,20428.0
mean,-119.570986,35.633778,28.629724,2636.764147,537.899305,1425.13222,499.488007,3.871437,206821.928432,0.442236,0.317995,0.111122,0.128647
std,2.003732,2.136312,12.590156,2185.451409,421.42597,1133.277418,382.326831,1.899432,115412.337452,0.496664,0.465709,0.314291,0.334817
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,0.0,0.0,0.0,0.0
25%,-121.8,33.93,18.0,1450.0,296.0,788.0,280.0,2.5634,119475.0,0.0,0.0,0.0,0.0
50%,-118.5,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5375,179700.0,0.0,0.0,0.0,0.0
75%,-118.01,37.72,37.0,3143.25,647.0,1723.0,604.0,4.744075,264700.0,1.0,1.0,0.0,0.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0,1.0,1.0,1.0,1.0


In [169]:
# (rows, columns) after row removal and one-hot encoding
df.shape

(20428, 13)

In [170]:
# Sample of the data set after cleaning and one-hot encoding
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0


In [171]:
# Derived ratio features:
#   avgroom          = total_rooms / households
#   roomsperbedrooms = total_rooms / total_bedrooms
rooms = df["total_rooms"]
df["avgroom"] = rooms.div(df["households"])
df["roomsperbedrooms"] = rooms.div(df["total_bedrooms"])

In [172]:
# Correlation of every column (including the new ratio features) with the target,
# listed from most positive to most negative
corr_matrix = df.corr()
target_corr = corr_matrix.loc[:, "median_house_value"]
target_corr.sort_values(ascending=False)

median_house_value            1.000000
median_income                 0.688848
roomsperbedrooms              0.384519
ocean_proximity_<1H OCEAN     0.258051
ocean_proximity_NEAR BAY      0.160710
avgroom                       0.151365
ocean_proximity_NEAR OCEAN    0.140567
total_rooms                   0.133516
housing_median_age            0.106077
households                    0.065122
total_bedrooms                0.049792
population                   -0.025069
longitude                    -0.045642
latitude                     -0.144312
ocean_proximity_INLAND       -0.484721
Name: median_house_value, dtype: float64

In [173]:
# Separate the target column (Y) from the feature matrix (X)
Y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

In [174]:
# 80/20 hold-out split. random_state is fixed (matching the other splits in this
# notebook) so the partition does not depend on how much of NumPy's global random
# stream earlier cells have consumed or on cell execution order.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [175]:
# Shapes of the four partitions, in order: X_train, X_test, Y_train, Y_test
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((16342, 14), (4086, 14), (16342,), (4086,))

In [176]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Build features/target and make a seeded 80/20 train/test split
target_column = "median_house_value"
y = df[target_column]
X = df.drop(columns=target_column)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [182]:
# Fit ordinary least squares on the training partition; fit() returns the
# estimator itself, which is then displayed as the cell output
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [178]:
# Predict on both partitions with the fitted linear model
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

In [179]:
# Compute RMSE and R2 score for the training set.
# RMSE is taken as sqrt(MSE) rather than mean_squared_error(..., squared=False):
# the `squared` flag is deprecated in newer scikit-learn releases, and the
# decision-tree cells in this notebook already use the np.sqrt form.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
r2_train = r2_score(y_train, y_train_pred)

In [180]:
# Compute RMSE and R2 score for the test set.
# RMSE is taken as sqrt(MSE) rather than mean_squared_error(..., squared=False):
# the `squared` flag is deprecated in newer scikit-learn releases, and the
# decision-tree cells in this notebook already use the np.sqrt form.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2_test = r2_score(y_test, y_test_pred)

In [181]:
# Report the linear-regression metrics, rounded to two decimals
linear_report = (
    ("Training set RMSE: {:.2f}", rmse_train),
    ("Training set R2 score: {:.2f}", r2_train),
    ("Test set RMSE: {:.2f}", rmse_test),
    ("Test set R2 score: {:.2f}", r2_test),
)
for template, value in linear_report:
    print(template.format(value))

Training set RMSE: 67977.93
Training set R2 score: 0.65
Test set RMSE: 69511.31
Test set R2 score: 0.64


# Ridge Regression

In [191]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [192]:
# Baseline ridge fit with alpha=1; the fitted estimator is shown as the cell output
ridge = Ridge(alpha=1).fit(X_train, y_train)
ridge

Ridge(alpha=1)

In [201]:
# Sweep the ridge penalty strength over several orders of magnitude and keep the
# alpha with the lowest test RMSE.
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

best_alpha = None
best_rmse = float('inf')
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    # NOTE: these names are the same ones the linear-regression cells assign,
    # so after this loop they hold the values from the last alpha tried.
    y_train_pred = ridge.predict(X_train)
    y_test_pred = ridge.predict(X_test)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # flag is deprecated in newer scikit-learn releases.
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    # NOTE(review): alpha is selected on the test partition, so best_rmse is an
    # optimistic estimate of generalization error; selecting on a validation
    # split or with cross-validation on the training data would avoid that.
    if rmse_test < best_rmse:
        best_rmse = rmse_test
        best_alpha = alpha
    print(f'Alpha: {alpha:.3f}, RMSE (train): {rmse_train:.3f}, RMSE (test): {rmse_test:.3f}')
print(f'Best alpha: {best_alpha:.3f}, Best RMSE: {best_rmse:.3f}')

Alpha: 0.001, RMSE (train): 67977.934, RMSE (test): 69511.305
Alpha: 0.010, RMSE (train): 67977.934, RMSE (test): 69511.305
Alpha: 0.100, RMSE (train): 67977.934, RMSE (test): 69511.298
Alpha: 1.000, RMSE (train): 67977.934, RMSE (test): 69511.229
Alpha: 10.000, RMSE (train): 67977.953, RMSE (test): 69510.565
Alpha: 100.000, RMSE (train): 67979.764, RMSE (test): 69505.948
Alpha: 1000.000, RMSE (train): 68119.554, RMSE (test): 69606.523
Best alpha: 100.000, Best RMSE: 69505.948


In [195]:
# Refit ridge with the alpha selected by the sweep; the fitted estimator is
# shown as the cell output
ridge = Ridge(alpha=best_alpha).fit(X_train, y_train)
ridge

Ridge(alpha=100)

In [202]:
# Repeat the sweep's winning alpha and its test RMSE
print('Best alpha: {:.3f}, Best RMSE: {:.3f}'.format(best_alpha, best_rmse))

Best alpha: 100.000, Best RMSE: 69505.948


# Decision Tree

In [183]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [186]:
# Recreate the seeded 80/20 split (same arguments as the split used for the linear model)
partitions = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = partitions

In [187]:
# Train a decision tree with default hyperparameters (seeded for repeatability);
# the fitted estimator is shown as the cell output
regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
regressor

DecisionTreeRegressor(random_state=42)

In [188]:
# Score the tree on the data it was trained on (RMSE and R2)
y_train_pred = regressor.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

In [189]:
# Score the tree on the held-out test partition (RMSE and R2)
y_test_pred = regressor.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)

In [190]:
# Report the decision-tree metrics at full precision
tree_report = (
    ("Decision Tree Regressor - Training set RMSE: ", train_rmse),
    ("Decision Tree Regressor - Training set R2 score: ", train_r2),
    ("Decision Tree Regressor - Test set RMSE: ", test_rmse),
    ("Decision Tree Regressor - Test set R2 score: ", test_r2),
)
for label, value in tree_report:
    print(label, value)

Decision Tree Regressor - Training set RMSE:  0.0
Decision Tree Regressor - Training set R2 score:  1.0
Decision Tree Regressor - Test set RMSE:  67249.75321575013
Decision Tree Regressor - Test set R2 score:  0.6589107249332604
