In [65]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold

### Dataset info
1. CRIM: per capita crime rate by town
2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
3. INDUS: proportion of non-retail business acres per town
4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
5. NOX: nitric oxides concentration (parts per 10 million)
6. RM: average number of rooms per dwelling
7. AGE: proportion of owner-occupied units built prior to 1940
8. DIS: weighted distances to five Boston employment centers
9. RAD: index of accessibility to radial highways
10. TAX: full-value property-tax rate per \$10,000
11. PTRATIO: pupil-teacher ratio by town
12. B: $1000(B_k - 0.63)^2$, where $B_k$ is the proportion of blacks by town
13. LSTAT: % lower status of the population
14. MEDV: median value of owner-occupied homes in \$1000s
We can see that the input attributes have a mixture of units.

In [48]:
# Read the dataset and name the columns (the file is read with header=None,
# i.e. it carries no header row, and fields are whitespace-separated).
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# Raw string: "\h" is not a valid escape sequence, so the non-raw literal
# triggers a SyntaxWarning on recent Python versions. The path value itself
# is unchanged.
# NOTE: this is a machine-specific absolute path; adjust it to wherever
# housing.csv lives on your machine.
DATA_PATH = r"C:\housing.csv"
df = pd.read_csv(DATA_PATH, header=None, sep=r"\s+", names=column_names)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [49]:
# Summarize column dtypes and non-null counts to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [53]:
# Predictor columns fed to the ML models.
feature_cols = [
    "RM",
    "DIS",
    "RAD",
    "TAX",
    "B",
    "LSTAT",
]
# Target column. Kept as a one-element list, so df[y] selects a
# single-column DataFrame rather than a Series.
y = ["MEDV"]

In [54]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df[y],
    test_size=0.20,
    random_state=41,
)

In [59]:
# Sweep Lasso over several regularization strengths (alpha) and score each
# fit on the held-out test set to see which strength suits this dataset best.
alpha_rates = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]
lasso_mse = []
lasso_mae = []
for a in alpha_rates:
    # fit() returns the fitted estimator, so construction and fitting chain.
    model_lasso = Lasso(alpha=a).fit(X_train, y_train)
    y_pred_lasso = model_lasso.predict(X_test)

    # Score this alpha on the test split and record both metrics.
    cur_mae = mean_absolute_error(y_test, y_pred_lasso)
    cur_mse = mean_squared_error(y_test, y_pred_lasso)
    lasso_mae.append(cur_mae)
    lasso_mse.append(cur_mse)

    print(f"Lasso model with alpha={a} has mae={cur_mae} and mse={cur_mse}")

Lasso model with alpha=1e-05 has mae=3.4952714496801605 and mse=27.38046878970587
Lasso model with alpha=0.0001 has mae=3.495263958150359 and mse=27.37998911150273
Lasso model with alpha=0.001 has mae=3.495189164254598 and mse=27.37519581360647
Lasso model with alpha=0.01 has mae=3.4944408478006626 and mse=27.327490154574573
Lasso model with alpha=0.1 has mae=3.4870086746646667 and mse=26.8739885763198
Lasso model with alpha=1 has mae=3.6003297150898357 and mse=24.64581298163085


### Best Lasso model has alpha=0.1 (lowest MAE; note that alpha=1 gives the lowest MSE)

In [64]:
# Sweep Ridge over the same regularization strengths (alpha_rates, defined in
# the Lasso cell) and score each fit on the held-out test set.
ridge_mse = []
ridge_mae = []
for a in alpha_rates:
    # fit() returns the fitted estimator, so construction and fitting chain.
    model_ridge = Ridge(alpha=a).fit(X_train, y_train)
    y_pred_ridge = model_ridge.predict(X_test)

    # Score this alpha on the test split and record both metrics.
    cur_mae = mean_absolute_error(y_test, y_pred_ridge)
    cur_mse = mean_squared_error(y_test, y_pred_ridge)
    ridge_mae.append(cur_mae)
    ridge_mse.append(cur_mse)

    print(f"Ridge model with alpha={a} has mae={cur_mae} and mse={cur_mse}")

Ridge model with alpha=1e-05 has mae=3.495272282032989 and mse=27.38052155213164
Ridge model with alpha=0.0001 has mae=3.4952722852045532 and mse=27.38051674654868
Ridge model with alpha=0.001 has mae=3.4952723169188333 and mse=27.380468691493697
Ridge model with alpha=0.01 has mae=3.4952726339264215 and mse=27.379988218393645
Ridge model with alpha=0.1 has mae=3.4952757904936838 and mse=27.375191222667564
Ridge model with alpha=1 has mae=3.495306016866989 and mse=27.327985177460764


### Best Ridge model has alpha=1 (lowest MSE; MAE is nearly identical across all alphas)

In [79]:
# Cross-validate the two selected models with 5-fold KFold, to check that the
# hold-out results are not an artifact of one particular train/test split.
model_lasso = Lasso(alpha=0.1)
model_ridge = Ridge(alpha=1)
# Default KFold settings: no shuffling, so each fold is a contiguous row range.
kf = KFold(n_splits=5)
mse_lasso_lst = []
mse_ridge_lst = []

# Select the columns once. KFold.split yields *positional* indices, so folds
# are taken with .iloc rather than label-based .loc (which only happens to
# work while the frame has a default RangeIndex).
X_all = df[feature_cols]
y_all = df[y]

for train_index, test_index in kf.split(X_all):
    # Fold-local names: do not overwrite the X_train/X_test/y_train/y_test
    # hold-out split created by train_test_split, otherwise re-running the
    # alpha sweep cells would silently use the last fold instead.
    X_fold_train, X_fold_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_fold_train, y_fold_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model_lasso.fit(X_fold_train, y_fold_train)
    model_ridge.fit(X_fold_train, y_fold_train)

    pred_values_lasso = model_lasso.predict(X_fold_test)
    pred_values_ridge = model_ridge.predict(X_fold_test)

    # sklearn metrics take (y_true, y_pred), in that order.
    mse_lasso = mean_squared_error(y_fold_test, pred_values_lasso)
    mse_ridge = mean_squared_error(y_fold_test, pred_values_ridge)
    mse_lasso_lst.append(mse_lasso)
    mse_ridge_lst.append(mse_ridge)

def mean_l(x):
    """Return the arithmetic mean of the values in ``x``.

    ``x`` must be a sized collection of numbers (it is passed to both
    ``sum`` and ``len``). An empty ``x`` raises ``ZeroDivisionError``.
    """
    total, count = sum(x), len(x)
    return total / count
# Report each model's MSE averaged over the cross-validation folds
print(f"Ridge mean mse={mean_l(mse_ridge_lst)}, Lasso mean mse={mean_l(mse_lasso_lst)}")

Ridge mean mse=40.41563673537587, Lasso mean mse=40.41220856590756


### Lasso is marginally better on this dataset (mean 5-fold MSE of 40.412 vs. 40.416 for Ridge — a negligible difference)