In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style = 'whitegrid'

In [3]:
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing() #assigning the dataset to a variable
df = pd.DataFrame(data.data, columns=data.feature_names) #dataframing the imported dataset
df["target"] = pd.Series(data.target) #converting the target column and titling it "target"

print(df.shape)
df.head()

(20640, 9)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["target"], axis=1),
    df["target"],
    test_size=0.2,
    random_state=101
)

print(
    "*Train set: ",
    X_train.shape,
    y_train.shape,
    "\n*Test set: ",
    X_test.shape,
    y_test.shape
)

*Train set:  (16512, 8) (16512,) 
*Test set:  (4128, 8) (4128,)


In [17]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LinearRegression

def pipeline_linear_regression():
    pipeline = Pipeline(
        [
            ("feat_scaling", StandardScaler()),
            ("feat_selection", SelectFromModel(LinearRegression())),
            ("ml_model", LinearRegression())
        ]
    )
    return pipeline

pipeline_linear_regression()


0,1,2
,steps,"[('feat_scaling', ...), ('feat_selection', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,estimator,LinearRegression()
,threshold,
,prefit,False
,norm_order,1
,max_features,
,importance_getter,'auto'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [18]:
pipeline = pipeline_linear_regression()
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('feat_scaling', ...), ('feat_selection', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,estimator,LinearRegression()
,threshold,
,prefit,False
,norm_order,1
,max_features,
,importance_getter,'auto'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [19]:
def linear_model_coefficients(ml_model, columns):
    """
    This helps print the coefficients of the linear regression model.
    
    model: the trained linear model object
    
    columns: the column names corresponding to the coefficients
    
    """
    print(f"* Interception: {ml_model.intercept_}")
    coeff_df = pd.DataFrame(ml_model.coef_, columns, columns=["Coefficient"]).sort_values(
        ["Coefficient"], key=abs, ascending=False
        )

    print("* Coefficients")
    print(coeff_df)


In [22]:
pipeline["ml_model"].get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'positive': False,
 'tol': 1e-06}

In [25]:
pipeline["feat_selection"].get_support()

array([ True, False, False, False, False, False,  True,  True])

In [26]:
X_train.columns[pipeline["feat_selection"].get_support()]

Index(['MedInc', 'Latitude', 'Longitude'], dtype='object')

In [27]:
linear_model_coefficients(
    ml_model = pipeline["ml_model"],
    columns = X_train.columns[pipeline["feat_selection"].get_support()]
)

* Interception: 2.0660510895106525
* Coefficients
           Coefficient
Latitude     -1.038871
Longitude    -1.003760
MedInc        0.693899


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def regression_performance(X_train, y_train, X_test, y_test, pipeline):
    print("Model Evaluation \n")
    print("*Train Set")
    regression_evaluation(X_train, y_train, pipeline)
    print("* Test Set")
    regression_evaluation(X_test, y_test, pipeline)

def regression_evaluation(X, y, pipeline):
    prediction = pipeline.predict(X)
    print("R2 Score: ", r2_score(y, prediction), 3)
    print("Mean Absolute Error: ", mean_absolute_error(y, prediction),3)
    print("Mean Squared Error: ", mean_squared_error(y, prediction), 3)
    print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y, prediction)), 3)
    print("\n")

def regression_evaluation_plots(X_train, y_train, X_test, y_test, pipeline, 
                                alpha_scatter = 0.5):
    pred_train = pipeline.predict(X_train)
    pred_test = pipeline.predict(X_test)

    fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 6))
    sns.scatterplot(x = y_train, y = pred_train, alpha = alpha_scatter, 
                    ax = axes[0])
    sns.lineplot(x = y_test, y = y_test, color = "red", ax = axes[0])
    axes[0].set_xlabel("Actual")
    axes[0].set_ylabel("Predicted")
    axes[0].set_title("Train Set")

    sns.scatterplot(x = y_test, y = pred_test, alpha = alpha_scatter, 
                    ax = axes[1])
    sns.lineplot(x = y_test, y = y_test, color = "red", ax = axes[1])
    axes[1].set_xlabel("Actual")
    axes[1].set_ylabel("Predicted")
    axes[1].set_title("Test Set")

    plt.show()

regression_performance(X_train, y_train, X_test, y_test, pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, pipeline, 
                            alpha_scatter = 0.5)


Model Evaluation 

*Train Set
R2 Score:  0.5871179233179742 3
Mean Absolute Error:  0.5457147032791132 3
Mean Squared Error:  0.5476623630599431 3
Root Mean Squared Error:  0.7400421360030408 3


* Test Set
R2 Score:  0.5729574907796264 3
Mean Absolute Error:  0.5549245599238322 3
Mean Squared Error:  0.5773080434275145 3
Root Mean Squared Error:  0.7598078990294287 3


