# ML Pipeline

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor
from sklearn.metrics import mean_squared_error

In [30]:
df=pd.read_csv("HousingData.csv")

In [31]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.60762,11.20751,11.124032,0.06917,0.554695,6.284634,68.637747,3.795043,9.549407,408.237154,18.455534,356.674032,12.750652,22.532806
std,8.591007,23.383171,6.864808,0.253994,0.115878,0.702617,28.16229,2.10571,8.707259,168.537116,2.164946,91.294864,7.285127,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082757,0.0,5.19,0.0,0.449,5.8855,45.175,2.100175,4.0,279.0,17.4,375.3775,7.0375,17.025
50%,0.266005,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.395,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,93.975,5.188425,24.0,666.0,20.2,396.225,16.93,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,43.0,50.0


In [32]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [33]:
X=df.drop(['MEDV'], axis=1)
y=df['MEDV']

In [34]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3, random_state=4)

In [35]:
X_cols=X_train.columns


# Scaling the data

In [36]:
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X_train)
scaled_df=pd.DataFrame(X_scaled, columns=X_cols)
scaled_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.429044,-0.463008,-0.954695,-0.231455,-0.919581,0.2151,-0.745291,0.454022,-0.764468,-0.976012,0.00546,0.441889,-0.45175
1,-0.429897,3.049085,-1.329987,-0.231455,-1.227311,-0.883652,-1.687061,3.163428,-0.651568,-0.464548,1.616046,0.287498,-0.666204
2,-0.388841,-0.463008,-0.705955,4.320494,-0.423795,-0.125423,0.817111,-0.353904,-0.199967,-0.623278,-0.500725,0.423713,1.165531
3,-0.253177,-0.463008,-0.423759,-0.231455,-0.158805,-0.228336,1.019176,-0.021755,-0.651568,-0.623278,1.155878,-1.185126,1.012545
4,-0.369633,0.415015,-1.030335,-0.231455,0.157472,3.102729,-0.059711,-0.646202,-0.538668,-0.876071,-2.525462,0.306551,-0.749527


In [37]:
X_train.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
84,0.05059,0.0,4.49,0,0.449,6.389,48.0,4.7794,3,247,18.5,396.9,9.62
354,0.04301,80.0,1.91,0,0.413,5.663,21.9,10.5857,4,334,22.0,382.8,8.05
221,0.40771,0.0,6.2,1,0.507,6.164,91.3,3.048,8,307,17.4,395.24,21.46
34,1.61282,0.0,8.14,0,0.538,6.096,96.9,3.7598,4,307,21.0,248.31,20.34
267,0.57834,20.0,3.97,0,0.575,8.297,67.0,2.4216,5,264,13.0,384.54,7.44


### Fit the model on scaled data

In [38]:
ridge = Ridge()
ridge.fit(scaled_df, y_train)

Ridge()

In [39]:
ridge.predict(X_test)

array([ -451.76101499,  -284.28331155,  -451.16704345,  -357.87437699,
        -347.35445783,  -351.27344122,   -55.93308299, -1161.98579476,
       -1382.4541158 ,  -674.11708122,  -426.92362212,  -148.50872732,
        -407.6407934 ,  -622.75083233,  -370.60398878, -1552.1095641 ,
        -379.61468594, -1264.50448916, -1383.75410904,  -495.84564044,
       -1621.11237517,  -736.59083498,  -155.05928158,  -559.71248785,
        -494.07893227,  -562.89118318,  -358.03970398,  -704.90546119,
       -1431.46828022,  -579.84917666, -1466.47950192,  -295.59034758,
        -244.80056884,  -126.81477686, -1172.14141627, -1182.91692606,
        -270.29996927,  -340.3455672 ,  -277.39689832,  -341.59098213,
        -329.12603249,  -597.33417891, -1241.6907701 ,  -185.08634975,
        -361.30708078,  -337.17247422,  -117.93839288,  -248.78501618,
       -1155.35216958,  -302.14867099,  -102.03365322,  -655.20366013,
       -1201.8425542 ,  -305.65920038,  -832.85915656,  -652.22390312,
      

In [40]:
np.sqrt(mean_squared_error(ridge.predict(X_test), y_test))

751.5436370779464

In [41]:
# while scaling data, ALWAYS do fit_transform or fit transform on train data, and predict on transformed test data
ridge.predict(scaler.transform(X_test))

array([11.84364148, 26.69638239, 17.57180703, 19.70629745, 35.5782233 ,
       25.03541998, 31.15452945, 20.17337945, 20.64013111, 24.76627016,
       28.44403357, 28.22193486, 19.08801798, 33.26757033, 21.48141609,
       15.29253593, 20.97007322, 12.331847  , 11.51530049, 13.88897421,
        5.191639  , 17.45722113, 20.34541602, 22.54391485, 16.39583163,
       20.46045494, 19.45308763, 14.43963696, 20.95637191, 17.68515734,
       14.74730084, 23.54528296, 33.82429199, 21.83539322, 17.71711357,
       20.63757059, 30.24062577, 34.01576488, 23.71094562, 24.36401783,
       36.29900561, 31.38084983, 19.73728184, 31.48803004, 34.71219154,
       25.4586778 , 39.97013845, 17.88809719, 20.2080626 , 28.45502604,
       33.40474557, 25.27462953, 18.73123701, 27.36165731, 13.33153568,
       23.21489274, 24.46043704, 33.19312841, 17.01235828, 37.91632261,
       15.87312833, 19.54744941, 31.67674653, 15.29887839, 38.32703214,
       27.47200059, 34.36793419,  9.99640582, 19.61814175, 22.24

In [42]:
np.sqrt(mean_squared_error(ridge.predict(scaler.transform(X_test)), y_test))

5.533076763080018

# ML Pipeline

In [43]:
from sklearn.pipeline import Pipeline

In [44]:
# the pipeline takes care of the scaling part internally
pipeline_XGB=Pipeline(
[
    ('scaler', StandardScaler()),
    ('model', XGBRFRegressor())
]
)


In [45]:
pipeline_XGB.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 XGBRFRegressor(base_score=0.5, booster='gbtree',
                                colsample_bylevel=1, colsample_bytree=1,
                                enable_categorical=False, gamma=0, gpu_id=-1,
                                importance_type=None,
                                interaction_constraints='', max_delta_step=0,
                                max_depth=6, min_child_weight=1, missing=nan,
                                monotone_constraints='()', n_estimators=100,
                                n_jobs=8, num_parallel_tree=100,
                                objective='reg:squarederror', predictor='auto',
                                random_state=0, reg_alpha=0, scale_pos_weight=1,
                                tree_method='exact', validate_parameters=1,
                                verbosity=None))])

In [46]:
pipeline_XGB.score(X_test, y_test)

0.825491117344465

In [47]:
pipeline_XGB.predict(X_test)

array([17.499262 , 24.489866 , 20.394697 , 16.976543 , 44.765247 ,
       23.172613 , 34.925148 , 17.030613 , 16.612011 , 15.984259 ,
       27.867796 , 25.40899  , 20.696114 , 23.950773 , 21.420769 ,
       13.875251 , 20.709328 , 11.8970785, 14.598286 , 15.4360695,
        7.942722 , 15.667612 , 20.19945  , 20.95823  , 21.134842 ,
       20.999983 , 17.411026 , 15.84741  , 20.99398  , 18.897257 ,
       13.970251 , 22.995417 , 33.33474  , 22.576262 , 14.667857 ,
       13.357585 , 30.320858 , 42.94158  , 23.333706 , 22.746521 ,
       47.00133  , 29.336946 , 13.24968  , 28.728289 , 28.734552 ,
       20.822725 , 49.434635 , 18.03025  , 21.020653 , 22.303976 ,
       29.035036 , 21.456371 , 12.406068 , 26.777029 , 15.421854 ,
       21.069775 , 25.34574  , 30.114408 , 20.248762 , 29.02377  ,
       17.640156 , 20.974066 , 27.56376  , 20.686312 , 47.139843 ,
       26.842985 , 28.958607 ,  8.926995 , 19.606512 , 21.575066 ,
       21.230038 , 21.047062 , 25.804817 , 26.599344 , 17.4102

# Hyperparameter tuning using Gridserach on single model

In [49]:
from sklearn.model_selection import GridSearchCV

In [50]:
pipeline_ridge = Pipeline(
[
    ('scaler', StandardScaler()),
    ('model', Ridge())
]
)

In [52]:
cv = GridSearchCV(
    pipeline_ridge, 
    param_grid = {
        "model__alpha" : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        
    },
    cv=4
)

In [54]:
cv.fit(X_train, y_train)

GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', Ridge())]),
             param_grid={'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]})

In [55]:
cv.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()), ('model', Ridge(alpha=1))])

In [56]:
cv.score(X_test, y_test)

0.7068098326584312

In [57]:
np.sqrt(mean_squared_error(cv.best_estimator_.predict(X_test), y_test))

5.533076763080018

# Hyperparameter tuning using Gridserach on multiple models

In [58]:
#initialize the pipeline
pipe = Pipeline(
[
    ("regressor" , RandomForestRegressor())
]
)

# set up the parameters for all models
grid_param = [
    {
        "regressor" : [RandomForestRegressor()],
        "regressor__n_estimators" : [10, 100, 150],
        "regressor__max_depth" : [5, 10, 15, 20]
        
    },
    
    {
        "regressor" : [XGBRFRegressor()],
        "regressor__min_child_weight" : [1, 5, 10],
        "regressor__gamma" : [0.5, 1, 1.5, 2, 5],
        "regressor__subsample" : [0.6, 0.8, 1.0],
        "regressor__colsample_bytree" : [0.6, 0.8, 1.0],
        "regressor__max_depth" : [3, 4, 5]
        
    },
    
    {
        "regressor" : [Ridge()],
        "regressor__alpha" : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }
    
    
]

In [59]:
gridSearch = GridSearchCV(pipe, grid_param, cv=4)
bestModel = gridSearch.fit(X_train, y_train)

In [60]:
bestModel.best_estimator_

Pipeline(steps=[('regressor', RandomForestRegressor(max_depth=10))])

In [61]:
bestModel.score(X_test, y_test)

0.8223721098192155

In [62]:
np.sqrt(mean_squared_error(cv.best_estimator_.predict(X_test), y_test))

5.533076763080018