<a href="https://colab.research.google.com/github/nakulnaiwal/ML-Pipeline/blob/main/Housing_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries

import numpy as np
import pandas as pd


In [None]:
#importing essentials

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the housing data from the working directory and preview the first rows.

housing = pd.read_csv(filepath_or_buffer="Housing.csv")
housing.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [None]:
# Separate the target (y, the sale price) from the features (x, everything else).

y = housing["price"]
x = housing.drop(columns=["price"])

In [None]:
# Separate numeric and categorical features for the column transformer.
#
# Select by dtype family instead of by exact dtype name. Any column that ends
# up in neither list is silently dropped by the ColumnTransformer (its
# `remainder` defaults to "drop"), so hard-coding 'int64'/'float64' and
# 'object' would lose e.g. int32/float32 columns or strings stored with a
# non-object dtype. Taking the complement of the numeric selection guarantees
# that every column lands in exactly one group.

num_feat = x.select_dtypes(include="number").columns
cat_feat = x.select_dtypes(exclude="number").columns

In [None]:
# Build one preprocessing pipeline per feature group.

# Numeric features: fill missing values with the column mean, then standardize.
num_pipe = Pipeline([
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes the encoder emit an all-zero
# row for a category it did not see during fit instead of raising, so a rare
# category missing from one training fold (or appearing only in new data at
# predict time) does not abort cross-validation or prediction.
cat_pipe = Pipeline([
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Route each feature group through its own preprocessing pipeline.
# Columns listed in neither group are dropped (ColumnTransformer's default).

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_feat),
        ('cat', cat_pipe, cat_feat),
    ],
)

In [None]:
# Create the full pipeline: preprocessing followed by the regressor.
# RandomForestRegressor is already imported in the imports cell at the top of
# the notebook, so it is not re-imported here.

pipe = Pipeline([('preprocessor',preprocessor),
                 # Fixed seed: without it every run grows a different forest,
                 # so scores and the selected hyperparameters are not repeatable.
                 ('model',RandomForestRegressor(random_state=42))])

In [None]:
# Apply 5-fold cross-validation (default regressor score, i.e. R^2).
#
# Passing an integer `cv` gives a regressor plain KFold with no shuffling.
# The preview of Housing.csv shows rows ordered by descending price, so
# contiguous folds test on a price band the model never trained on, which
# produces a strongly negative R^2. Shuffle the rows before splitting, with a
# fixed seed so the folds are the same on every run.

scores = cross_val_score(
    pipe,x,y,
    cv=KFold(n_splits=5,shuffle=True,random_state=42),
)
print("mean scores",scores.mean())

mean scores -11.77538141342238


In [None]:
# Hyperparameter tuning: grid search over forest size and maximum tree depth.
# The 'model__' prefix addresses the 'model' step of the pipeline.

param_grid = {
    'model__n_estimators':[100,200],
    'model__max_depth':[None,10],
}

# Use shuffled, seeded folds instead of an integer `cv`. An integer gives
# unshuffled KFold for a regressor, and the data preview shows rows ordered by
# descending price, so each contiguous test fold would cover a price band
# absent from its training folds and every candidate would score a large
# negative R^2, making the comparison between candidates meaningless.
grid = GridSearchCV(
    pipe,param_grid,
    cv=KFold(n_splits=5,shuffle=True,random_state=42),
)
grid.fit(x,y)

print("bestparam",grid.best_params_)
print("bestscores",grid.best_score_)

bestparam {'model__max_depth': 10, 'model__n_estimators': 100}
bestscores -11.61665750788173


In [None]:
# Keep the best pipeline found by the grid search for later use. The search
# was created without a `refit` argument, so (per GridSearchCV's default) this
# is the pipeline refit on all of x, y with the best parameter combination.
bestmodel = grid.best_estimator_