# Getting Started

In [12]:
import pandas as pd
import numpy as np

In [13]:
import os
import sys
path = os.getcwd()

In [14]:
components_dir = os.path.join(path, '..', 'src/components')

sys.path.append(components_dir)

In [15]:
from data_transformation import CombinedAttributesAdder

In [16]:
filepath="../data/preprocessed_data/train.csv"

In [17]:
df=pd.read_csv(filepath)

In [18]:
housing = df.drop("median_house_value", axis=1)
housing_labels = df["median_house_value"].copy()

## Importing the preprocessing pipeline

In [19]:
import joblib
import os
model_path = os.path.join('D:\\Developer\\End-to-End-Ml-Project', 'models\\preprocessing_pipeline.pkl')
model_path

'D:\\Developer\\End-to-End-Ml-Project\\models\\preprocessing_pipeline.pkl'

In [20]:
full_pipeline = joblib.load(filename=model_path)

In [21]:
housing_prepared=full_pipeline.fit_transform(housing)

# Select and Train Model

In [24]:
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

In [None]:
housing_prepared.head()

In [25]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

Predictions: [ 86642.88874971 306627.76368999 146026.47998315 187496.48996692
 253241.50376191]
Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]


In [26]:
from sklearn.metrics import mean_squared_error
house_predictions=lin_reg.predict(housing_prepared)
lin_mse=mean_squared_error(housing_labels,house_predictions)
np.sqrt(lin_mse)

68474.46303058461

When the rmse is too high that means the model is underfitting.

In [28]:
from sklearn.tree import DecisionTreeRegressor
tree_reg=DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)

In [30]:
tree_house_predictions=tree_reg.predict(housing_prepared)
tree_mse=mean_squared_error(housing_labels,tree_house_predictions)
np.sqrt(tree_mse)

0.0

When the rmse is too low that means the model overfit the data

# Better Evaluation Using cross validation score

In [32]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(tree_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
tree_rmse_scores=np.sqrt(-scores)

In [33]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [35]:
display_scores(tree_rmse_scores)

Scores: [69966.65511691 65722.09566985 69172.01169973 72827.80917599
 65398.2198178  71492.19745257 70474.45000749 69691.84802805
 66185.61703663 70154.98769788]
Mean: 69108.58917028984
Standard deviation: 2394.4549930014255


In [38]:
lin_scores=cross_val_score(lin_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
lin_rmse_scores=np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [71592.77831405 63869.9577383  67556.38442408 68519.24351418
 66746.81768539 72388.71814207 73883.0098497  68663.45479953
 66206.83198372 70132.29220787]
Mean: 68955.94886589076
Standard deviation: 2914.390896274287


In [41]:
from sklearn.ensemble import RandomForestRegressor
forest_reg=RandomForestRegressor()
forest_reg.fit(housing_prepared,housing_labels)

In [42]:
forest_predictions=forest_reg.predict(housing_prepared)
forest_rmse=mean_squared_error(forest_predictions,housing_labels)
np.sqrt(forest_rmse)

18157.897173977326

In [43]:
forest_scores=cross_val_score(forest_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
forese_rmse_scores=np.sqrt(-forest_scores)

NameError: name 'forset_rmse_scores' is not defined

In [46]:
display_scores(forese_rmse_scores)

Scores: [49454.35725818 47470.38490068 45499.97153588 50824.17291494
 46024.32804806 50519.99841876 50431.83647843 48531.44921539
 47007.10690317 52481.92433615]
Mean: 48824.55300096266
Standard deviation: 2175.920189807049


# Fine - Tune - Model

## Gridsearch

In [47]:
from sklearn.model_selection import GridSearchCV

In [48]:
param_grid = [
 {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
 {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
 ]


In [49]:
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
 scoring='neg_mean_squared_error',
return_train_score=True)


In [50]:
grid_search.fit(housing_prepared, housing_labels)

In [53]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [52]:
grid_search.best_estimator_

In [55]:
feature_importances=grid_search.best_estimator_.feature_importances_
feature_importances

array([5.32639686e-02, 6.06254631e-02, 5.22604836e-02, 3.98849355e-02,
       1.41327575e-02, 1.20688519e-02, 1.33764542e-02, 1.24429902e-02,
       3.54339552e-01, 4.47688615e-02, 1.07205786e-01, 6.03942671e-02,
       7.18211280e-03, 1.63114676e-01, 9.59699999e-05, 2.24081019e-03,
       2.60205917e-03])

In [64]:
df.columns

Index(['Unnamed: 0', 'longitude', 'latitude', 'housing_median_age',
       'total_rooms', 'total_bedrooms', 'population', 'households',
       'median_income', 'median_house_value', 'ocean_proximity'],
      dtype='object')

In [65]:
num_attribs=['Unnamed: 0', 'longitude', 'latitude', 'housing_median_age',
       'total_rooms', 'total_bedrooms', 'population', 'households',
       'median_income']

In [66]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3543395523002384, 'median_income'),
 (0.1631146764874074, 'INLAND'),
 (0.10720578582471645, 'pop_per_hhold'),
 (0.060625463126353175, 'longitude'),
 (0.060394267067643254, 'bedrooms_per_room'),
 (0.05326396861711455, 'Unnamed: 0'),
 (0.052260483646787086, 'latitude'),
 (0.04476886152037751, 'rooms_per_hhold'),
 (0.0398849354891151, 'housing_median_age'),
 (0.014132757515898382, 'total_rooms'),
 (0.013376454236253079, 'population'),
 (0.012442990150023465, 'households'),
 (0.012068851863171562, 'total_bedrooms'),
 (0.007182112798055325, '<1H OCEAN'),
 (0.002602059168485796, 'NEAR OCEAN'),
 (0.0022408101884188627, 'NEAR BAY'),
 (9.596999994069738e-05, 'ISLAND')]

In [67]:
final_model=grid_search.best_estimator_

In [68]:
model_path="../models/ml_model.pkl"

In [70]:
import joblib
joblib.dump(final_model,model_path)

final_model = joblib.load(model_path)

# Evaluatiing system on test Dataset

In [72]:
test_filepath="../data/preprocessed_data/test.csv"
test=pd.read_csv(test_filepath)

In [78]:
X_test=test.drop("median_house_value",axis=1)
y_test=test['median_house_value']

In [79]:
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse) 

In [80]:
final_rmse

65218.4648750302