In [None]:
# Load the housing data set into a DataFrame.
# The CSV location can be overridden with the HOUSING_CSV environment variable
# so the notebook is not tied to one machine; the original path is the fallback.
import os
import pandas as pd
housing_csv_path = os.environ.get("HOUSING_CSV", r'C:\Users\HP\Downloads/housing.csv')
housing = pd.read_csv(housing_csv_path)
housing.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
housing.info()

In [None]:
# Summary statistics for the columns describe() covers by default.
housing.describe()

In [None]:
# How many rows fall into each ocean_proximity value.
housing["ocean_proximity"].value_counts()

In [None]:
# How many rows share each distinct population value.
housing["population"].value_counts()

In [None]:
#Data Visualization
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of the frame's columns, 50 bins each, on a 20x15 figure.
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Purely random 80/20 split of the full frame (seeded for repeatability).
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
# Row counts of the two subsets, one per line.
print(len(train_set), len(test_set), sep="\n")

In [None]:
# Continuous -> categorical: bucket median_income into five labelled bins.
import numpy as np
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)
# Distribution of the resulting categories.
housing["income_cat"].hist()

In [None]:
# Bucket housing_median_age into five labelled categories.
# The lowest bin edge is 0 rather than 10: pd.cut leaves values outside the
# bin range as NaN, so any age of 10 or less was previously left uncategorised.
housing["housing_median_age_cat"] = pd.cut(
    housing["housing_median_age"],
    bins=[0, 20, 30, 40, 50, np.inf],
    labels=[1, 2, 3, 4, 5],
)
housing["housing_median_age_cat"].hist()

In [None]:
# Stratified 80/20 split so both subsets keep the income_cat proportions.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row indices, so select with .iloc; .loc only
    # gives the same rows when the frame happens to have a default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
#copying the train data set
# Explore on a copy so the changes below do not touch strat_train_set.
housing = strat_train_set.copy()

In [None]:
#individual visualization(population, median_house_value)
# Longitude/latitude scatter: marker size is population/100,
# marker colour is median_house_value on the "jet" colormap.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
 s=housing["population"]/100, label="population", figsize=(10,7),
 c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
)
plt.legend()

In [None]:
#(median_income, households)
# Same map with marker size = median_income and colour = households
# (no alpha is set for this plot).
housing.plot(kind="scatter", x="longitude", y="latitude",
 s=housing["median_income"], label="median_income", figsize=(10,7),
 c="households", cmap=plt.get_cmap("jet"), colorbar=True,
)
plt.legend()

In [None]:
#finding the correlations
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
corr_matrix["population"].sort_values(ascending=False)

In [None]:
corr_matrix["median_income"].sort_values(ascending=False)

In [None]:
#visualizing some scatter matrix
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for the four selected columns.
attributes = ["median_house_value", "median_income", "total_rooms",
 "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
#using two new attribute (population, households)
from pandas.plotting import scatter_matrix
# Rebinds `attributes`: housing_median_age is replaced by population and households.
attributes = ["median_house_value", "median_income", "total_rooms",
 "population","households"]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# median_income (x) against median_house_value (y).
housing.plot(kind="scatter", x="median_income", y="median_house_value",
 alpha=0.1)

In [None]:
# The remaining plots put median_house_value on the x axis.
housing.plot(kind="scatter", x="median_house_value", y="total_rooms",
 alpha=0.4)

In [None]:
housing.plot(kind="scatter", x="median_house_value", y="total_bedrooms",
 alpha=0.4)

In [None]:
housing.plot(kind="scatter", x="median_house_value", y="housing_median_age",
 alpha=0.1)

In [None]:
#preparing some more attributes for finding further correlation
# Ratio features built from the raw count columns.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]
housing["bedrooms_per_households"] = housing["total_bedrooms"] / housing["households"]
# Numeric columns only: DataFrame.corr() on pandas >= 2.0 raises on the
# ocean_proximity text column instead of silently skipping it.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)


In [None]:
# median_house_value (x) against the derived rooms_per_household ratio (y).
housing.plot(kind="scatter", x="median_house_value", y="rooms_per_household",
 alpha=0.1)

In [None]:
# median_house_value (x) against the derived bedrooms_per_room ratio (y).
housing.plot(kind="scatter", x="median_house_value", y="bedrooms_per_room",
 alpha=0.1)

In [None]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing.head()

In [None]:
housing_labels = strat_train_set["median_house_value"].copy()
housing_labels.head()

# Data Cleaning

In [None]:
#for data cleaning we will set the null values with median values
median =  housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median, inplace=True)

In [None]:
housing = strat_train_set.drop("housing_median_age_cat", axis=1)

In [None]:
housing.info()

In [None]:
#filling the null value of numerical attribute
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "median")
#Separating mumerical data
housing_num = housing.drop("ocean_proximity", axis  = 1)
imputer.fit(housing_num)

In [None]:
X = imputer.transform(housing_num)
#Numpy -> Pandas Data Frame
housing_tr = pd.DataFrame(X, columns = housing_num.columns)

In [None]:
#handling the text attribute (ocean_proximity)
# Double brackets keep housing_cat a one-column DataFrame rather than a Series.
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

In [None]:
#categorical -> Numerical
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
# View the encoder output as a dense array.
housing_cat_1hot.toarray()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions that transform() reads from the 2-D input array.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns to a 2-D array.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as a third column when ``add_bedrooms_per_room`` is true.
    Columns are located through the module-level ``*_ix`` positions.
    """
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self
    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        extra_columns = [rooms / households,
                         X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ turns each 1-D ratio into a column and stacks it after X.
        return np.c_[(X, *extra_columns)]

# Demonstration on the raw values, without the bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)


# Transformation Pipelines 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # feature scaling

# Numeric preprocessing: median imputation -> ratio features -> standardisation.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

Handling both categorical and numerical attributes

In [None]:
from sklearn.compose import ColumnTransformer

# Column names routed to each sub-transformer.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline; the text column is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

# Model Training and Evaluation
1) Linear Regression
2) Decision Tree Regressor
3) Random Forest Regressor

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear model on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

In [None]:
# Spot check on the first five training rows: transform only (no refit).
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE measured on the same data the model was fitted on (training error).
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
print("Predictions:", tree_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)


In [None]:
print("Predictions:", forest_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

In [None]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

# Cross Validation 

In [None]:
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Standard deviation:", spread)

In [None]:
from sklearn.model_selection import cross_val_score
#tree regression cross validation
# The scorer returns negative MSE, so negate before taking the square root.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)

In [None]:
#linear regression cross validation
# Rebinds `scores`; the tree results remain in tree_rmse_scores.
scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
display_scores(lin_rmse_scores)

In [None]:
#random forest regression cross validation
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
from sklearn.svm import SVR

# Support vector regressor with a linear kernel, fitted on the prepared features.
svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
# RMSE on the training data itself.
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
print(svm_rmse)
# Spot check on the five sample rows prepared earlier.
print("Predictions:", svm_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

In [None]:
# 10-fold cross-validation of the SVR model.
svm_scores = cross_val_score(svm_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
# Use the SVR scores computed above (previously this read forest_scores, so the
# random-forest results were reported as the SVR results). The scorer returns
# negative MSE, hence the negation.
svm_rmse_scores = np.sqrt(-svm_scores)
display_scores(svm_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor
#specifying n_estimator = 100, random_state = 42
forest_reg2 = RandomForestRegressor(n_estimators=100, random_state = 42)
forest_reg2.fit(housing_prepared, housing_labels)

In [None]:
# RMSE on the training data itself.
# NOTE: rebinds forest_mse / forest_rmse, replacing the values computed for forest_reg.
housing_predictions = forest_reg2.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# 10-fold cross-validation; the scorer returns negative MSE, hence the negation.
forest_scores2 = cross_val_score(forest_reg2, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores2 = np.sqrt(-forest_scores2)
display_scores(forest_rmse_scores2)

In [None]:
# describe() summary of the same per-fold RMSE values.
pd.Series(np.sqrt(-forest_scores2)).describe()

# Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
# Two sub-grids: 3x4 combinations with the default bootstrap setting, then
# 2x3 combinations with bootstrap disabled.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Fixed seed so the search picks the same best parameters on every run,
# matching the random_state=42 used elsewhere in this notebook.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Best hyper-parameter combination found by the search.
grid_search.best_params_

In [None]:
# The estimator configured with that combination.
grid_search.best_estimator_

In [None]:
# RMSE per candidate: mean_test_score is negative MSE, so negate before sqrt.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Full cross-validation results as a table.
pd.DataFrame(grid_search.cv_results_)

# Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Integer distributions sampled for each candidate.
# NOTE(review): scipy documents randint's `high` as exclusive, so max_features
# would top out at 7 and n_estimators at 199 - confirm that is intended.
param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

# Rebinds forest_reg to a seeded forest; 10 sampled candidates, 5-fold CV each.
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

In [None]:
# RMSE per candidate: mean_test_score is negative MSE, so negate before sqrt.
# Rebinds cvres, previously holding the grid-search results.
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
rnd_search.best_params_

In [None]:
rnd_search.best_estimator_

# Best model and error analysis

In [None]:
# Feature importances of the best grid-search model.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Rebinds feature_importances to the best randomized-search model;
# this is the array the ranking below uses.
feature_importances = rnd_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Names for the three ratio columns; three are listed because num_pipeline
# builds CombinedAttributesAdder with its default add_bedrooms_per_room=True.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]

# Rebinds cat_encoder to the encoder fitted inside full_pipeline.
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
# Name order: numeric columns, ratio columns, one-hot categories.
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# Pair each importance with its name, largest first.
sorted(zip(feature_importances, attributes), reverse=True)

# Evaluating the system on the test set

In [None]:
strat_test_set.info()

In [None]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

# Exercise

In [None]:
#1
# Exercise 1: grid search over SVR hyper-parameters.
# NOTE: this cell rebinds param_grid, svm_reg and grid_search, which earlier
# cells bound to the random-forest search objects.
from sklearn.model_selection import GridSearchCV

# Linear kernel: C only. RBF kernel: every C x gamma combination.
param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]

svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# best_score_ is negative MSE under this scorer, so negate before sqrt.
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_