# Chapter 2

## descriptive statistics using numpy and matplotlib

In [None]:
import sklearn.datasets
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# load the California housing data: X holds the features, y the target as a column vector
data = sklearn.datasets.fetch_california_housing()
X, y = data.data, data.target.reshape(-1, 1)

In [None]:
# scatter plot: x = latitude, y = longitude, marker size = median income, colour = cost
plt.scatter(x=X[:, 6], y=X[:, 7], s=X[:, 0], c=y)
plt.colorbar();

In [None]:
# correlation heat map: absolute correlation coefficient between every feature and the target
from matplotlib import cm
X_y = np.hstack((X, y))
# np.corrcoef returns correlations (not covariances); the name `cov` is kept because later cells read it
cov = np.corrcoef(X_y, rowvar=False)
ax = plt.matshow(np.abs(cov), cmap=cm.cividis)
plt.colorbar()

# label both axes with the feature names followed by the target name
ticks = data.feature_names + data.target_names
plt.xticks(range(len(ticks)), ticks, rotation=90)
plt.yticks(range(len(ticks)), ticks);

In [None]:
# scatter matrix: histograms on the diagonal, pairwise scatter plots below it,
# and the correlation coefficient (read from `cov`) printed above it
n = X_y.shape[1]
fig, axes = plt.subplots(figsize=(n*2, n*2), nrows=n, ncols=n)
for i in range(n):
    for j in range(n):
        plt.sca(axes[i, j])
        # only the bottom row and the left column carry axis labels
        if i == n-1:
            plt.xlabel(ticks[j])
        if j == 0:
            plt.ylabel(ticks[i])
        if i == j:
            # 20 equal-width bins over the full [min, max] range; hand-built
            # np.arange edges stop short of max and silently drop the top values
            plt.hist(X_y[:, i], bins=20)
        elif i > j:
            # column j on x and row i on y, so the points match the axis labels
            plt.scatter(X_y[:, j], X_y[:, i])
        else:
            # upper triangle: show the coefficient as text in an otherwise empty cell
            plt.text(0, 0, f'{cov[i, j]:.2f}')
            plt.xlim([-1, 1])
            plt.ylim([-1, 1])
            plt.axis('off')

## descriptive statistics and visualisation using pandas

In [None]:
import pandas as pd

# NOTE(review): absolute, machine-specific path — this cell only runs where this exact file exists
file = 'C:/Users/manav.singh/Documents/GitHub/learn_python/datasets/housing/housing.csv'
housing = pd.read_csv(file)

# info() prints its summary; describe() is the last expression, so the notebook renders its table
housing.info()
housing.describe()

In [None]:
# histograms of the numeric columns, 50 bins each; the trailing ';' suppresses the text echo
housing.hist(figsize=(10, 10), bins=50);

In [None]:
# pairwise scatter plots of the housing columns; the trailing ';' suppresses the text echo
pd.plotting.scatter_matrix(housing, figsize=(20, 20));

In [None]:
# replace the row index with a value derived from the coordinates (longitude * 1000 + latitude)
# presumably intended as a stable per-district identifier — TODO confirm
housing.index = housing.longitude * 1000 + housing.latitude

In [None]:
# bucket median income into five ordered categories labelled 1 to 5
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=range(1, 6),
)

In [None]:
# distribution of samples across the income_cat values; ';' suppresses the text echo
housing.income_cat.hist();

In [None]:
# move the coordinate-based index into a regular 'index' column and restore a 0..n-1 row index
housing.reset_index(inplace=True)
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# n_splits=1, so the generator yields exactly one (train, test) pair;
# stratifying on income_cat keeps the category proportions in both parts
[[train_index, test_index]] = split.split(housing, housing.income_cat)

# split() yields positional indices, so select rows by position (iloc), not by label (loc)
strat_train_test = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
# share of each income category in the stratified test set, then in the full dataset, for comparison
print(strat_test_set.income_cat.value_counts()/len(strat_test_set))
print(housing.income_cat.value_counts()/len(housing))

In [None]:
# from here on `housing` is a copy of the training split, so edits do not touch strat_train_test
housing = strat_train_test.copy()

In [None]:
# geographic scatter: marker size follows population, colour follows median house value
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.4,
    s=housing['population'] / 100,
    label='population',
    c=housing['median_house_value'],
    cmap=plt.get_cmap('jet'),
    colorbar=True,
    figsize=(10, 7),
)
plt.legend();

In [None]:
# the helper 'index' column is not a feature, so remove it before correlating
housing.drop('index', axis=1, inplace=True)
# correlate numeric columns only: ocean_proximity holds text, which makes
# DataFrame.corr() raise on recent pandas versions
corr_matrix = housing.select_dtypes(include='number').corr()
# correlation of every numeric attribute with the target, strongest positive first
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# scatter matrix restricted to the target and three selected attributes
attributes = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
]
pd.plotting.scatter_matrix(housing.loc[:, attributes], figsize=(12, 8));

## experimenting with attribute combinations

In [None]:
# derived ratio attributes
housing['rooms_per_household'] = housing.total_rooms / housing.households
housing['bedrooms_per_room'] = housing.total_bedrooms / housing.total_rooms
# population (not total_bedrooms) divided by households
housing['population_per_household'] = housing.population / housing.households

# correlation of every numeric attribute with the target, strongest positive first;
# text columns are excluded because DataFrame.corr() raises on them in recent pandas
corr_matrix = (
    housing.select_dtypes(include='number')
    .corr()['median_house_value']
    .sort_values(ascending=False)
)
print(corr_matrix)

## prepare the data for ML algorithms
Write functions for the data preparation steps: this makes the results reproducible, builds up a library of transformations for later reuse, and makes it easy to try out various transformations.

In [None]:
# separate the target from the predictors, starting again from the untouched training split
housing_label = strat_train_test['median_house_value'].copy()
housing = strat_train_test.drop(columns=['index', 'median_house_value'])

## data cleaning
##### options: remove the samples with missing values, remove the whole attribute, or impute zero, the mean, or the median

In [None]:
# option 1: summary statistics after dropping every row with a missing value (result is not stored)
housing.dropna(axis=0).describe()

In [None]:
# option 2: summary statistics after dropping every column with a missing value (result is not stored)
housing.dropna(axis=1).describe()

In [None]:
# option 3: fill missing total_bedrooms with the column median;
# the result is only displayed — it is not assigned, so `housing` is left unchanged
housing.total_bedrooms.fillna(housing.total_bedrooms.median())

In [None]:
from sklearn.impute import SimpleImputer

# the median imputer is fitted on the frame without the two categorical columns
housing_num = housing.drop(columns=['income_cat', 'ocean_proximity'])
imputer = SimpleImputer(strategy='median')
imputer.fit(housing_num)

In [None]:
# per-column medians computed with pandas, for comparison with imputer.statistics_
housing_num.median().values

In [None]:
# per-column values the median imputer learned during fit
imputer.statistics_

In [None]:
# transform() returns a plain array, so wrap it back into a DataFrame
X = imputer.transform(housing_num)
# carry over the original row index as well as the column names, so the imputed
# rows stay aligned with housing_num instead of being renumbered from 0
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

## handling text and categorical attributes

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# encode the single categorical column as one integer code per category
housing_cat = housing[['ocean_proximity']]
encoder = OrdinalEncoder()
encoder.fit(housing_cat)
housing_cat_encoded = encoder.transform(housing_cat)

# first ten encoded rows, then the category list the codes refer to
print(housing_cat_encoded[:10])
print(encoder.categories_)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# one binary column per category; print the container type of the encoded result
cat_encoder = OneHotEncoder()
cat_encoder.fit(housing_cat)
housing_cat_one_hot = cat_encoder.transform(housing_cat)
print(type(housing_cat_one_hot))

## custom transformers

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# column positions of the attributes the transformer combines
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributes(BaseEstimator, TransformerMixin):
    """Append ratio attributes to an array, using the column positions above.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as a third extra column when `add_bedrooms_per_room`
    is true.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # nothing is learned from the data
        return self

    def transform(self, X, y=None):
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        extras = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extras.append(X[:, bedrooms_ix] / rooms)
        # stack the original columns and the new ones side by side
        return np.c_[(X, *extras)]

# try the transformer on the raw housing values and show the first augmented row
attr_adder = CombinedAttributes(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.transform(housing.to_numpy())
print(housing_extra_attribs[0])

## feature scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline

# numeric pipeline: median imputation -> derived ratio attributes -> standardisation
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributes()),
    ('std_scalar', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)
# first transformed row, shown as the cell output
housing_num_tr[0]

In [None]:
from sklearn.compose import ColumnTransformer

# numeric columns go through num_pipeline, the categorical column is one-hot encoded
num_attribs = housing_num.columns.tolist()
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

## select and train a model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

# fit a linear model and predict on the same data it was trained on
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_label)
housing_predictions = lin_reg.predict(housing_prepared)

# training RMSE = square root of the mean squared error
error = mse(housing_label.values, housing_predictions)
print(np.sqrt(error))

In [None]:
from sklearn.tree import DecisionTreeRegressor

# same procedure with a decision tree: fit, predict on the training data, report RMSE
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_label)
housing_tree_predictions = tree_reg.predict(housing_prepared)

tree_error = mse(housing_label.values, housing_tree_predictions)
print(np.sqrt(tree_error))

## better evaluation using cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation; the scorer returns negated MSE, hence the sign flip before the root
tree_scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_label,
    cv=10,
    scoring='neg_mean_squared_error',
)
tree_rmse = np.sqrt(-tree_scores)

def display_score(scores):
    """Print the scores, then their mean and standard deviation rounded to whole numbers."""
    summary = f'Scores: {scores}\nMean: {scores.mean():.0f}\nStd: {scores.std():.0f}'
    print(summary)
    
# report the decision tree's per-fold RMSE values with their mean and spread
display_score(tree_rmse)

In [None]:
# same 10-fold evaluation for the linear model
lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_label,
    cv=10,
    scoring='neg_mean_squared_error',
)
display_score(np.sqrt(-lin_scores))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# same 10-fold evaluation for a random forest with default settings
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_label,
    cv=10,
    scoring='neg_mean_squared_error',
)
display_score(np.sqrt(-forest_scores))

## fine tuning the model

In [None]:
from sklearn.model_selection import GridSearchCV

# two sub-grids: the first leaves bootstrap at its default, the second switches it off
param_grid = [
    {'n_estimators': [3, 10], 'max_features': [2, 4]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

grid_search.fit(housing_prepared, housing_label)

In [None]:
# one line per parameter combination: its settings and the RMSE derived from the mean test score
cv_results = grid_search.cv_results_
for idx, param in enumerate(cv_results['params']):
    score = cv_results['mean_test_score'][idx]
    print(f'{param}: {np.sqrt(-score):.2f}')

In [None]:
# feature importances of the best estimator found by the grid search
grid_search.best_estimator_.feature_importances_

## Evaluating model on the test dataset

In [None]:
from sklearn.metrics import mean_squared_error

# evaluate the best grid-search model once on the held-out stratified test set
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop(['index', 'median_house_value'], axis=1)
y_test = strat_test_set['median_house_value']

# transform only — the pipeline must not be re-fitted on test data
X_test_prepared = full_pipeline.transform(X_test)
y_pred = final_model.predict(X_test_prepared)
# take the root explicitly: the `squared=False` argument is deprecated and
# later removed in scikit-learn; this also matches how RMSE is computed in earlier cells
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(test_rmse)

## Launch, monitor, and maintain system