In [None]:
from pandas._libs.tslibs.np_datetime import compare_mismatched_resolutions
from pandas.core.indexes.api import safe_sort_index
import os
import urllib.request
import tarfile

# Remote root of the dataset repository and the local directory the archive is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/mlbvn/handson-ml2-vn/main/"
HOUSING_PATH = os.path.join('datasets', 'housing')
# The URL must not contain whitespace; a stray space before the file name makes it invalid.
# NOTE(review): the directory is spelled 'datasets/' to mirror HOUSING_PATH — confirm against the repository layout.
HOUSING_URL = DOWNLOAD_ROOT + 'datasets/housing/housing.tgz'

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
  """Download the housing archive and unpack it into ``housing_path``.

  Args:
    housing_url: URL of the ``housing.tgz`` archive to download.
    housing_path: Local directory that receives the archive and its
      extracted contents; it is created when missing.
  """
  # os.makedirs creates the whole directory tree (os.path has no makedirs);
  # exist_ok=True makes repeated calls safe.
  os.makedirs(housing_path, exist_ok = True)
  tgz_path = os.path.join(housing_path, 'housing.tgz')
  urllib.request.urlretrieve(housing_url, tgz_path)
  # The context manager closes the archive even if extraction fails.
  # NOTE: extractall trusts the member paths stored in the archive; only use
  # it on archives from a trusted source.
  with tarfile.open(tgz_path) as housing_tgz:
    housing_tgz.extractall(path = housing_path)


# The call has to live at module level: placed inside the function body it
# would make the function call itself on every invocation.
fetch_housing_data()

import pandas as pd
def load_housing_data(housing_path = HOUSING_PATH):
  """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
  return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

# Take a quick look at the structure of the data.
housing = load_housing_data()
housing.head()

# Column dtypes and non-null counts.
housing.info()

# Number of rows per category of the categorical attribute, then summary statistics.
housing['ocean_proximity'].value_counts()
housing.describe()

# Plot histograms to visualise the data (for Jupyter).
import matplotlib.pyplot as plt

# NOTE(review): no definition of save_fig is visible in this notebook, so a
# small helper is provided here; drop it if one is imported from elsewhere.
IMAGES_PATH = os.path.join('.', 'images')

def save_fig(fig_id, tight_layout = True, fig_extension = 'png', resolution = 300):
  """Save the current matplotlib figure under ``IMAGES_PATH``.

  Args:
    fig_id: File name without extension.
    tight_layout: When true, call ``plt.tight_layout()`` before saving.
    fig_extension: File extension, also passed as the output format.
    resolution: Value passed to ``plt.savefig`` as ``dpi``.
  """
  os.makedirs(IMAGES_PATH, exist_ok = True)
  path = os.path.join(IMAGES_PATH, fig_id + '.' + fig_extension)
  if tight_layout:
    plt.tight_layout()
  plt.savefig(path, format = fig_extension, dpi = resolution)

housing.hist(bins = 50, figsize = (20, 15))
save_fig("attribute_histogram_plots")
plt.show()

# numpy has to be imported before it can seed the random generator.
import numpy as np

# Fix the seed so the random split below is the same on every run.
np.random.seed(42)

# Create a test set.
def split_train_test(data, test_ratio):
  """Randomly split ``data`` into a training part and a test part.

  Args:
    data: DataFrame to split; rows are selected by position with ``iloc``.
    test_ratio: Fraction of the rows that goes to the test part; the test
      size is ``int(len(data) * test_ratio)``.

  Returns:
    A ``(train, test)`` tuple of DataFrames.
  """
  # np.random.permutation(n) yields the positions 0..n-1 in random order.
  shuffled_indices = np.random.permutation(len(data))
  test_set_size = int(len(data) * test_ratio)
  test_indices = shuffled_indices[:test_set_size]
  train_indices = shuffled_indices[test_set_size:]
  return data.iloc[train_indices], data.iloc[test_indices]
  train_set, test_set = split_train_test(housing, 0.2)
  len(train_set)
  len(test_set)

# For a dataset that changes over time
from zlib import crc32

def test_set_check(identifier, test_ratio):
   return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32
def split_train_test_by_id(data, test_ratio, id_column):
  """Split ``data`` into ``(train, test)`` using the values of ``id_column``.

  Each identifier is passed to ``test_set_check``; rows for which it returns
  True form the test part, all remaining rows form the training part.
  """
  goes_to_test = data[id_column].apply(
      lambda row_id: test_set_check(row_id, test_ratio))
  train_part = data.loc[~goes_to_test]
  test_part = data.loc[goes_to_test]
  return train_part, test_part
# reset_index() exposes the old index as an 'index' column, which serves as the identifier.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')

# Split the dataset with scikit-learn instead.
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)
test_set.head()

# Make the sample balanced: bucket median_income into categories
# that are then used for stratified sampling.
housing['income_cat'] = pd.cut(
    housing['median_income'],
    labels=[1, 2, 3, 4, 5],
    bins=[0., 1.5, 3., 4.5, 6., np.inf],
)
housing['income_cat'].hist()

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the loop body runs once and binds the two stratified sets.
for train_index, test_index in split.split(housing, housing['income_cat']):
  strat_train_set, strat_test_set = housing.loc[train_index], housing.loc[test_index]
# Category shares in the stratified test set, then in the full dataset.
strat_test_set['income_cat'].value_counts() / len(strat_test_set)
housing['income_cat'].value_counts() / len(housing)
def income_cat_proportions(data):
  """Return the count of each ``income_cat`` value divided by the number of rows in ``data``."""
  category_counts = data['income_cat'].value_counts()
  n_rows = len(data)
  return category_counts / n_rows
# Purely random split, used as the baseline in the comparison below.
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)
# The column labels must be exactly the names looked up in the error
# computations below, otherwise the lookups raise KeyError.
compare_props = pd.DataFrame({
    'Overall': income_cat_proportions(housing),
    'Stratified': income_cat_proportions(strat_test_set),
    'Random': income_cat_proportions(test_set),
}).sort_index()
# Relative deviation (in percent) of each sampling scheme from the overall proportions.
compare_props['Rand.%error'] = 100 * compare_props['Random'] / compare_props['Overall'] - 100
compare_props['Strat.%error'] = 100 * compare_props['Stratified'] / compare_props['Overall'] - 100
# Remove the helper column from both stratified sets.
for set_ in (strat_train_set, strat_test_set):
  set_.drop('income_cat', axis = 1, inplace = True)
# Continue exploring on a copy so the training set itself stays untouched.
housing = strat_train_set.copy()


In [None]:
from ast import Attribute
from sqlalchemy import label
# Visualise the geographical data.
import matplotlib.pyplot as plt
# DataFrame.plot (not plt.plot) accepts kind='scatter' with column names for x and y.
housing.plot(kind = 'scatter', x = 'longitude', y = 'latitude')
save_fig('bad_visualization_plot')

# A low alpha makes the dense areas stand out.
housing.plot(kind = 'scatter', x = 'longitude', y = 'latitude', alpha = 0.1)
# Call the helper; assigning to its name would replace it with a string.
save_fig('better_visualization_plot')

# Combine population (marker size) and house value (marker colour) in one plot.
housing.plot(kind = 'scatter', x = 'longitude', y = 'latitude', alpha = 0.4,
             s = housing['population'] / 100, label = 'population', figsize = (10, 7),
             c = 'median_house_value', cmap = plt.get_cmap('jet'), colorbar = True,
             )
plt.legend()

# Correlation coefficients. The text column is excluded explicitly because
# recent pandas versions raise on non-numeric columns instead of skipping them.
corr_matrix = housing.select_dtypes(include = 'number').corr()

# Pairwise scatter plots of a few attributes.
from pandas.plotting import scatter_matrix
attribute = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attribute], figsize =(12, 8))

# The most promising attribute for predicting the house value.
housing.plot(kind = 'scatter', x = 'median_income', y = 'median_house_value',
             alpha = 0.1)
# Combine attributes into ratios. Rooms per household is derived from
# total_rooms, and the column names match the ones used in the dataset.
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms'] / housing['total_rooms']
housing['population_per_household'] = housing['population'] / housing['households']
corr_matrix = housing.select_dtypes(include = 'number').corr()
# Correlation of every attribute with the target, strongest first.
corr_matrix['median_house_value'].sort_values(ascending = False)

# Prepare the data for the machine-learning algorithms: separate predictors and labels.
housing = strat_train_set.drop('median_house_value', axis = 1)
housing_labels = strat_train_set['median_house_value'].copy()

# Data cleaning: three ways to deal with missing total_bedrooms values.
# Options 1 and 2 return new frames that are not stored, so `housing` is
# left unchanged by them; only option 3 is applied.
housing.dropna(subset = ['total_bedrooms'])              # option 1
housing.drop('total_bedrooms', axis = 1)                  # option 2
median = housing['total_bedrooms'].median()               # option 3
# Assign the filled column back: an in-place fillna on a selected column is
# not guaranteed to update the parent frame.
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(median)



In [None]:
# Replace missing values with the median of each attribute.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')

# The median is only defined for numeric attributes, so drop the text attribute.
housing_num = housing.drop('ocean_proximity', axis = 1)
imputer.fit(housing_num)
imputer.statistics_

# Transform the training set.
x = imputer.transform(housing_num)
housing_tr = pd.DataFrame(x, columns = housing_num.columns,
                          index = housing.index)
# NOTE(review): sample_incomplete_rows has no visible definition in this
# notebook; it is derived here as the first rows of strat_train_set that
# contain a missing value — confirm this is the intended sample.
sample_incomplete_rows = strat_train_set[strat_train_set.isnull().any(axis = 1)].head()
housing_tr.loc[sample_incomplete_rows.index.values]

# Handle the text / categorical attribute.
housing_cat = housing[['ocean_proximity']]
housing.head(10)

# Convert the text categories to numbers.
from sklearn.preprocessing import OrdinalEncoder

oridinal_encoder = OrdinalEncoder()
# fit_transform learns the categories and encodes the column in one step.
housing_cat_encoded = oridinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]
oridinal_encoder.categories_

# One-hot encode the categories with OneHotEncoder.
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
housing_cat_1hot.toarray() # convert the one-hot result to a dense array

# Custom transformer
from sklearn.base import BaseEstimator, TransformerMixin

# Positions of the columns the transformer reads from its input array.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    ``rooms_per_household`` and ``population_per_household`` are always
    appended; ``bedrooms_per_room`` is appended last when
    ``add_bedrooms_per_room`` is true. Columns are located through the
    module-level ``*_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same as np.c_[X, col_1, col_2, ...].
        return np.c_[(X, *extra_columns)]

# Append the ratio columns (without bedrooms_per_room) to the raw values.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Look the column positions up by name.
col_names = ("total_rooms", "total_bedrooms", "population", "households")
rooms_ix, bedrooms_ix, population_ix, households_ix = map(
    housing.columns.get_loc, col_names)

# Wrap the array in a DataFrame that carries the original index and column names.
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    index=housing.index,
    columns=[*housing.columns, "rooms_per_household", "population_per_household"],
)
housing_extra_attribs.head()

In [None]:
# Preprocess the numeric attributes.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Steps run in order: impute medians, add ratio attributes, standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

# Process all columns in one transformer, including the categorical attribute.
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Train a model.

# Linear regression.
from sklearn.linear_model import LinearRegression
# Instantiate the estimator; fit() needs an instance, not the class itself.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
# Try the model on a few instances of the training set.
some_data = housing.iloc[:5]
# The labels come from housing_labels, not from the predictor frame.
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print('Prediction: ', lin_reg.predict(some_data_prepared))
# Compare with the true labels.
print('label: ', list(some_labels))

# Measure the RMSE on the training set.
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


In [None]:
# Train a decision-tree model.
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

# Evaluate the model on the training set.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
# Cross-validation.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring = 'neg_mean_squared_error', cv =10)
# The scorer returns negated MSE values, hence the sign flip before the root.
tree_rmse_scores = np.sqrt(-scores)

In [None]:
# Train a random-forest model.
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
# Predict with the forest itself so the error below measures this model.
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# Save the model together with its learned parameters.
import joblib
# NOTE(review): `my_model` has no visible definition in this notebook; the
# random-forest regressor trained above is assumed — confirm which model to persist.
my_model = forest_reg
joblib.dump(my_model, 'my_model.pkl')
# joblib.load unpickles the file: only load files from a trusted source.
my_model_loaded = joblib.load('my_model.pkl')