<a href="https://colab.research.google.com/github/miuceo/ML_intro/blob/main/california_housing_MachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import sklearn

url = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(url)

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

X_train = train_set.drop("median_house_value", axis = 1)
y = train_set['median_house_value'].copy()

X_num = X_train.drop("ocean_proximity", axis = 1)

X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedroom_per_room = True):
        self.add_bedroom_per_room = add_bedroom_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]

        if self.add_bedroom_per_room:
            bedroom_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[rooms_per_household, population_per_household, bedroom_per_room]

        else :
            return np.c_[rooms_per_household, population_per_household]


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipline = Pipeline([
    ("median_imputer", SimpleImputer(strategy="median")),
    ("attributes_adder", CombinedAttributeAdder()),
    ("std_scaler", StandardScaler())
])

In [4]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ["ocean_proximity"]

full_pipline = ColumnTransformer([
    ("num", num_pipline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

X_prepared = full_pipline.fit_transform(X_train)
X_prepared[0:5, :]

array([[-0.17491646,  0.05137609, -0.2117846 ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.40283542, -0.11736222,  0.34218528,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.08821601, -0.03227969, -0.66165785,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.60001532,  0.07750687,  0.78303162,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.3490073 , -0.06883176, -0.55036364,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ]])

**Machine Learning**

In [5]:
### Linear Regression

from sklearn.linear_model import LinearRegression
LR_model = LinearRegression() # estimator - calculs

In [6]:
LR_model.fit(X_prepared, y)

In [7]:
test_Data = X_train.sample(5)
test_label = y.loc[test_Data.index]
test_label

Unnamed: 0,median_house_value
16059,339800.0
16701,178200.0
11895,82200.0
19397,121900.0
10549,137500.0


In [8]:
test_Data_prepared = full_pipline.transform(test_Data)
test_Data_prepared[0:5, :]

array([[-0.14121453, -0.05563216, -0.22426621,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.17271973, -0.07008442, -0.0749623 ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.19512234,  0.05263777, -0.1156011 ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.13657896,  0.03787422, -0.50363041,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.44710624, -0.0646676 ,  0.71447062,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ]])

In [9]:
predicted_labels = LR_model.predict(test_Data_prepared)

In [10]:
pd.DataFrame({"Bashorat" : predicted_labels, "Asl qiymat" : test_label})

Unnamed: 0,Bashorat,Asl qiymat
16059,265743.706141,339800.0
16701,244768.359891,178200.0
11895,117183.734568,82200.0
19397,133895.872131,121900.0
10549,216691.241683,137500.0


**Modelni baholash**

In [11]:
X_test = test_set.drop('median_house_value', axis = 1)
y_test = test_set['median_house_value']

X_test_num = X_test.drop('ocean_proximity', axis = 1)

In [12]:
X_test_prepared = full_pipline.fit_transform(X_test)

In [13]:
y_predicted = LR_model.predict(X_test_prepared)

In [14]:
pd.DataFrame({"Bashorat" : y_predicted, "Asl qiymat" : y_test}).sample(5)

Unnamed: 0,Bashorat,Asl qiymat
6405,141467.199398,365900.0
16409,110831.083375,99700.0
4285,152991.074435,137500.0
3207,136699.699807,71400.0
2235,98090.021108,51700.0


In [15]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_predicted)
mae

70851.61738716844

In [16]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_predicted)
np.sqrt(mse)

np.float64(95214.09403883337)

**Random Forest**

In [17]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, y)

In [18]:
y_rf_predicted = RF_model.predict(X_test_prepared)

In [19]:
pd.DataFrame({"Bashorat_RF" : y_rf_predicted, "Asl qiymat" : y_test}).sample(5)

Unnamed: 0,Bashorat_RF,Asl qiymat
1213,133273.0,79900.0
8091,182096.0,146800.0
7886,224519.0,297300.0
16447,136604.0,99500.0
10129,198988.03,190300.0


In [20]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_rf_predicted)
mae

71048.70057412791

In [21]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_rf_predicted)
np.sqrt(mse)

np.float64(98205.60370419393)

**Cross-Validation**

In [28]:
X = df.drop("median_house_value", axis = 1)
Y = df["median_house_value"].copy()

X_prepared = full_pipline.fit_transform(X)

In [36]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(LR_model, X_prepared, Y, scoring = "neg_mean_squared_error", cv = 5)

In [39]:
def dispaly_scores(scores):
    print(f"Scores : {scores}")
    print(f"Mean score : {scores.mean()}")
    print(f"STD : {scores.std()}")

dispaly_scores(np.sqrt(-scores))

Scores : [110782.20644121 108541.95805036  99930.96487936  98346.98876736
  92877.29920505]
Mean score : 102095.88346867051
STD : 6644.207463669233


In [42]:
from sklearn.model_selection import cross_val_score

scores_rf = cross_val_score(RF_model, X_prepared, Y, scoring = "neg_mean_squared_error", cv = 15)

def dispaly_scores(scores):
    print(f"Scores : {scores}")
    print(f"Mean score : {scores.mean()}")
    print(f"STD : {scores.std()}")

dispaly_scores(np.sqrt(-scores_rf))

Scores : [ 89943.24214814  84073.89025338  55514.00968157  95387.93612243
  93097.80991655  46296.34153009 101536.05535386  79205.18109099
  65964.38589455  50343.02111499 100677.08985576 113464.57583229
  95181.45487284  87704.15915697  59953.49694358]
Mean score : 81222.8433178666
STD : 20028.241828757014


**Saving-Model-Pickle**

In [45]:
import pickle

file_name = "RF_model.pkl"
with open(file_name, "wb") as file:
    pickle.dump(RF_model, file)

In [46]:
with open(file_name , 'rb') as file:
    model = pickle.load(file)

**Saving-Model-Joblib**

In [48]:
import joblib

file_name = "LR_model.jbl"
joblib.dump(LR_model, file_name)

['LR_model.jbl']

In [49]:
model = joblib.load(file_name)

In [51]:
scores = cross_val_score(model, X_prepared, Y, scoring="neg_mean_squared_error", cv = 10)
dispaly_scores(np.sqrt(-scores))

Scores : [133936.82275417  83267.89031767 121494.0329793   98973.06673448
 112137.56054872  84580.82430105  71589.70424391 114488.89800889
 111325.31866155  68716.20975341]
Mean score : 100051.03283031532
STD : 20985.17628224062
