In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.model_selection  import cross_val_score

from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

In [29]:
housing_df = pd.read_csv('housing.csv')
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [9]:
# for use ml we will extract the features matrix and the target array
X = housing_df.drop('median_house_value',axis=1)
y = housing_df['median_house_value']
display(type(X),type(y),type(y.values))
X.head(),y.values

pandas.core.frame.DataFrame

pandas.core.series.Series

numpy.ndarray

(   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
 0    -122.23     37.88                41.0        880.0           129.0   
 1    -122.22     37.86                21.0       7099.0          1106.0   
 2    -122.24     37.85                52.0       1467.0           190.0   
 3    -122.25     37.85                52.0       1274.0           235.0   
 4    -122.25     37.85                52.0       1627.0           280.0   
 
    population  households  median_income ocean_proximity  
 0       322.0       126.0         8.3252        NEAR BAY  
 1      2401.0      1138.0         8.3014        NEAR BAY  
 2       496.0       177.0         7.2574        NEAR BAY  
 3       558.0       219.0         5.6431        NEAR BAY  
 4       565.0       259.0         3.8462        NEAR BAY  ,
 'y.values',
 array([452600., 358500., 352100., ...,  92300.,  84700.,  89400.]))


# Train a linear regression

In [20]:
x = housing_df.loc[:,['median_income','total_rooms','latitude']]
y = housing_df['median_house_value']

lin_reg = LinearRegression()
lin_reg.fit(x,y)

#LinearRegression learned parameters
lin_reg.coef_, lin_reg.intercept_

y_predict = lin_reg.predict(x)
lin_reg_rmse = np.sqrt(mse(y,y_predict))
lin_reg_rmse

83092.48539363865

# Feature Extraction


In [23]:
housing_df.corr()['median_house_value']

longitude            -0.045967
latitude             -0.144160
housing_median_age    0.105623
total_rooms           0.134153
total_bedrooms        0.049686
population           -0.024650
households            0.065843
median_income         0.688075
median_house_value    1.000000
Name: median_house_value, dtype: float64

# Handling Categorial Attributes - LabelEncoder & OneHotEncoder

In [4]:
encoder = LabelEncoder()
housing_df['encoder_ocean_proximity'] = encoder.fit_transform(housing_df['ocean_proximity'])
housing_df.head().loc[:,['ocean_proximity','encoder_ocean_proximity']]

Unnamed: 0,ocean_proximity,encoder_ocean_proximity
0,NEAR BAY,3
1,NEAR BAY,3
2,NEAR BAY,3
3,NEAR BAY,3
4,NEAR BAY,3


In [5]:
encoder_one = OneHotEncoder()
encoder_one_ocean_proximity = encoder_one.fit_transform(housing_df['ocean_proximity'].values.reshape(-1,1))
encoder_one_ocean_proximity

<20640x5 sparse matrix of type '<class 'numpy.float64'>'
	with 20640 stored elements in Compressed Sparse Row format>

In [6]:
encoder_one_ocean_proximity.toarray()
display(type(encoder_one_ocean_proximity) ,type(encoder_one_ocean_proximity.toarray()),type(housing_df))

scipy.sparse.csr.csr_matrix

numpy.ndarray

pandas.core.frame.DataFrame

In [7]:
#replace the original ocean_proximity in the feature matrix X with the one hot
housing_df.drop('ocean_proximity',axis=1,inplace=True)
x = np.append(housing_df.values , encoder_one_ocean_proximity.toarray(),axis=1)
x.shape
x

array([[-122.23,   37.88,   41.  , ...,    0.  ,    1.  ,    0.  ],
       [-122.22,   37.86,   21.  , ...,    0.  ,    1.  ,    0.  ],
       [-122.24,   37.85,   52.  , ...,    0.  ,    1.  ,    0.  ],
       ...,
       [-121.22,   39.43,   17.  , ...,    0.  ,    0.  ,    0.  ],
       [-121.32,   39.43,   18.  , ...,    0.  ,    0.  ,    0.  ],
       [-121.24,   39.37,   16.  , ...,    0.  ,    0.  ,    0.  ]])

# imputation of missing data - SimpleImputer

In [13]:
housing_df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [16]:
#replace the missing values with the median of the 'total_bedrooms' feature
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(housing_df.total_bedrooms.values.reshape(-1,1))
X[:10]
np.isnan(X).any()
housing_df.total_bedrooms = X
housing_df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

# Feature Scaling - MInMaxScaler and StandardScaler

In [26]:
scaler = StandardScaler()
housing_df.drop('ocean_proximity',axis=1,inplace=True)
scaler.fit_transform(housing_df.median_income.values.reshape(-1,1))

array([[ 2.34476576],
       [ 2.33223796],
       [ 1.7826994 ],
       ...,
       [-1.14259331],
       [-1.05458292],
       [-0.78012947]])

# Create a TestSet - Ex.

In [41]:
#read data
df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)
X = X.drop('ocean_proximity',axis=1)

y = df['median_house_value'].values

X,y

#imputer before split
imputer = SimpleImputer(strategy='median')
tb = imputer.fit_transform(X.total_bedrooms.values.reshape(-1,1))
print(np.isnan(X).any())
X.total_bedrooms = tb
X.isna().sum()

#split the data
X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
display(X_train.shape,X_test.shape,y_train ,y_test)

#train LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
display('lin_reg.coef_,lin_reg.intercept_',lin_reg.coef_,lin_reg.intercept_)

#try the model and compare to the true lable (on the train set)
predictions = lin_reg.predict(X_train)
display(predictions , y_train[0:5])

#measure the regression models rmse
lin_reg_rmse = np.sqrt(mse(y_train,predictions))
display('lin_reg_rmse',lin_reg_rmse)

#train DecisionTreeRegressor and check rmse now
tree = DecisionTreeRegressor()
tree.fit(X_train,y_train)
display(lin_reg.coef_,lin_reg.intercept_)
predictions = tree.predict(X_train)
display(predictions)
tree_reg_rmse = np.sqrt(mse(y_train,predictions))
display('tree_reg_rmse',tree_reg_rmse)

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
dtype: bool


(16512, 8)

(4128, 8)

array([103000., 382100., 172600., ..., 222100., 283500., 325000.])

array([ 47700.,  45800., 500001., ..., 500001.,  72300., 151500.])

'lin_reg.coef_,lin_reg.intercept_'

array([-4.26323917e+04, -4.24500719e+04,  1.18280965e+03, -8.18797708e+00,
        1.16260128e+02, -3.84922131e+01,  4.63425720e+01,  4.05384044e+04])

-3578224.234818088

array([181313.23430336, 286451.78145112, 263328.07605751, ...,
       191338.67423602, 273178.05856515, 278432.75202799])

array([103000., 382100., 172600.,  93400.,  96500.])

'lin_reg_rmse'

69362.34135238081

array([-4.26323917e+04, -4.24500719e+04,  1.18280965e+03, -8.18797708e+00,
        1.16260128e+02, -3.84922131e+01,  4.63425720e+01,  4.05384044e+04])

-3578224.234818088

array([103000., 382100., 172600., ..., 222100., 283500., 325000.])

'tree_reg_rmse'

0.0

# Scikit-Learn's Pipelines + ColumnTransformer

In [9]:
df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)
#X = X.drop('ocean_proximity',axis=1)

y = df['median_house_value'].values

num_features = ['total_bedrooms']
num_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

cat_features = ['ocean_proximity']
cat_transformer = Pipeline([('onehot',OneHotEncoder())])


preprocessor = ColumnTransformer([('num',num_transformer,num_features),
                                  ('cat',cat_transformer,cat_features)],remainder='passthrough')

clf = Pipeline([('pre',preprocessor),
               ('cls',DecisionTreeRegressor())])

X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
X_train.shape,X_test.shape,y_train ,y_test

x_prepared = clf.fit(X_train,y_train)
x_prepared

predictions = clf.predict(X_train)
display(predictions)
tree_reg_rmse = np.sqrt(mse(y_train,predictions))
display(tree_reg_rmse)

#predict - call transform & predict
predictions = clf.predict(X_test)
display(predictions)
tree_reg_rmse = np.sqrt(mse(y_test,predictions))
display(tree_reg_rmse)

clf.score(X_test,y_test),clf.score(X_train,y_train)

array([103000., 382100., 172600., ..., 222100., 283500., 325000.])

0.0

array([ 40900.,  26600., 500001., ..., 500001.,  65600., 128600.])

68492.79633224127

(0.642000075754406, 1.0)

# Scikit-Learn's Custom Transformer


In [15]:
class CombinedAttributeAdder(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        new_col = X.iloc[:,0] / X.iloc[:,1]
        return np.column_stack([X,new_col])

df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)
y = df['median_house_value'].values

#X,y
X.head(0)
y

attr_adder = CombinedAttributeAdder()
t = attr_adder.transform(X,y)
t[0]

num_features = ['total_bedrooms']
num_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

cat_features = ['ocean_proximity']
cat_transformer = Pipeline([('onehot',OneHotEncoder())])

my_features = ['median_income','population']
my_transformer = Pipeline([('my',CombinedAttributeAdder())])


preprocessor = ColumnTransformer([('my',my_transformer,my_features),
                                  ('num',num_transformer,num_features),
                                  ('cat',cat_transformer,cat_features)],remainder='passthrough')

clf = Pipeline([('pre',preprocessor),
               ('cls',DecisionTreeRegressor())])

X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
X_train.shape,X_test.shape,y_train ,y_test

x_prepared = clf.fit(X_train,y_train)
x_prepared

clf.score(X_test,y_test),clf.score(X_train,y_train)

(0.6399948378346952, 1.0)

# Model Evaluation - Cross Validation

In [27]:
df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)
#X = X.drop('ocean_proximity',axis=1)

y = df['median_house_value'].values

#X,y
X.head(0)

X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
X_train.shape,X_test.shape,y_train ,y_test


num_features = ['total_bedrooms']
num_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

cat_features = ['ocean_proximity']
cat_transformer = Pipeline([('onehot',OneHotEncoder())])


preprocessor = ColumnTransformer([('num',num_transformer,num_features),
                                  ('cat',cat_transformer,cat_features)],remainder='passthrough')





In [34]:
#LinearRegression
clf = Pipeline([('pre',preprocessor),
               ('cls',LinearRegression())])

scores = cross_val_score(clf,X_train,y_train,cv =10,scoring='neg_mean_squared_error')
-scores,(-scores).mean(),scores.std()


(array([4.30092727e+09, 5.14251884e+09, 4.64347170e+09, 4.46966489e+09,
        4.82196640e+09, 4.30865752e+09, 4.33772031e+09, 4.88577660e+09,
        5.34623416e+09, 4.85867228e+09]), 4711560998.797654, 344466959.068248)

In [None]:

#DecisionTreeRegressor
clf = Pipeline([('pre',preprocessor),
               ('cls',DecisionTreeRegressor())])

scores = cross_val_score(clf,X_train,y_train,cv =10,scoring='neg_mean_squared_error')
scores,scores.mean(),scores.std()


In [31]:


#RandomForestRegressor
clf = Pipeline([('pre',preprocessor),
               ('cls',RandomForestRegressor())])

scores = cross_val_score(clf,X_train,y_train,cv =10,scoring='neg_mean_squared_error')
-scores,np.sqrt(-scores).mean(),scores.std()
np.sqrt(-scores)

KeyboardInterrupt: 