In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse


In [13]:
df = pd.read_csv('housing.csv')
df.head()

#explore data set
#display(df.shape,df.head(),df.tail(),df.columns,df.info(),df.describe().T)
#display(df.isna().sum(),df.nunique())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
lin_reg = LinearRegression()
lin_reg

In [None]:
X =  df.loc[:,['median_income','total_rooms','latitude']]
display(X.head(),X.shape)

In [None]:
y = df['median_house_value'].values
y

In [None]:
lin_reg.fit(X,y)

In [None]:
display(lin_reg.coef_,lin_reg.intercept_)

In [None]:
predictions = lin_reg.predict(X)
lin_reg_rmse = np.sqrt(mse(y,predictions))
lin_reg_rmse

In [None]:
#coef = lin_reg.coef_[:,np.newaxis]
#display(coef.shape)
(X.head().shape)
X.head() @ lin_reg.coef_ + lin_reg.intercept_
predictions[0:5]

In [None]:
df.corr()['median_house_value']

### label encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
encoder = LabelEncoder()

In [None]:
df['encoder_ocean_proximity'] = encoder.fit_transform(df['ocean_proximity'])
df

### OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
encoder_one = OneHotEncoder()

In [None]:
#encoder_one.get_feature_names

In [None]:
encoder_one_ocean_proximity = encoder_one.fit_transform(df['ocean_proximity'].to_numpy().reshape(-1,1))
encoder_one_ocean_proximity


In [None]:
encoder_one_ocean_proximity.toarray()

In [None]:
display(type(encoder_one_ocean_proximity) ,type(encoder_one_ocean_proximity.toarray()),type(df))
#concat with original data
#df = pd.concat([df, ohe_df], axis=1).drop(['ocean_proximity'], axis=1)


In [None]:
df.drop('ocean_proximity',axis=1,inplace=True)

x = np.append(df.values , encoder_one_ocean_proximity.toarray(),axis=1)
x.shape

In [None]:
encoder_one.get_feature_names()

### imputation

In [None]:
df.isna().sum()

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(df.total_bedrooms.to_numpy().reshape(-1,1))
X[:10]
np.isnan(X).any()
df.total_bedrooms = X
df.isna().sum()

## scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
df.drop('ocean_proximity',axis=1,inplace=True)

In [None]:
scaler.fit_transform(df.median_income.to_numpy().reshape(-1,1))

## test set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [None]:
df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)
X = X.drop('ocean_proximity',axis=1)

y = df['median_house_value'].values

X,y


In [None]:
imputer = SimpleImputer(strategy='median')
tb = imputer.fit_transform(X.total_bedrooms.to_numpy().reshape(-1,1))
print(np.isnan(X).any())
X.total_bedrooms = tb
X.isna().sum()

In [None]:
X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
X_train.shape,X_test.shape,y_train ,y_test

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
display(lin_reg.coef_,lin_reg.intercept_)

In [None]:
predictions = lin_reg.predict(X_train)
predictions , y_train[0:5]
lin_reg_rmse = np.sqrt(mse(y_train,predictions))
lin_reg_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
tree.fit(X_train,y_train)
display(lin_reg.coef_,lin_reg.intercept_)
predictions = tree.predict(X_train)
display(predictions)
tree_reg_rmse = np.sqrt(mse(y_train,predictions))
display(tree_reg_rmse)

### pipeline

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse

In [2]:
df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)
#X = X.drop('ocean_proximity',axis=1)

y = df['median_house_value'].values

#X,y

In [5]:
num_features = ['total_bedrooms']
num_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

cat_features = ['ocean_proximity']
cat_transformer = Pipeline([('onehot',OneHotEncoder())])


preprocessor = ColumnTransformer([('num',num_transformer,num_features),
                                  ('cat',cat_transformer,cat_features)],remainder='passthrough')

clf = Pipeline([('pre',preprocessor),
               ('cls',DecisionTreeRegressor())])

In [6]:
X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
X_train.shape,X_test.shape,y_train ,y_test



((16512, 9),
 (4128, 9),
 array([103000., 382100., 172600., ..., 222100., 283500., 325000.]),
 array([ 47700.,  45800., 500001., ..., 500001.,  72300., 151500.]))

In [7]:
#fit - call fit & transform
x_prepared = clf.fit(X_train,y_train)
x_prepared

Pipeline(memory=None,
         steps=[('pre',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                               

In [8]:
predictions = clf.predict(X_train)
display(predictions)
tree_reg_rmse = np.sqrt(mse(y_train,predictions))
display(tree_reg_rmse)

array([103000., 382100., 172600., ..., 222100., 283500., 325000.])

0.0

In [9]:
#predict - call transform & predict
predictions = clf.predict(X_test)
display(predictions)
tree_reg_rmse = np.sqrt(mse(y_test,predictions))
display(tree_reg_rmse)

array([ 40900.,  26600., 483300., ..., 500001.,  67000., 128600.])

69219.6476583175

In [10]:
clf.score(X_test,y_test),clf.score(X_train,y_train)

(0.6343615083915584, 1.0)

### custom transformer

In [92]:
from sklearn.base import BaseEstimator,TransformerMixin
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse

In [101]:
class CombinedAttributeAdder(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        density = X.iloc[:,0] / X.iloc[:,1]
        #return np.column_stack([X,density,y])
        return np.column_stack([X,density])



In [102]:
df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)
#X = X.drop('ocean_proximity',axis=1)

y = df['median_house_value'].values

#X,y
X.head(0)
y

array([452600., 358500., 352100., ...,  92300.,  84700.,  89400.])

In [103]:
attr_adder = CombinedAttributeAdder()
t = attr_adder.transform(X,y)
t[0]

array([-122.23, 37.88, 41.0, 880.0, 129.0, 322.0, 126.0, 8.3252,
       'NEAR BAY', -3.226768743400211], dtype=object)

In [104]:
num_features = ['total_bedrooms']
num_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

cat_features = ['ocean_proximity']
cat_transformer = Pipeline([('onehot',OneHotEncoder())])

my_features = ['median_income','population']
my_transformer = Pipeline([('my',CombinedAttributeAdder())])


preprocessor = ColumnTransformer([('my',my_transformer,my_features),
                                  ('num',num_transformer,num_features),
                                  ('cat',cat_transformer,cat_features)],remainder='passthrough')

clf = Pipeline([('pre',preprocessor),
               ('cls',DecisionTreeRegressor())])

In [105]:
X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
X_train.shape,X_test.shape,y_train ,y_test

((16512, 9),
 (4128, 9),
 array([103000., 382100., 172600., ..., 222100., 283500., 325000.]),
 array([ 47700.,  45800., 500001., ..., 500001.,  72300., 151500.]))

In [106]:
x_prepared = clf.fit(X_train,y_train)
x_prepared

Pipeline(memory=None,
         steps=[('pre',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('my',
                                                  Pipeline(memory=None,
                                                           steps=[('my',
                                                                   CombinedAttributeAdder())],
                                                           verbose=False),
                                                  ['median_income',
                                                   'population']),
                                                 ('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                  

In [107]:
clf.score(X_test,y_test),clf.score(X_train,y_train)

(0.6350538917010636, 1.0)

### cross validation

In [24]:
from sklearn.model_selection  import cross_val_score
from sklearn.base import BaseEstimator,TransformerMixin
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)
#X = X.drop('ocean_proximity',axis=1)

y = df['median_house_value'].values

#X,y
X.head(0)

X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
X_train.shape,X_test.shape,y_train ,y_test


num_features = ['total_bedrooms']
num_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

cat_features = ['ocean_proximity']
cat_transformer = Pipeline([('onehot',OneHotEncoder())])


preprocessor = ColumnTransformer([('num',num_transformer,num_features),
                                  ('cat',cat_transformer,cat_features)],remainder='passthrough')



In [25]:
clf = Pipeline([('pre',preprocessor),
               ('cls',LinearRegression())])

scores = cross_val_score(clf,X_train,y_train,cv =10,scoring='neg_mean_squared_error')
scores,scores.mean(),scores.std()

(array([-4.30092727e+09, -5.14251884e+09, -4.64347170e+09, -4.46966489e+09,
        -4.82196640e+09, -4.30865752e+09, -4.33772031e+09, -4.88577660e+09,
        -5.34623416e+09, -4.85867228e+09]),
 -4711560998.797625,
 344466959.0682897)

In [67]:
clf = Pipeline([('pre',preprocessor),
               ('cls',DecisionTreeRegressor())])

scores = cross_val_score(clf,X_train,y_train,cv =10,scoring='neg_mean_squared_error')
scores,scores.mean(),scores.std()

(array([-4.46568298e+09, -4.74624436e+09, -4.74467555e+09, -5.03198614e+09,
        -4.72130567e+09, -4.57477463e+09, -3.87087452e+09, -5.07394359e+09,
        -4.79190065e+09, -4.65437468e+09]),
 -4667576277.486812,
 318227328.455716)

In [70]:
#each score is the mse
clf = Pipeline([('pre',preprocessor),
               ('cls',RandomForestRegressor())])

scores = cross_val_score(clf,X_train,y_train,cv =10,scoring='neg_mean_squared_error')
-scores,np.sqrt(-scores).mean(),scores.std()



(array([2.44050574e+09, 2.88401339e+09, 2.56073155e+09, 2.59102032e+09,
        2.73240511e+09, 2.32652950e+09, 2.28971701e+09, 2.81951112e+09,
        2.85450666e+09, 2.81819213e+09]),
 51258.10342457015,
 211382759.68603936)

### Grid Search

In [73]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection  import cross_val_score
from sklearn.base import BaseEstimator,TransformerMixin
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)
#X = X.drop('ocean_proximity',axis=1)

y = df['median_house_value'].values

#X,y
X.head(0)

X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
X_train.shape,X_test.shape,y_train ,y_test


num_features = ['total_bedrooms']
num_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

cat_features = ['ocean_proximity']
cat_transformer = Pipeline([('onehot',OneHotEncoder())])


preprocessor = ColumnTransformer([('num',num_transformer,num_features),
                                  ('cat',cat_transformer,cat_features)],remainder='passthrough')




In [74]:
# n_estimators='warn',
# criterion='mse',
# max_depth=None,
    
clf = Pipeline([('pre',preprocessor),
               ('cls',RandomForestRegressor())])



In [85]:
param_grid = {
    'cls__n_estimators': [5,15],
    #'cls__criterion': ['mse','mae'],
    'cls__max_depth': [5,7]
}

grid_search = GridSearchCV(clf,param_grid,cv=10,scoring='neg_mean_squared_error')
grid_search.fit(X_train,y_train)

KeyboardInterrupt: 

In [84]:
print(grid_search.best_params_)

results = grid_search.cv_results_
for mean_score,params in zip(results['mean_test_score'],results['params']):
    print(mean_score,params)

{'cls__max_depth': 7, 'cls__n_estimators': 15}
-4657569949.600316 {'cls__max_depth': 5, 'cls__n_estimators': 5}
-4591369485.048018 {'cls__max_depth': 5, 'cls__n_estimators': 15}
-3804191297.1670146 {'cls__max_depth': 7, 'cls__n_estimators': 5}
-3703220733.365717 {'cls__max_depth': 7, 'cls__n_estimators': 15}


## polynomianl regression - find the best polynomianl

In [67]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection  import cross_val_score
from sklearn.base import BaseEstimator,TransformerMixin
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

In [68]:

#poly
#linear
#predic

df = pd.read_csv('housing.csv')
df.head()

X = df.drop('median_house_value',axis=1)

y = df['median_house_value'].values

#X,y
X.head(0)

num_features = ['total_bedrooms']
num_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

cat_features = ['ocean_proximity']
cat_transformer = Pipeline([('onehot',OneHotEncoder())])


preprocessor = ColumnTransformer([('num',num_transformer,num_features),
                                  ('cat',cat_transformer,cat_features)],remainder='passthrough')

x_clean = preprocessor.fit_transform(X)
x_clean


array([[1.2900e+02, 0.0000e+00, 0.0000e+00, ..., 3.2200e+02, 1.2600e+02,
        8.3252e+00],
       [1.1060e+03, 0.0000e+00, 0.0000e+00, ..., 2.4010e+03, 1.1380e+03,
        8.3014e+00],
       [1.9000e+02, 0.0000e+00, 0.0000e+00, ..., 4.9600e+02, 1.7700e+02,
        7.2574e+00],
       ...,
       [4.8500e+02, 0.0000e+00, 1.0000e+00, ..., 1.0070e+03, 4.3300e+02,
        1.7000e+00],
       [4.0900e+02, 0.0000e+00, 1.0000e+00, ..., 7.4100e+02, 3.4900e+02,
        1.8672e+00],
       [6.1600e+02, 0.0000e+00, 1.0000e+00, ..., 1.3870e+03, 5.3000e+02,
        2.3886e+00]])

In [69]:
X_train,X_test,y_train ,y_test = train_test_split(x_clean,y,test_size = 0.2,random_state = 42)
X_train.shape,X_test.shape,y_train ,y_test


((16512, 13),
 (4128, 13),
 array([103000., 382100., 172600., ..., 222100., 283500., 325000.]),
 array([ 47700.,  45800., 500001., ..., 500001.,  72300., 151500.]))

In [70]:
poly = PolynomialFeatures(2) # 3 is worst
x_poly_train = poly.fit_transform(X_train)

x_poly_test = poly.transform(X_test)

#results = grid_search.cv_results_
#for mean_score,params in zip(results['mean_test_score'],results['params']):
#    print(mean_score,params)

In [71]:
lin_poly = LinearRegression()
lin_poly.fit(x_poly_train,y_train)
lin_poly.coef_,lin_poly.intercept_

prediction_poly = lin_poly.predict(x_poly_train)
prediction_poly

prediction_poly_test = lin_poly.predict(x_poly_test)
prediction_poly_test

rmse_poly_train = np.sqrt(mse(prediction_poly,y_train))
rmse_poly_train

rmse_poly_test = np.sqrt(mse(prediction_poly_test,y_test))


rmse_poly_train, rmse_poly_test

(54482.17173435371, 114142.14922040507)

In [72]:
lin = LinearRegression()
lin.fit(X_train,y_train)
lin.coef_,lin.intercept_

prediction = lin.predict(X_train)
prediction

rmse_train = np.sqrt(mse(prediction,y_train))
rmse_train

prediction_test = lin.predict(X_test)
prediction_test

rmse_train_test = np.sqrt(mse(prediction_test,y_test))

rmse_train , rmse_train_test


(68433.93736666226, 70060.52184473486)