In [65]:
import numpy as np
import pandas as pd

from sklearn.pipeline import FeatureUnion

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso

from sklearn.model_selection import train_test_split #to create model and validation data splits
from sklearn.model_selection import cross_val_score #to perform k fold cross validation

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
#rmse can be mathematically calculated with mse itself.

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
housing_train = pd.read_csv('housing_train.csv') #uploading the train data
housing_test = pd.read_csv('housing_test.csv') #uploading the test data

In [5]:
housing_train.drop_duplicates()
housing_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7536 entries, 0 to 7535
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Suburb        7536 non-null   object 
 1   Address       7536 non-null   object 
 2   Rooms         7536 non-null   int64  
 3   Type          7536 non-null   object 
 4   Price         7536 non-null   int64  
 5   Method        7536 non-null   object 
 6   SellerG       7536 non-null   object 
 7   Distance      7536 non-null   float64
 8   Postcode      7536 non-null   int64  
 9   Bedroom2      5977 non-null   float64
 10  Bathroom      5977 non-null   float64
 11  Car           5977 non-null   float64
 12  Landsize      5972 non-null   float64
 13  BuildingArea  3327 non-null   float64
 14  YearBuilt     3819 non-null   float64
 15  CouncilArea   5972 non-null   object 
dtypes: float64(7), int64(3), object(6)
memory usage: 942.1+ KB


In [7]:
housing_train.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Brunswick,52 Evans St,3,h,1650000,S,Nelson,5.2,3056,3.0,1.0,2.0,495.0,141.0,1920.0,Moreland
1,Reservoir,85 Radford Rd,5,h,791000,S,Ray,11.2,3073,4.0,3.0,1.0,961.0,,,Darebin
2,Newport,99 Anderson St,3,h,785000,S,RT,8.4,3015,3.0,1.0,1.0,185.0,,,Hobsons Bay
3,Brighton East,4/377 South Rd,2,u,755000,SP,Buxton,10.7,3187,,,,,,,
4,Hawthorn East,3 Jaques St,5,h,2500000,VB,RT,7.5,3123,5.0,3.0,3.0,757.0,240.0,1925.0,Boroondara


In [11]:
housing_train['SellerG'].values

array(['Nelson', 'Ray', 'RT', ..., 'Jellis', 'hockingstuart', 'Nelson'],
      dtype=object)

In [13]:
list = [housing_train['Suburb'].values]
list

[array(['Brunswick', 'Reservoir', 'Newport', ..., 'Camberwell', 'Richmond',
        'Brunswick West'], dtype=object)]

In [157]:
#creating function for pipeline to process the train and test data

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

class VarSelector(BaseEstimator, TransformerMixin):

    def __init__(self,feature_names):

        self.feature_names=feature_names


    def fit(self,x,y=None):

        return self

    def transform(self,X):

        return X[self.feature_names]

    def get_feature_names(self):

        return self.feature_names
    
    def get_feature_names_out(self, feature_names_out):

        return self.feature_names



class custom_fico(BaseEstimator,TransformerMixin):

    def __init__(self):

        self.feature_names=['fico']

    def fit(self,x,y=None):

        return self

    def transform(self,X):

        k=X['FICO.Range'].str.split('-',expand=True).astype(float)
        
        fico=0.5*(k[0]+k[1])
        return pd.DataFrame({'fico':fico})

    def get_feature_names(self):

        return self.feature_names
    
    def get_feature_names_out(self, feature_names_out):

        return self.feature_names

class custom_age_band(BaseEstimator,TransformerMixin):

    def __init__(self):

        self.feature_names=['age_band']

    def fit(self,x,y=None):

        return self

    def transform(self,X):

        k=X['age_band'].str.split('-',expand=True)
        k[0]=pd.to_numeric(k[0],errors='coerce')
        k[1]=pd.to_numeric(k[1],errors='coerce')
        age_band=0.5*(k[0]+k[1])
        age_band=np.where(X['age_band'].str[:2]=='71',71,age_band)
        return pd.DataFrame({'age_band':age_band})

    def get_feature_names(self):

        return self.feature_names
        
    def get_feature_names_out(self, feature_names_out):

        return self.feature_names

class custom_family_income(BaseEstimator,TransformerMixin):

    def __init__(self):

        self.feature_names=['fi']

    def fit(self,x,y=None):

        return self

    def transform(self,X):

        k=X['family_income'].str.replace(',','')
        k=k.str.replace('<','')
        k=k.str.replace('>=','')
        k=k.str.strip()
        k=k.str.replace('  ',' ')
        a=k.str.split(' ',expand=True)

        a[0]=pd.to_numeric(a[0],errors='coerce')
        a[1]=pd.to_numeric(a[1],errors='coerce')
        fi=0.5*(a[0]+a[1])

        fi=np.where(k=='35000',35000,fi)
        fi=np.where(k=='4000',4000,fi)

        return pd.DataFrame({'fi':fi})

    def get_feature_names(self):

        return self.feature_names
    
    def get_feature_names_out(self, feature_names_out):

        return self.feature_names


class string_clean(BaseEstimator, TransformerMixin):

    def __init__(self,replace_it='',replace_with=''):

        self.replace_it=replace_it
        self.replace_with=replace_with
        self.feature_names=[]

    def fit(self,x,y=None):

        self.feature_names=x.columns
        return self

    def transform(self,X):

        for col in X.columns:
            X[col]=X[col].str.replace(self.replace_it,self.replace_with)
        return X

    def get_feature_names(self):

        return self.feature_names
    
    def get_feature_names_out(self, feature_names_out):

        return self.feature_names



class convert_to_numeric(BaseEstimator, TransformerMixin):

    def __init__(self):

        self.feature_names=[]

    def fit(self,x,y=None):

        self.feature_names=x.columns
        return self

    def transform(self,X):

        for col in X.columns:
            X[col]=pd.to_numeric(X[col],errors='coerce')
        return X

    def get_feature_names(self):
        return self.feature_names
    
    def get_feature_names_out(self, feature_names_out):

        return self.feature_names




class get_dummies_Pipe(BaseEstimator, TransformerMixin):

    def __init__(self,freq_cutoff=0):

        self.freq_cutoff=freq_cutoff
        self.var_cat_dict={}
        self.feature_names=[]

    def fit(self,x,y=None):

        data_cols=x.columns

        for col in data_cols:

            k=x[col].value_counts()

            if (k<=self.freq_cutoff).sum()==0:
                cats=k.index[:-1]

            else:
                cats=k.index[k>self.freq_cutoff]

            self.var_cat_dict[col]=cats

        for col in self.var_cat_dict.keys():
            for cat in self.var_cat_dict[col]:
                ## Added a fix here due to duplicate columns generated
                if col+'_'+ cat not in self.feature_names:
                    self.feature_names.append(col+'_'+cat)

        return self

    def transform(self,x,y=None):
        dummy_data=x.copy()

        for col in self.var_cat_dict.keys():
            for cat in self.var_cat_dict[col]:
                name=col+'_'+cat
                dummy_data[name]=(dummy_data[col]==cat).astype(int)

            del dummy_data[col]

        return dummy_data

    def get_feature_names(self):

        return self.feature_names
    
    def get_feature_names_out(self, feature_names_out):

        return self.feature_names




class DataFrameImputer(BaseEstimator,TransformerMixin):

    def __init__(self):

        self.impute_dict={}
        self.feature_names=[]

    def fit(self, X, y=None):

        self.feature_names=X.columns

        for col in X.columns:
            if X[col].dtype=='O':
                self.impute_dict[col]='missing'
            else:
                self.impute_dict[col]=X[col].median()
        return self

    def transform(self, X, y=None):
        return X.fillna(self.impute_dict)

    def get_feature_names(self):

        return self.feature_names
    
    def get_feature_names_out(self, feature_names_out):

        return self.feature_names

class pdPipeline(Pipeline):

    def get_feature_names(self):

        last_step = self.steps[-1][-1]

        return last_step.get_feature_names()


In [11]:
housing_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7536 entries, 0 to 7535
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Suburb        7536 non-null   object 
 1   Address       7536 non-null   object 
 2   Rooms         7536 non-null   int64  
 3   Type          7536 non-null   object 
 4   Price         7536 non-null   int64  
 5   Method        7536 non-null   object 
 6   SellerG       7536 non-null   object 
 7   Distance      7536 non-null   float64
 8   Postcode      7536 non-null   int64  
 9   Bedroom2      5977 non-null   float64
 10  Bathroom      5977 non-null   float64
 11  Car           5977 non-null   float64
 12  Landsize      5972 non-null   float64
 13  BuildingArea  3327 non-null   float64
 14  YearBuilt     3819 non-null   float64
 15  CouncilArea   5972 non-null   object 
dtypes: float64(7), int64(3), object(6)
memory usage: 942.1+ KB


In [159]:
#creating pipeline
p1 = Pipeline(
    
[
('s1',VarSelector(['Type', 'Method', 'CouncilArea'])),
('s2',DataFrameImputer()),
('s3',get_dummies_Pipe())
]
)

p2 = Pipeline(

[
('s1',VarSelector(['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt'])),
('s2',DataFrameImputer())
]
)

In [161]:
#joining the pipeline together
union_pipe = FeatureUnion(
    [
    ('pipe1',p1),
    ('pipe2',p2)
    ]
)

In [163]:
union_pipe.fit(housing_train) #fitting the train data

In [165]:
x_train = pd.DataFrame(data = union_pipe.fit_transform(housing_train),    #transforming the train data
                       columns = union_pipe.get_feature_names_out())
x_train

Unnamed: 0,pipe1__Type_h,pipe1__Type_u,pipe1__Method_S,pipe1__Method_PI,pipe1__Method_SP,pipe1__Method_VB,pipe1__CouncilArea_missing,pipe1__CouncilArea_Boroondara,pipe1__CouncilArea_Moreland,pipe1__CouncilArea_Moonee Valley,...,pipe1__CouncilArea_Kingston,pipe2__Rooms,pipe2__Distance,pipe2__Postcode,pipe2__Bedroom2,pipe2__Bathroom,pipe2__Car,pipe2__Landsize,pipe2__BuildingArea,pipe2__YearBuilt
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,3.0,5.2,3056.0,3.0,1.0,2.0,495.0,141.0,1920.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,11.2,3073.0,4.0,3.0,1.0,961.0,121.0,1965.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,8.4,3015.0,3.0,1.0,1.0,185.0,121.0,1965.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,2.0,10.7,3187.0,3.0,1.0,1.0,329.5,121.0,1965.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,5.0,7.5,3123.0,5.0,3.0,3.0,757.0,240.0,1925.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7531,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,6.4,3011.0,1.0,1.0,0.0,0.0,121.0,2007.0
7532,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,3.0,11.5,3046.0,3.0,1.0,1.0,705.0,121.0,1965.0
7533,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,4.0,7.8,3124.0,3.0,1.0,1.0,329.5,121.0,1965.0
7534,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,2.6,3121.0,2.0,2.0,1.0,135.0,121.0,1965.0


In [89]:
x_test = pd.DataFrame(data = union_pipe.fit_transform(housing_test),   #transforming the test data
                       columns = union_pipe.get_feature_names_out())

In [91]:
housing_test

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,6/241 Nicholson St,1,u,S,Biggin,2.5,3067,1.0,1.0,1.0,0.0,,,Yarra
1,Abbotsford,403/609 Victoria St,2,u,S,Dingle,2.5,3067,,,,,,,
2,Abbotsford,106/119 Turner St,1,u,SP,Purplebricks,2.5,3067,,,,,,,
3,Abbotsford,22 Park St,4,h,S,Biggin,2.5,3067,,,,,,,
4,Abbotsford,78 Yarra St,3,h,S,LITTLE,2.5,3067,2.0,1.0,1.0,138.0,105.0,1890.0,Yarra
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,Keilor Park,1 Latrose St,3,h,PI,Nelson,14.5,3042,,,,,,,
1881,Keilor Park,35 Spence St,3,h,S,Nelson,14.5,3042,3.0,2.0,2.0,554.0,129.0,1980.0,Brimbank
1882,Gardenvale,66A Gardenvale Rd,3,h,PI,Nick,9.8,3185,3.0,2.0,2.0,236.0,193.0,2004.0,Glen Eira
1883,Gardenvale,2/32 Gardenia Rd,2,u,SP,Biggin,9.8,3185,2.0,1.0,1.0,0.0,65.0,1970.0,Glen Eira


In [93]:
y_train = housing_train['Price']

In [95]:
y_train.shape

(7536,)

In [97]:
x_train.shape

(7536, 34)

In [99]:
x_test.shape

(1885, 34)

In [101]:
linear_model_1 = LinearRegression() #initiating a linear regression model

In [117]:
linear_model_1.fit(x_train,y_train) #fitting the linear model

In [119]:
y_test_predict = linear_model_1.predict(x_test) #predicting on test data

In [121]:
len(y_test_predict)

1885

In [123]:
y_train_predict = linear_model_1.predict(x_train)

In [125]:
mean_absolute_error(y_train,y_train_predict)

267671.3125104241

In [127]:
rmse = mean_squared_error(y_train,y_train_predict)**0.5

In [129]:
rmse

405567.1334500874

In [47]:
212467/405567.1334500874

0.5238762771346412

In [131]:
x_train_model, x_validation,y_train_model, y_validation = train_test_split(x_train, y_train,train_size = 0.8)


In [133]:
linear_model = LinearRegression()
linear_model.fit(x_train_model,y_train_model)

In [135]:
y_validation_predict = linear_model.predict(x_validation)

In [137]:
y_validation_predict

array([1235328.29131395, 1004725.74666997,  968541.78386866, ...,
       1783918.05700388,  123935.5935796 ,  627261.25878839])

In [139]:
r2_score(y_validation,y_validation_predict)

0.6302011902439937

In [141]:
mean_squared_error(y_validation,y_validation_predict)**0.5

393762.5710073849

In [143]:
212467/393762.5710073849

0.5395815032811111

In [74]:
y_test_predict = linear_model.predict(x_test)

In [76]:
y_test_predict

array([ 475100.71556551,  867342.23277219,  533426.09987218, ...,
       1327958.19646622,  504688.44899902, 1212681.16987305])

In [78]:
Price = y_test_predict

In [80]:
Price

array([ 475100.71556551,  867342.23277219,  533426.09987218, ...,
       1327958.19646622,  504688.44899902, 1212681.16987305])

In [145]:
rf_model = RandomForestRegressor() #default implementation

#fit the model on train data (80% here)
rf_model.fit(x_train_model,y_train_model)

y_validation_prediction = rf_model.predict(x_validation)

In [146]:
mean_squared_error(y_validation,y_validation_prediction)**0.5

292695.41064310755

In [149]:
212467/292695.41064310755

0.7258979549189704

In [151]:
y_test_predict_final = rf_model.predict(x_test)

In [153]:
y_test_predict_final

array([ 431795.83333333,  733015.66666667,  417035.        , ...,
       1428040.        ,  483445.        , 1137560.        ])

In [155]:
for i in range(0, len(y_test_predict_final)):
    print(y_test_predict_final[i])

431795.83333333326
733015.6666666667
417035.0
1653576.666666667
1156650.0
1182368.3928571427
1007898.4722222222
465211.83333333326
435180.0
1154235.0
983913.3333333335
515434.0
1113970.0
687931.6666666667
686025.8571428572
651976.6642246642
471430.0
623419.7619047619
1017195.0
597581.076923077
918296.25
645585.0
882105.0
489155.0
687533.076923077
583466.0
714610.0
741670.0
688923.0
481067.5
690924.1581196582
2247790.0
1231465.0
1504950.0
2003286.4761904762
1816558.0
1367619.0
2246870.0
758564.619047619
1966361.0
2578855.0
825841.5
496697.0
1722980.0
512677.87499999994
834401.0
989010.0
526665.0
1197254.98
891845.6666666665
751120.8333333333
736116.6666666665
812149.1666666665
710502.2619047619
688525.0
824504.83
710502.2619047619
777223.7038655789
630575.0
777223.7038655789
802875.0
907507.3233333332
894209.5
697520.0
660890.0
704551.3571428572
605821.0
756155.0
2405270.0
435162.33333333326
694152.1904761904
1392420.0
1618853.0
791745.0
937175.0
635690.0
1100530.0
998959.1666666669
118

In [169]:
len(y_test_predict_final)

1885

In [171]:
len(x_test)

1885