In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn import model_selection as mod
from sklearn import impute as imp
from sklearn import pipeline as pip
from sklearn import preprocessing as pre
from sklearn import compose as com
from sklearn import linear_model as lin
from sklearn import tree
from sklearn.experimental import enable_iterative_imputer
from sklearn import svm
from sklearn import metrics as met
from sklearn import ensemble as ens

from xgboost import XGBRegressor

In [3]:
# Load the training and test CSV files into DataFrames and show their shapes
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
df_train.shape, df_test.shape

((18000, 22), (9247, 21))

In [4]:
# Preview the first five training rows
df_train.head(5)

Unnamed: 0,ID,Distance,Date,SellerG,Propertycount,Car,Method,CouncilArea,Suburb,Bathroom,...,Type,Postcode,BuildingArea,Lattitude,Bedroom2,Landsize,YearBuilt,Longtitude,Rooms,Price
0,0,28.8,4/11/2017,Barry,3533.0,3.0,S,Casey City Council,Doveton,1.0,...,h,3177.0,,-37.9941,3.0,,,145.2329,3.0,614000.0
1,1,25.9,25/11/2017,HAR,5812.0,2.0,S,Whittlesea City Council,Mernda,2.0,...,h,3754.0,,-37.6132,4.0,,,145.0903,4.0,555000.0
2,2,11.2,24/09/2016,Ray,21650.0,,S,Darebin City Council,Reservoir,1.0,...,h,3073.0,,-37.705,3.0,771.0,1965.0,145.0035,4.0,801000.0
3,3,3.3,12/11/2016,Williams,14887.0,1.0,S,Melbourne City Council,South Yarra,1.0,...,u,3141.0,,-37.8356,1.0,630.0,,144.9773,1.0,555000.0
4,4,,25/02/2017,Trimson,7570.0,1.0,S,Maribyrnong City Council,Footscray,1.0,...,u,3011.0,,-37.8012,2.0,0.0,,144.8964,2.0,360000.0


In [5]:
# Number of missing values per training column
df_train.isna().sum()

ID                   0
Distance           182
Date               209
SellerG            188
Propertycount      174
Car               4697
Method             179
CouncilArea        168
Suburb             170
Bathroom          4436
Regionname         186
Address            189
Type               215
Postcode           211
BuildingArea     11125
Lattitude         4311
Bedroom2          4423
Landsize          6207
YearBuilt        10138
Longtitude        4317
Rooms              150
Price                0
dtype: int64

In [6]:
# Number of missing values per test column
df_test.isna().sum()

ID                  0
Distance           82
Date              107
SellerG            93
Propertycount      95
Car              2354
Method             74
CouncilArea       115
Suburb             90
Bathroom         2217
Regionname         93
Address            96
Type              104
Postcode           88
BuildingArea     5590
Lattitude        2164
Bedroom2         2222
Landsize         3225
YearBuilt        5121
Longtitude       2153
Rooms              92
dtype: int64

In [7]:
# For every categorical (object) column, report how many distinct values it has
# and how often its most frequent value occurs.
for column in df_train.select_dtypes(include='object').columns:
    counts = df_train[column].value_counts()
    # value_counts() is sorted in descending order, so the first row holds the
    # modal count. Use .iloc for positional access: plain [0] is a label lookup
    # on a string-indexed Series and only worked via a deprecated positional
    # fallback. An all-missing column yields an empty result, reported as 0.
    top_count = counts.iloc[0] if len(counts) else 0
    print('for column',column, 'has -->', len(counts), 'unique','-->max' , top_count )

for column Date has --> 78 unique -->max 579
for column SellerG has --> 317 unique -->max 1768
for column Method has --> 5 unique -->max 11407
for column CouncilArea has --> 33 unique -->max 1642
for column Suburb has --> 339 unique -->max 472
for column Regionname has --> 8 unique -->max 5557
for column Address has --> 17599 unique -->max 3
for column Type has --> 3 unique -->max 11979


In [8]:
# Date -> Year and Month, then drop Date, for both df_train and df_test
def _split_sale_date(frame):
    """Return a copy of `frame` with `Date` replaced by numeric `Year` and `Month`.

    `Date` is parsed once as a day-first date. Rows whose `Date` is missing
    produce NaN in both new columns, which is why they come out as floats.
    The new columns are appended after the existing ones.
    """
    sale_date = pd.to_datetime(frame['Date'], dayfirst=True)
    with_parts = frame.assign(Year=sale_date.dt.year, Month=sale_date.dt.month)
    return with_parts.drop(columns='Date')


df_train = _split_sale_date(df_train)
df_test = _split_sale_date(df_test)

In [9]:
# Inspect the derived Month column
df_train.loc[:, 'Month']

0        11.0
1        11.0
2         9.0
3        11.0
4         2.0
         ... 
17995     8.0
17996     7.0
17997     8.0
17998     4.0
17999    11.0
Name: Month, Length: 18000, dtype: float64

In [10]:
# Remove the identifier, address and coordinate columns from both frames
dropped_columns = ['ID', 'Address', 'Lattitude', 'Longtitude']
df_train = df_train.drop(columns=dropped_columns)
df_test = df_test.drop(columns=dropped_columns)

In [11]:
# Re-check the categorical (object) columns: distinct-value count and the
# frequency of the most common value for each.
for column in df_train.select_dtypes(include='object').columns:
    counts = df_train[column].value_counts()
    # value_counts() is sorted in descending order, so the first row holds the
    # modal count. Use .iloc for positional access: plain [0] is a label lookup
    # on a string-indexed Series and only worked via a deprecated positional
    # fallback. An all-missing column yields an empty result, reported as 0.
    top_count = counts.iloc[0] if len(counts) else 0
    print('for column',column, 'has -->', len(counts), 'unique','-->max' , top_count )

for column SellerG has --> 317 unique -->max 1768
for column Method has --> 5 unique -->max 11407
for column CouncilArea has --> 33 unique -->max 1642
for column Suburb has --> 339 unique -->max 472
for column Regionname has --> 8 unique -->max 5557
for column Type has --> 3 unique -->max 11979


In [12]:
# Separate the target (Price) from the predictor columns
y = df_train['Price']
X = df_train.drop(columns='Price')
X.shape, y.shape

((18000, 18), (18000,))

In [13]:
# Names of the non-object (numeric) predictor columns
X.select_dtypes(exclude=['object']).columns

Index(['Distance', 'Propertycount', 'Car', 'Bathroom', 'Postcode',
       'BuildingArea', 'Bedroom2', 'Landsize', 'YearBuilt', 'Rooms', 'Year',
       'Month'],
      dtype='object')

In [14]:
# Numeric column groups, presumably earmarked for iterative vs. KNN imputation
# (going by the names; verify where they are consumed).
columns_for_itr = [
    'YearBuilt',
    'BuildingArea',
]
columns_for_knn = [
    'Distance',
    'Propertycount',
    'Postcode',
    'Landsize',
    'Rooms',
    'Year',
    'Month',
]

In [16]:
# Randomized hyper-parameter search for a RandomForest regressor fed by
# one-hot-encoded categoricals and scaled, KNN-imputed numeric columns.

# Resolve the column groups once from X so both transformers use a fixed selection.
categorical_columns = X.select_dtypes(include='object').columns
numeric_columns = X.select_dtypes(exclude='object').columns

# Categoricals: fill gaps with the literal 'missing', then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
pipe_cat = pip.Pipeline([
    ("imp", imp.SimpleImputer(strategy="constant", fill_value='missing')),
    ("encoder", pre.OneHotEncoder(handle_unknown='ignore')),
])

# Alternative numeric pipeline using regression-based iterative imputation.
# It is defined here but NOT wired into the ColumnTransformer below.
pipe_itr = pip.Pipeline([
    ("scaler", pre.StandardScaler()),
    ("imp", imp.IterativeImputer(estimator=lin.LinearRegression())),
])

# Numerics: standardise first so the KNN distances are not dominated by
# large-magnitude columns, then impute from the nearest neighbours.
pipe_knn = pip.Pipeline([
    ("scaler", pre.StandardScaler()),
    ("imp", imp.KNNImputer()),
])

ct_raw = com.ColumnTransformer([
    ('cat', pipe_cat, categorical_columns),
    ('knn', pipe_knn, numeric_columns),
])

pipe_model = pip.Pipeline([
    ('ct', ct_raw),
    ('est', ens.RandomForestRegressor(random_state=42)),
])

# 3 x 2 x 3 = 18 candidate combinations.
params = {
    'ct__knn__imp__n_neighbors': [3, 4, 7],
    'ct__knn__imp__weights': ['uniform', 'distance'],
    'est__n_estimators': [100, 200, 500],
}

# RandomizedSearchCV evaluates only a random subset of the candidates
# (n_iter defaults to 10). Seeding the sampler makes that subset, and hence
# best_params_ / best_score_, repeatable across kernel restarts.
grid_knn_rndForest = mod.RandomizedSearchCV(
    pipe_model,
    params,
    scoring='neg_root_mean_squared_error',
    cv=3,
    random_state=42,
)
grid_knn_rndForest.fit(X, y)

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('ct',
                                              ColumnTransformer(transformers=[('cat',
                                                                               Pipeline(steps=[('imp',
                                                                                                SimpleImputer(fill_value='missing',
                                                                                                              strategy='constant')),
                                                                                               ('encoder',
                                                                                                OneHotEncoder(handle_unknown='ignore'))]),
                                                                               Index(['SellerG', 'Method', 'CouncilArea', 'Suburb', 'Regionname', 'Type'], dtype='object')),
                                                  

In [17]:
# Best cross-validated score found by the search. The scorer is the negated
# RMSE, so values closer to zero are better.
grid_knn_rndForest.best_score_

-316483.19928513764

In [18]:
# Parameter combination that achieved the best score in the search
grid_knn_rndForest.best_params_

{'est__n_estimators': 500,
 'ct__knn__imp__weights': 'uniform',
 'ct__knn__imp__n_neighbors': 3}

In [19]:
# Predict prices for the test set with the fitted search object
y_pred = grid_knn_rndForest.predict(df_test)
# Show NumPy floats in plain decimal notation rather than scientific notation
np.set_printoptions(suppress=True)

In [20]:
# Wrap the predictions in a single-column frame whose index is labelled 'ID'.
# NOTE(review): the index is the default 0..n-1 row position, not the ID column
# that was dropped from df_test earlier. Confirm the two coincide.
df = pd.DataFrame({'Price': y_pred}).rename_axis('ID')

In [21]:
# Write the submission file; the 'ID' index is written as the first column
df.to_csv("OlcayCezayirSubmissionV10.csv", index=True)