<a href="https://colab.research.google.com/github/patrick-seib/ontario-house-prices/blob/main/HousePriceDataCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
import folium

drive.mount('/content/drive/')
df = pd.read_csv('/content/drive/My Drive/416_Data/properties.csv')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


After loading the data in, it must be shaped such I'm able to work with it later.

In [21]:
df.drop(columns='Unnamed: 0', inplace=True)
df.rename(columns={'Price ($)':"Price"}, inplace=True)
print(df.head())
print(df.shape)

                            Address           AreaName   Price        lat  \
0       86 Waterford Dr Toronto, ON           Richview  999888  43.679882   
1  #80 - 100 BEDDOE DR Hamilton, ON     Chedoke Park B  399900  43.250000   
2    213 Bowman Street Hamilton, ON  Ainslie Wood East  479000  43.251690   
3      102 NEIL Avenue Hamilton, ON          Greenford  285900  43.227161   
4   #1409 - 230 King St Toronto, ON           Downtown  362000  43.651478   

         lng  
0 -79.544266  
1 -79.904396  
2 -79.919357  
3 -79.767403  
4 -79.368118  
(25351, 5)


In [22]:
def remove_outliers(data, *cols):
    for col in cols:
        q1 = data[col].quantile(q = 0.25)
        q3 = data[col].quantile(q = 0.75) 
        iqr = q3 - q1
        t1 = q1 - 3 * iqr
        t2 = q3 + 3 * iqr
        data = data[(data[col] > t1) & (data[col] < t2)]
    return data
    
data_c = remove_outliers(df, 'Price','lng', 'lat')

In [23]:
# Seperate data with/without null:
data_withna = data_c.loc[data_c['AreaName'].isna()]
data_wona = data_c.loc[data_c['AreaName'].notna()]
print(data_withna.shape, data_wona.shape)

# Split train/test dataset:
from sklearn.model_selection import train_test_split
training, testing = train_test_split(data_wona,test_size=0.25, random_state=1)

x_train = training.copy()[['lat','lng']]
y_train = training.copy()['AreaName']

x_test = testing.copy()[['lat','lng']]
y_test = testing.copy()['AreaName']

print(training.shape)
print(testing.shape)

(296, 5) (23497, 5)
(17622, 5)
(5875, 5)


In [24]:
# Build Random Forest Classifier workflow:
from sklearn.preprocessing import PowerTransformer, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier 
rf_model = RandomForestClassifier(criterion='gini',
                                  n_jobs=16,
                                  max_features = 'auto',
                                  n_estimators = 100,
                                  max_depth = None,
                                  random_state=133)
pipe_clf = Pipeline([
        ('sc',StandardScaler()),
        ('power_trans',PowerTransformer()),
        ('polynom_trans',PolynomialFeatures(degree=2)),
        ('rf_clf', rf_model)
        ])

# Fit model & Check the accuracy score of the random forest model:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
pipe_clf.fit(x_train,y_train)
y_predict = pipe_clf.predict(x_test)
print(f'accuracy score is: {accuracy_score(y_test,y_predict)}')

  warn(


accuracy score is: 0.918468085106383


In [25]:
data_withna['AreaName'] = pipe_clf.predict(data_withna[['lat','lng']])
data_withna.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_withna['AreaName'] = pipe_clf.predict(data_withna[['lat','lng']])


Unnamed: 0,Address,AreaName,Price,lat,lng
209,"7393 Wellington Rd 51 . Ariss, ON",Guelph,699000,43.596718,-80.34066
1186,"152 MARR Drive Elora, ON",Elora,399900,43.693314,-80.437492
2773,"17 Dan Sheehan Lane Clarington, ON",Bowmanville,550000,43.931091,-78.707405
2874,328 Wicklow Beach Rd Alnwick Haldimand Townshi...,Colborne,629000,43.976543,-77.962921
2936,"WALKER LAKE DR WALKER LAKE DR Lake Of Bays, ON",Huntsville,69900,45.378361,-79.092339


In [26]:
data_f = pd.concat([data_wona, data_withna])

data_g = data_f.groupby('AreaName').mean()
data_g['Count'] = data_f.groupby('AreaName').count()['lat']
mean_lat = data_g['lat'].mean()
mean_lng = data_g['lng'].mean()

data_g.sort_values('Price', ascending=False, inplace=True)

print(data_g)

                                          Price        lat        lng  Count
AreaName                                                                    
Winona                             1.595000e+06  43.213367 -79.656593      1
Armour Heights                     1.590000e+06  43.743423 -79.426872      1
Yonge and Lawrence                 1.589000e+06  43.728615 -79.402267      1
Oro Station                        1.525000e+06  44.423679 -79.549232      1
Old Mill                           1.524333e+06  43.651253 -79.493347      3
...                                         ...        ...        ...    ...
Manitowaning                       1.499900e+04  45.739494 -81.807777      1
Trillium Industrial Park           3.800000e+03  43.414257 -80.449989      1
Bishop Hellmuth Heritage District  1.883333e+03  42.996292 -81.251495      3
North End West                     0.000000e+00  43.269379 -79.865402      1
Port Lands                         0.000000e+00  43.647285 -79.351509      1

  data_g = data_f.groupby('AreaName').mean()


In [28]:
filename = 'cleanedHousePrices.csv'

data_g.to_csv('/content/drive/MyDrive/416_Data/' + filename)