In [1]:
from sklearn.metrics import mean_squared_error
from math import sqrt # rms = sqrt(mean_squared_error(y_true, y_predicted))
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd

# Load the REW listings dataset.
rew = pd.read_csv("REW_dataset.csv")


def _column_span(names, first, last):
    """Return the entries of `names` from `first` through `last`, inclusive."""
    start = names.index(first)
    stop = names.index(last)
    return names[start:stop + 1]


# Column names in file order; each group below is a contiguous run of
# columns identified by its first and last column name.
cols = list(rew.columns)

# House-type columns.
type_list = _column_span(cols, 'House', 'Multifamily')

# Area columns.
area_list = _column_span(cols, 'Whalley', 'Pitt Meadows Rural')

# Postal-code columns.
postal_list = _column_span(cols, 'V3S', 'V0Y')

# Feature columns.
features_list = _column_span(cols, 'Drapes/window Coverings', 'Vacuum Blt. In')

In [2]:
# Columns used for the recommendation: price, listing id, the basic numeric
# attributes, and the house-type columns held in type_list.
# (An area-based subset was sketched here originally but is not implemented.)
req_cols = ['price', 'listing_id', 'bed', 'bath', 'area_sqft', 'fireplaces', *type_list]

# Narrow the listings frame to just those columns.
rew = rew[req_cols]

# Preview of the data used for the recommendation.
rew.head(2)

Unnamed: 0,price,listing_id,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
0,399800,R2333259,2,2.0,1208.0,1.0,0,0,1,0,0,0,0
1,2550000,R2277753,4,6.0,3312.0,1.0,1,0,0,0,0,0,0


In [3]:
# Input data creation: the single "query" property that similar listings
# will be searched for.

samp_price = 399800
samp_bed = 3
samp_bath = 2
samp_area_sqft = 1300
samp_fireplaces = 1

# use a drop down - type of house
samp_ptype = "House"

# Fail fast on an unknown type. Without this check a misspelt value would add
# a stray column that never matches a house-type column, leaving the query
# with every type flag set to 0 and no error raised.
if samp_ptype not in type_list:
    raise ValueError(
        "samp_ptype must be one of {}; got {!r}".format(type_list, samp_ptype)
    )

# One-row frame holding the numeric attributes of the query.
sample_df = pd.DataFrame(
    {
        "price": samp_price,
        "bed": samp_bed,
        "bath": samp_bath,
        "area_sqft": samp_area_sqft,
        "fireplaces": samp_fireplaces,
    },
    index=[0],
)

# The house types other than the selected one (kept for any cell that reads it).
type_cols = set(type_list) - {samp_ptype}

# One-hot encode the house type. Iterating type_list (rather than a
# hard-coded list of names) keeps the columns in the same order as the
# dataset's house-type columns, so no separate reorder step is needed.
for col in type_list:
    sample_df[col] = 1 if col == samp_ptype else 0

# Input DF
sample_df

Unnamed: 0,price,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
0,399800,3,2,1300,1,1,0,0,0,0,0,0


In [4]:
# Feature columns used for the similarity search, in a fixed order.
feature_cols = ['price', 'bed', 'bath', 'area_sqft', 'fireplaces'] + type_list

# Listings restricted to the feature columns and indexed by listing_id, so the
# id is kept as a row label instead of being treated as a feature.
sim_data = rew[['listing_id'] + feature_cols].set_index('listing_id')

# Query row stacked on top of all listings (raw, un-normalised values).
# Kept for any later cell that reads it; the scaling below does not use it.
sim_numeric = pd.concat([sample_df, sim_data], axis=0)

# Min-max scaling parameters are taken from the listings only. Including the
# query row would make the scaled data (and any model fitted and saved from
# it) depend on one particular query.
feature_min = sim_data.min()
feature_range = sim_data.max() - feature_min
# A column with a single distinct value has range 0; dividing by it would
# give 0/0 = NaN. Use 1 instead so such a column scales to 0.
feature_range = feature_range.where(feature_range != 0, 1)

# Normalised listings (same rows and index as sim_data).
sim_normal = (sim_data - feature_min) / feature_range

# Normalised query row, scaled with the same parameters as the listings.
normal_sample = ((sample_df[feature_cols] - feature_min) / feature_range).iloc[0, :]

In [9]:
from sklearn.neighbors import NearestNeighbors

# Fit the nearest-neighbour index on the normalised listings:
# 30 neighbours per query, Euclidean distance, search algorithm chosen
# automatically by scikit-learn.
nbrs = NearestNeighbors(
    n_neighbors=30,
    metric='euclidean',
    algorithm='auto',
).fit(sim_normal)

In [None]:
import pickle
# Persist the fitted neighbour model to knn_model.pkl so it can be reloaded
# later without refitting.
with open('knn_model.pkl', 'wb') as model_file:
    pickle.dump(nbrs, model_file)

In [11]:
# Reload the persisted neighbour model from knn_model.pkl.
# `pickle` is imported here so this cell does not depend on the save cell
# having been run in the current kernel.
import pickle

# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a knn_model.pkl that was written by this notebook or another trusted source.
with open('knn_model.pkl', 'rb') as f:
    loaded_model_knn = pickle.load(f)

In [12]:
# Query the reloaded model with the normalised sample row (a batch of one).
# kneighbors returns a (distances, indices) pair with one row per query.
vals = loaded_model_knn.kneighbors([normal_sample])

# Row positions of the neighbours for the single query, as a plain list.
neighbor_positions = vals[1]
indices = list(neighbor_positions[0])

In [30]:
# Re-display the query row so it can be compared with the neighbours found for it.
sample_df

Unnamed: 0,price,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
0,399800,3,2,1300,1,1,0,0,0,0,0,0


In [29]:
# KNN result: the listings at the neighbour positions returned by the model
# (only the first five rows are displayed).
sim_data.iloc[indices].head()

Unnamed: 0_level_0,price,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
R2322768,435000,3,2.0,1352.0,1.0,1,0,0,0,0,0,0
R2347609,599998,3,2.0,1350.0,1.0,1,0,0,0,0,0,0
R2329021,689500,3,2.0,1270.0,1.0,1,0,0,0,0,0,0
R2342852,688900,3,2.0,1254.0,1.0,1,0,0,0,0,0,0
R2338978,719000,3,2.0,1325.0,1.0,1,0,0,0,0,0,0


In [20]:
# Listing ids of every row in sim_data (taken from its index), as an array.
# NOTE(review): this is the full id list, not only the ids of the neighbours
# selected by `indices` — confirm which of the two is intended here.
req_list_ids = sim_data.index.values
print(req_list_ids)

['R2333259' 'R2277753' 'R2331580' ... 'R2335334' 'R2295410' 'R2338949']
