In [2]:
#Imports: pandas for the table work, scipy for the euclidean distance
import pandas as pd
from scipy.spatial import distance

#Position of the listing whose price we want to predict (the "eleventh guy")
TARGET_ROW = 10
#Number of rows kept from the original file: the ten neighbours plus the target
N_ROWS = TARGET_ROW + 1
#Distance given to a listing that has a missing feature. The value is arbitrary; it only keeps the list of
#distances aligned with the rows of the dataframe
MISSING_DISTANCE = 1.5

#reading in the file
df = pd.read_csv("dc_airbnb.csv")

#Picking the columns we need, hence a double square bracket. copy() so the assignments below work on our own
#frame and not on a view of the file data
df = df[["bedrooms", "bathrooms", "beds", "price", "state"]].copy()

#Cleaning the price column, removing the $ sign and commas, because it cannot be converted to float otherwise.
#regex=False makes "$" a literal dollar sign; as a regular expression it would mean "end of string"
df["price"] = df["price"].str.replace("$", "", regex=False).str.replace(",", "", regex=False)
df["price"] = df["price"].astype(float)

#Breaking down the state column into one-hot encoded columns and adding them to our original df.
#dtype=int keeps the 0/1 columns numeric; axis is passed by keyword
df = pd.concat([df, pd.get_dummies(df["state"], dtype=int)], axis=1)
#Dropping state now, as we have the one-hot encoded columns
df = df.drop(columns=["state"])

#Trimming down the original df with 3000+ rows to 11 rows
df = df.iloc[:N_ROWS]

#Creating a copy of this df. Price is not dropped here, because we will need it later
df_1 = df.copy()

#Now dropping price from df, because it is the dependent variable and must not be used to predict itself
df = df.drop(columns=["price"])

#The chap we want to predict the price for. (We do have his real price as well, because he is part of the
#original df)
eleventh_guy = df.iloc[TARGET_ROW]

#Calculate the distance of the eleventh guy from every row in df, himself included (his own distance is 0).
#euclidean() cannot give a usable distance when a feature is NaN, so every row is checked on its own and a row
#with missing data gets MISSING_DISTANCE. This way one bad row never cuts the loop short, and there is always
#exactly one distance per row, in row order
distances = []
for x in range(df.shape[0]):
    row = df.iloc[x]
    if row.isna().any() or eleventh_guy.isna().any():
        distances.append(MISSING_DISTANCE)
    else:
        distances.append(distance.euclidean(row, eleventh_guy))

#Users is nothing more than the positions of the ten neighbours, in order, as a list
users = list(range(TARGET_ROW))

#Making that into a dictionary, with keys being the users and values being the distances of those users from our
#eleventh guy (zip stops after the ten neighbours, so the target's own distance is left out)
d = dict(zip(users, distances))

#Creating a new column in the df that still has the price, holding the distance for every user. The list is in
#row order, so it can be assigned directly
df_1["distances"] = distances

#Sorting the entire dataframe based on the distances column only
df_1 = df_1.sort_values('distances')

In [3]:
#Display the frame, now sorted by distance from the eleventh guy (closest first)
df_1

Unnamed: 0,bedrooms,bathrooms,beds,price,DC,MD,NY,VA,Washington DC,distances
10,2.0,1.5,2.0,97.0,0,1,0,0,0,0.0
8,1.0,1.5,1.0,38.0,0,1,0,0,0,1.414214
2,1.0,2.0,1.0,50.0,0,1,0,0,0,1.5
4,1.0,1.0,1.0,50.0,0,1,0,0,0,1.5
6,2.0,2.0,2.0,100.0,1,0,0,0,0,1.5
7,1.0,1.0,1.0,100.0,0,1,0,0,0,1.5
9,1.0,,1.0,71.0,0,1,0,0,0,1.5
0,1.0,1.0,2.0,160.0,1,0,0,0,0,1.802776
3,1.0,1.0,1.0,95.0,1,0,0,0,0,2.061553
1,3.0,3.0,3.0,350.0,1,0,0,0,0,2.5


In [4]:
#Now the idea is to predict the price for the 11th guy depending on the number of neighbors you choose.
#So if you chose k = 0, no one would come in. If k = 1, the neighbor closest to our eleventh guy (user 10)
#is user 8, whose price is 38, hence the predicted price for the 11th guy (user 10) would be 38.

#If k = 2, the closest neighbors based on distance are user 8 and user 2.
#However, users 4, 6, 7 and 9 have the same distance as user 2, hence we have to take those fellows into account too.
#So even though you specified that the number of neighbors you intend to take is 2, because the distance of the
#second closest guy is 1.5, which is also the case for users 4, 6, 7 and 9, we have to include them as well.

#So now our prediction would be the mean of the prices for users 8, 2, 4, 6, 7 and 9.



In [12]:

#Several users share the same distance, so first collapse the column to its distinct values
x = df_1["distances"].unique()
#A row counts as a neighbour when both conditions hold:
#  1. its distance is greater than 0, i.e. df_1["distances"] > 0
#  2. its distance lies within the cut-off selected by k
#Show the cut-off for k = 3, a blank gap, then the cut-off for k = 1
print(x[3], "\n", x[1], sep="\n")


1.8027756377319946


1.4142135623730951


In [14]:
#Now we write our function but before that play around with this one, to get what exactly the code should follow. 

k = 5
#Distinct distances found in the first ten rows of the sorted frame
x = list(df_1['distances'].iloc[:10].unique())
#Keep the rows farther than 0 but no farther than the k-th distinct distance
ok = df_1.loc[df_1["distances"].gt(0) & df_1["distances"].le(x[k])]
#Mean price of the selected rows is the prediction
ok["price"].mean()

111.3

In [23]:
def kn1(k, data=None):
    """Predict a price as the mean price of the k nearest distance groups.

    Neighbours are grouped by distinct distance value. All rows whose distance
    is greater than 0 and no larger than the k-th smallest distinct positive
    distance are selected, so rows tied at the cut-off are all included.
    Rows at distance 0 are treated as the query itself and are left out.

    Parameters
    ----------
    k : int
        Number of distinct distance levels to include. 0 selects no rows.
    data : pandas.DataFrame, optional
        Frame with a 'distances' and a 'price' column. Defaults to the
        notebook-level ``df_1``.

    Returns
    -------
    float
        Mean price of the selected rows; NaN when k is 0.

    Raises
    ------
    ValueError
        If k is negative.
    IndexError
        If k exceeds the number of distinct positive distances.
    """
    if data is None:
        data = df_1
    if k < 0:
        raise ValueError("k must be non-negative, got %r" % (k,))
    if k == 0:
        # No neighbours requested, hence nothing to average.
        return float('nan')

    # Every row is considered (no fixed row limit), and the query's own
    # zero distance is removed before ranking the distinct distances.
    neighbours = data[data['distances'] > 0]
    levels = sorted(neighbours['distances'].dropna().unique())
    if k > len(levels):
        raise IndexError(
            "k=%d exceeds the %d distinct neighbour distances available" % (k, len(levels))
        )

    cutoff = levels[k - 1]
    prd = neighbours.loc[neighbours['distances'] <= cutoff, 'price'].mean()
    return prd

In [24]:
#Display the sorted frame again for reference before checking the function
df_1

Unnamed: 0,bedrooms,bathrooms,beds,price,DC,MD,NY,VA,Washington DC,distances
10,2.0,1.5,2.0,97.0,0,1,0,0,0,0.0
8,1.0,1.5,1.0,38.0,0,1,0,0,0,1.414214
2,1.0,2.0,1.0,50.0,0,1,0,0,0,1.5
4,1.0,1.0,1.0,50.0,0,1,0,0,0,1.5
6,2.0,2.0,2.0,100.0,1,0,0,0,0,1.5
7,1.0,1.0,1.0,100.0,0,1,0,0,0,1.5
9,1.0,,1.0,71.0,0,1,0,0,0,1.5
0,1.0,1.0,2.0,160.0,1,0,0,0,0,1.802776
3,1.0,1.0,1.0,95.0,1,0,0,0,0,2.061553
1,3.0,3.0,3.0,350.0,1,0,0,0,0,2.5


In [25]:
#With k = 1 only the single closest neighbour (user 8) is used, so the prediction should be his price, 38
pred = kn1(1)
pred


38.0

In [26]:
#With k = 2, the second distinct distance (1.5) is shared by users 2, 4, 6, 7 and 9, so together with the
#nearest neighbour, user 8, six listings qualify. Their mean price, calculated manually:
(38+50+50+100+100+71)/6

68.16666666666667

In [27]:
#k = 2 through the function; this should equal the manual mean of the six prices
kn1(2)

68.16666666666667

In [None]:
#We are good to go. 

