# Regression Assignment

In [9]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import math

### Training Data Loading

In [10]:
# Load the training set from the CSV file located next to the notebook.
train_data = pd.read_csv("trainHome_data.csv")
# Show the first five rows as the cell output to sanity-check the load.
train_data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [11]:
# Remove the 'id' and 'date' columns from the training frame.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
train_data = train_data.drop(columns=['id', 'date'], errors='ignore')

In [12]:
# Binary indicator features: 1 where the source column is strictly positive, 0 otherwise.
train_data['basement_present'] = train_data['sqft_basement'].gt(0).astype('int64')
train_data['renovated'] = train_data['yr_renovated'].gt(0).astype('int64')

In [13]:
# Columns treated as categorical and expanded into one indicator column per level.
categorial_cols = ['floors', 'view', 'condition', 'grade']

# One-hot encode all listed columns in a single call. Each source column is
# replaced by indicator columns named "<column>#<level>", appended after the
# remaining columns in the order given by `categorial_cols`.
train_data = pd.get_dummies(
    train_data,
    columns=categorial_cols,
    prefix_sep="#",
    drop_first=False,
)

In [14]:
# One-hot encode 'zipcode', keeping indicator columns only for a fixed subset
# of zip codes; every other zip code is represented by all-zero indicators.
zipcode_columns = ['zipcode#98004', 'zipcode#98102', 'zipcode#98109',
                   'zipcode#98112', 'zipcode#98039', 'zipcode#98040']

# The index is left untouched so the join below aligns on the same row labels
# as train_data (the former reset_index only produced a discarded extra column).
dummies_zipcodes = pd.get_dummies(train_data['zipcode'], prefix='zipcode',
                                  prefix_sep='#', drop_first=False)
# reindex (rather than plain column selection) yields an all-zero column when a
# listed zip code does not occur in the data, instead of raising KeyError.
dummies_zipcodes = dummies_zipcodes.reindex(columns=zipcode_columns, fill_value=0)
train_data = train_data.drop(columns='zipcode').join(dummies_zipcodes)

train_data.dtypes

price                 int64
bedrooms              int64
bathrooms           float64
sqft_living           int64
sqft_lot              int64
waterfront            int64
sqft_above            int64
sqft_basement         int64
yr_built              int64
yr_renovated          int64
lat                 float64
long                float64
sqft_living15         int64
sqft_lot15            int64
basement_present      int64
renovated             int64
floors#1.0            uint8
floors#1.5            uint8
floors#2.0            uint8
floors#2.5            uint8
floors#3.0            uint8
floors#3.5            uint8
view#0                uint8
view#1                uint8
view#2                uint8
view#3                uint8
view#4                uint8
condition#1           uint8
condition#2           uint8
condition#3           uint8
condition#4           uint8
condition#5           uint8
grade#3               uint8
grade#4               uint8
grade#5               uint8
grade#6             

In [None]:
def nominal_diff(nominal_train_data,nominal_test_obj):
    """Simple-matching dissimilarity between one object and every training row.

    For each training row the result is ``(p - m) / p`` where ``p`` is the
    number of attributes and ``m`` the number of attributes whose value equals
    the corresponding value of the test object.

    Parameters
    ----------
    nominal_train_data : array-like, 2-D (rows x attributes)
        Nominal attributes of the training objects.
    nominal_test_obj : array-like, 1-D (attributes)
        Nominal attributes of the single object being compared.

    Returns
    -------
    numpy.ndarray
        1-D float array with one dissimilarity in [0, 1] per training row.
    """
    train = np.asarray(nominal_train_data)
    test = np.asarray(nominal_test_obj)
    n_rows, p = train.shape
    if p == 0:
        # No attributes to compare: define the dissimilarity as 0 rather than 0/0.
        return np.zeros(n_rows, dtype=float)
    # Broadcasting compares the test object against every row at once.
    matches = (train == test).sum(axis=1)
    return (p - matches) / p

In [None]:
def binary_diff(binary_train_data,binary_test_obj):
    """Asymmetric binary dissimilarity between one object and every training row.

    For each training row the result is ``diff / (same_1 + diff)`` where
    ``same_1`` counts attributes that are 1 in both the row and the test
    object, and ``diff`` counts attributes whose values differ. Attributes
    that are 0 in both do not contribute.

    Parameters
    ----------
    binary_train_data : array-like, 2-D (rows x attributes)
        Binary (0/1) attributes of the training objects.
    binary_test_obj : array-like, 1-D (attributes)
        Binary (0/1) attributes of the single object being compared.

    Returns
    -------
    numpy.ndarray
        1-D float array with one dissimilarity per training row. Rows with no
        1-1 matches and no mismatches get 0.0 instead of a division by zero.
    """
    train = np.asarray(binary_train_data)
    test = np.asarray(binary_test_obj)
    # Counts are computed independently for every row (no carry-over between rows).
    same_1 = ((train == 1) & (test == 1)).sum(axis=1)
    diff = (train != test).sum(axis=1)
    denom = same_1 + diff
    # Where denom is 0, diff is 0 as well, so dividing by 1 yields the intended 0.0.
    safe_denom = np.where(denom > 0, denom, 1)
    return diff / safe_denom

In [None]:
def manhattan_distance(numeric_train_data,numeric_test_obj):
    """Manhattan (L1) distance between one object and every training row.

    Parameters
    ----------
    numeric_train_data : array-like, 2-D (rows x attributes)
        Numeric attributes of the training objects.
    numeric_test_obj : array-like, 1-D (attributes)
        Numeric attributes of the single object being compared.

    Returns
    -------
    numpy.ndarray
        1-D float array; element ``i`` is the sum of absolute attribute
        differences between training row ``i`` and the test object.
    """
    # Cast to float so integer inputs cannot truncate or overflow the result.
    train = np.asarray(numeric_train_data, dtype=float)
    test = np.asarray(numeric_test_obj, dtype=float)
    # Summing along axis 1 gives an independent total per row.
    return np.abs(train - test).sum(axis=1)

In [None]:
def euclidean_distance(numeric_train_data,numeric_test_obj):
    """Euclidean (L2) distance between one object and every training row.

    Parameters
    ----------
    numeric_train_data : array-like, 2-D (rows x attributes)
        Numeric attributes of the training objects.
    numeric_test_obj : array-like, 1-D (attributes)
        Numeric attributes of the single object being compared.

    Returns
    -------
    numpy.ndarray
        1-D float array; element ``i`` is the square root of the summed squared
        attribute differences between training row ``i`` and the test object.
    """
    # Cast to float so integer inputs cannot truncate or overflow the result.
    train = np.asarray(numeric_train_data, dtype=float)
    test = np.asarray(numeric_test_obj, dtype=float)
    # Summing along axis 1 gives an independent total per row.
    squared = (train - test) ** 2
    return np.sqrt(squared.sum(axis=1))

In [None]:
def calculate_distances(train_data,test_obj):
    """Build the combined distance tables for the Manhattan and Euclidean variants.

    Calls nominal_diff, binary_diff, manhattan_distance and euclidean_distance,
    then concatenates the nominal and binary results with each of the two
    numeric distance results.

    NOTE(review): the ``train_data`` and ``test_obj`` parameters are never read
    in this body. The names ``nominal_train_data``, ``nominal_test_obj``,
    ``binary_train_data``, ``binary_test_obj``, ``numeric_train_data`` and
    ``numeric_test_obj`` are not assigned here, so they must already exist as
    globals when this function runs - TODO confirm where they are defined, or
    derive them from the parameters.

    Returns:
        tuple: ``(manhattan, euclidean)`` - the two concatenated arrays.
    """
    # Per-type dissimilarities; inputs come from names outside this function (see note above).
    nominal=nominal_diff(nominal_train_data,nominal_test_obj)
    binary=binary_diff(binary_train_data,binary_test_obj)
    man_d=manhattan_distance(numeric_train_data,numeric_test_obj)
    eu_d=euclidean_distance(numeric_train_data,numeric_test_obj)
    
    # NOTE(review): np.concatenate with axis=1 needs inputs with at least two
    # dimensions - confirm the helpers return 2-D arrays, otherwise this raises.
    manhattan=np.concatenate((nominal,binary,man_d),axis=1)
    euclidean=np.concatenate((nominal,binary,eu_d),axis=1)
    
    return manhattan,euclidean

In [None]:
def KNN(train_data,test_obj,n):
    manhattan,euclidean=calculate_distances(train_data,test_obj)
    
    #manhattan=np.sort(manhattan)
    #euclidean=np.sort(euclidean)
    
    min_m_indexes=manhattan.argsort()[:n]
    min_e_indexes=euclidean.argsort()[:n]