In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [77]:
'''To read data from diamonds.csv'''
headers = ["carat", "cut", "color", "clarity", "depth", "table", "price", "x", "y", "z"]

# Use np.nan: it exists in every NumPy release, whereas the np.NaN alias was
# removed in NumPy 2.0 and raises AttributeError there.
# header=None makes pandas treat every line of the file as data and label the
# columns with `headers`.
data = pd.read_csv('Regression_Diamonds_data/diamonds.csv', na_values=np.nan,
                   header=None, names=headers)

# Replace whatever index read_csv produced with a plain 0..n-1 range.
data = data.reset_index(drop=True)

# Discard the first row -- presumably the file's own column-name line, which
# header=None read in as data (TODO confirm against the CSV).
data = data.iloc[1:]

# Keep only the first 1000 remaining rows.
data = data[:1000]

print(data.describe())
print(data.head())

        carat    cut  color clarity  depth  table  price      x      y      z
count   53940  53940  53940   53940  53940  53940  53940  53940  53940  53940
unique    273      5      7       8    184    127  11602    554    552    375
top       0.3  Ideal      G     SI1     62     56    605   4.37   4.34    2.7
freq     2604  21551  11292   13065   2239   9881    132    448    437    767
  carat      cut color clarity depth table price     x     y     z
1  0.23    Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
2  0.21  Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
3  0.23     Good     E     VS1  56.9    65   327  4.05  4.07  2.31
4  0.29  Premium     I     VS2  62.4    58   334   4.2  4.23  2.63
5  0.31     Good     J     SI2  63.3    58   335  4.34  4.35  2.75


In [78]:
'''Check for NaNs'''
# Count the missing values of every listed column in one pass, then report
# them one per line in the order given by `headers`.
missing_per_column = data[headers].isnull().sum()
for column_name, n_missing in missing_per_column.items():
    print(column_name, "NaN :", n_missing)
    


carat NaN : 0
cut NaN : 0
color NaN : 0
clarity NaN : 0
depth NaN : 0
table NaN : 0
price NaN : 0
x NaN : 0
y NaN : 0
z NaN : 0


In [79]:
'''Convert categorical attributes to numeric'''
# Labels of each categorical column, written in the order of the integer code
# they receive (first label -> 0, second -> 1, ...).
cut_levels = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
clarity_levels = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]
color_levels = ["J", "I", "H", "G", "F", "E", "D"]

# Nested mapping {column: {label: code}} in the form DataFrame.replace accepts.
cat_to_num = {
    "cut": {label: code for code, label in enumerate(cut_levels)},
    "clarity": {label: code for code, label in enumerate(clarity_levels)},
    "color": {label: code for code, label in enumerate(color_levels)},
}

data = data.replace(cat_to_num)
print(data.head())
# Disabled alternative kept from the original cell: one-hot encode `color`.
# data = pd.get_dummies(data,columns=['color'])

  carat  cut  color  clarity depth table price     x     y     z
1  0.23    4      5        1  61.5    55   326  3.95  3.98  2.43
2  0.21    3      5        2  59.8    61   326  3.89  3.84  2.31
3  0.23    1      5        4  56.9    65   327  4.05  4.07  2.31
4  0.29    3      1        3  62.4    58   334   4.2  4.23  2.63
5  0.31    1      0        1  63.3    58   335  4.34  4.35  2.75


In [80]:
# Convert every listed column to a numeric dtype, downcasting to the smallest
# float type that can hold the values, and rebuild the frame in a single step.
data = data.assign(
    **{column: pd.to_numeric(data[column], downcast='float') for column in headers}
)

# data

In [81]:
'''Separating X & y'''
# Target: the price column. Features: every remaining column.
y = data['price']
X = data.drop(columns=['price'])
print(X.head())
print(y.head())

   carat  cut  color  clarity      depth  table     x     y     z
1   0.23  4.0    5.0      1.0  61.500000   55.0  3.95  3.98  2.43
2   0.21  3.0    5.0      2.0  59.799999   61.0  3.89  3.84  2.31
3   0.23  1.0    5.0      4.0  56.900002   65.0  4.05  4.07  2.31
4   0.29  3.0    1.0      3.0  62.400002   58.0  4.20  4.23  2.63
5   0.31  1.0    0.0      1.0  63.299999   58.0  4.34  4.35  2.75
1    326.0
2    326.0
3    327.0
4    334.0
5    335.0
Name: price, dtype: float32


In [82]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
TEST_FRACTION = 0.20
SPLIT_SEED = 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# TODO: add an experiment that standardizes the features before computing distances.

In [110]:
def minkowski_dist(x1, x2, p=1):
    """Minkowski distance of order ``p`` between two equal-length vectors.

    Computes ``(sum_i |x1[i] - x2[i]| ** p) ** (1 / p)``; ``p=1`` is the
    Manhattan distance and ``p=2`` the Euclidean distance.

    Parameters
    ----------
    x1, x2 : array-like of numbers
        Lists, tuples, NumPy arrays or pandas Series. Values are read by
        position, so Series index labels are ignored.
    p : int or float, default 1
        Order of the distance. ``p=0`` raises ``ZeroDivisionError``.

    Returns
    -------
    float
        The distance; ``0.0`` for two empty inputs.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Converting to arrays makes element access purely positional. Indexing a
    # string-labelled Series with an integer key relies on a positional
    # fallback that recent pandas releases deprecate.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    if a.shape != b.shape:
        # Without this check a shorter/longer operand would be silently
        # broadcast or truncated instead of reported.
        raise ValueError(
            "x1 and x2 must have the same shape, got %s and %s" % (a.shape, b.shape)
        )
    total = np.sum(np.abs(a - b) ** p)
    return float(total ** (1.0 / p))

# print(minkowski_dist(X.iloc[0],X.iloc[1],2))

In [None]:
# One DataFrame per test row, in X_test order; filled by cal_all_dists().
all_dists = []


def cal_all_dists(p=1):
    """Rank every training row by its distance to each test row.

    For each row of ``X_test`` the Minkowski distance of order ``p`` to every
    row of ``X_train`` is computed with ``minkowski_dist``. The distances are
    stored in a one-column DataFrame (``'dist'``) indexed by ``y_train.index``
    and sorted in ascending order, and that frame is appended to the
    module-level ``all_dists`` list.

    The list is emptied first, so running the cell again replaces the
    previous results instead of appending duplicates to them.

    Parameters
    ----------
    p : int or float, default 1
        Order passed through to ``minkowski_dist``.

    Returns
    -------
    list of pandas.DataFrame
        The same ``all_dists`` list, returned for convenience.
    """
    all_dists.clear()

    # Pull the feature values out once as plain arrays: a label lookup with
    # .loc inside the double loop would build a new Series for every pair.
    # Assumes all feature columns are numeric at this point.
    train_rows = X_train.to_numpy()
    for test_row in X_test.to_numpy():
        cur_dists = [minkowski_dist(test_row, train_row, p) for train_row in train_rows]
        df_cur_dists = pd.DataFrame(data=cur_dists, columns=['dist'], index=y_train.index)
        all_dists.append(df_cur_dists.sort_values(by=['dist'], axis=0))
    return all_dists


cal_all_dists()
# Show a short summary; printing the whole list would dump every ranked frame.
print(len(all_dists), "test rows ranked")
if all_dists:
    print(all_dists[0].head())


In [105]:
# Inspect the feature values of the first row in the test split.
first_test_row = X_test.iloc[0]
print(first_test_row)

carat       0.330000
cut         4.000000
color       2.000000
clarity     2.000000
depth      61.700001
table      55.000000
x           4.430000
y           4.460000
z           2.740000
Name: 2715, dtype: float32
