### Step - 1: Load the data

In [38]:
import pandas as pd
import numpy as np

In [39]:
data=pd.read_csv(r'C:\Users\Sireesha Peruri\Downloads\diamonds (1).csv')

In [40]:
data.shape

(53940, 10)

In [41]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


### Step - 2: Identify input and output variables

In [42]:
X = data.drop('price', axis=1)
y = data['price']

In [43]:
X.shape,y.shape

((53940, 9), (53940,))

### Step - 3: Split the data - Test and Train 

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40455, 9), (13485, 9), (40455,), (13485,))

In [45]:
data['cut'].value_counts()

Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64

### Step - 4: Data Preprocessing on X_train

In [46]:
from sklearn.preprocessing import OrdinalEncoder

In [47]:
data['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [48]:
data['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [49]:
data['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [50]:
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_order = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_order = ['I1', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2', 'IF']
oe = OrdinalEncoder(categories = [cut_order, color_order, clarity_order])
oe

In [51]:
from sklearn.preprocessing import MinMaxScaler


In [52]:
minmax = MinMaxScaler().set_output(transform = 'pandas')
minmax

In [53]:
from sklearn.compose import ColumnTransformer

In [54]:
ct = ColumnTransformer([('oe', oe, ['cut','color','clarity']),
                        ('minmax',minmax,['depth','carat','table'])],
                       remainder = 'drop').set_output(transform = 'pandas')
ct

In [55]:
ct.fit_transform(data[:])

Unnamed: 0,oe__cut,oe__color,oe__clarity,minmax__depth,minmax__carat,minmax__table
0,4.0,1.0,2.0,0.513889,0.006237,0.230769
1,3.0,1.0,1.0,0.466667,0.002079,0.346154
2,1.0,1.0,3.0,0.386111,0.006237,0.423077
3,3.0,5.0,4.0,0.538889,0.018711,0.288462
4,1.0,6.0,2.0,0.563889,0.022869,0.288462
...,...,...,...,...,...,...
53935,4.0,0.0,1.0,0.494444,0.108108,0.269231
53936,1.0,0.0,1.0,0.558333,0.108108,0.230769
53937,2.0,0.0,1.0,0.550000,0.103950,0.326923
53938,3.0,4.0,2.0,0.500000,0.137214,0.288462


In [56]:
X_train_transformed = ct.fit_transform(X_train)
X_train_transformed.head()

Unnamed: 0,oe__cut,oe__color,oe__clarity,minmax__depth,minmax__carat,minmax__table
35965,1.0,1.0,6.0,0.608333,0.010395,0.288462
52281,4.0,6.0,1.0,0.522222,0.133056,0.25
6957,3.0,6.0,4.0,0.502778,0.176715,0.288462
9163,4.0,2.0,2.0,0.491667,0.170478,0.25
50598,4.0,2.0,3.0,0.522222,0.085239,0.269231


### Step - 5: Data Preprocessing on X_test 

In [57]:
X_test_transformed = ct.transform(X_test)
X_test_transformed.head()

Unnamed: 0,oe__cut,oe__color,oe__clarity,minmax__depth,minmax__carat,minmax__table
1388,4.0,3.0,5.0,0.530556,0.008316,0.25
50052,2.0,2.0,6.0,0.472222,0.079002,0.269231
41645,4.0,1.0,6.0,0.530556,0.04158,0.230769
42377,3.0,1.0,6.0,0.494444,0.047817,0.269231
17244,4.0,1.0,2.0,0.536111,0.280665,0.230769


### Step - 6: Build the model and predict on X_test

In [58]:
query=X_train_transformed.sample(1)
query

Unnamed: 0,oe__cut,oe__color,oe__clarity,minmax__depth,minmax__carat,minmax__table
18315,2.0,2.0,3.0,0.525,0.16632,0.288462


In [59]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import pairwise_distances


In [60]:
dist=euclidean_distances(X_train_transformed,query)
dist



array([[3.32133363],
       [4.89924418],
       [4.24271162],
       ...,
       [4.00267134],
       [4.5827314 ],
       [2.23664554]])

In [61]:
nearest= np.argsort(dist, axis = 0)
nearest

array([[ 7456],
       [22522],
       [ 2445],
       ...,
       [30460],
       [12428],
       [ 2491]], dtype=int64)

In [62]:
top5_np=nearest[:5].flatten()
top5_np

array([ 7456, 22522,  2445, 39812, 23164], dtype=int64)

In [63]:
y[top5_np].mean()

6051.4

In [71]:
def euclidean_distance(x_test, X_train):
    return np.sqrt(np.sum((X_train - x_test) ** 2, axis=1))

y_new= []
k = 5
for i in range(X_test_transformed.shape[0]):

    distances = euclidean_distance(
        X_test_transformed.iloc[i].values,
        X_train_transformed.values
    )
    knn_indices = np.argsort(distances)[:k]
    y_pred = np.mean(y_train.iloc[knn_indices])
    y_new.append(y_pred)


In [73]:
y_pred2 = knn.predict(X_test_transformed)
y_pred2

array([ 559, 1891, 1034, ..., 6543, 4497, 4370], dtype=int64)

In [75]:
nearest= np.argsort(dist, axis = 0)
nearest

array([[ 7456],
       [22522],
       [ 2445],
       ...,
       [30460],
       [12428],
       [ 2491]], dtype=int64)

In [79]:
top5=nearest[:5].flatten()
top5

array([ 7456, 22522,  2445, 39812, 23164], dtype=int64)

In [80]:
y[top5].mean()

6051.4

 ### Step - 7: Evaluate your model

In [85]:
from sklearn.metrics import accuracy_score

In [86]:
from sklearn.metrics import r2_score,mean_squared_error

In [90]:
mean_squared_error(y_pred2,y_test)

1924555.043233222

### Step - 8: Train a model using sklearn KNN Algorithm

In [72]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=5,metric='minkowski') 
knn

In [None]:
knn.fit(X_train_transformed, y_train)

In [74]:
y_pred_knn = knn.predict(X_test_transformed)
y_pred_knn

array([ 559, 1891, 1034, ..., 6543, 4497, 4370], dtype=int64)

In [81]:
nearest= np.argsort(dist, axis = 0)
nearest

array([[ 7456],
       [22522],
       [ 2445],
       ...,
       [30460],
       [12428],
       [ 2491]], dtype=int64)

In [82]:
top5=nearest[:5].flatten()
top5

array([ 7456, 22522,  2445, 39812, 23164], dtype=int64)

In [83]:
y[top5].mean()

6051.4

In [91]:
mean_squared_error(y_pred_knn,y_test)

1924555.043233222