## Import the required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

## Load the Data

In [2]:
df = pd.read_csv(r"D:\Innomatics\Machine Learning\Project\Data Set\diamonds.csv")

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [4]:
df.shape

(53940, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [6]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

### Machine Learning Problem

**Build a system which can take features of diamond like carat, cut, color, clarity, x, y, z, etc.. and `predicts the price of diamond`.**

Target Variable: ______

In [7]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [8]:
df = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']]

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


### a. Identify the Target Variable and Splitting the Data into Train and Test

`!pip install -U scikit-learn`

In [9]:
# !pip install -U scikit-learn

In [10]:
import sklearn

print(sklearn.__version__)

# For some of the code below, make sure you have sklearn version 1.1 or above.

1.1.3


In [11]:
# Identifying the inputs (X) and output (y)

y = df['price']

X = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]

In [12]:
# split into train and test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=100)

In [13]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
33169,0.3,Ideal,D,VS1,61.9,54.1,4.28,4.33,2.67
53170,0.71,Premium,E,SI1,59.5,62.0,5.85,5.74,3.45
27846,0.3,Ideal,D,VS1,62.2,56.0,4.27,4.31,2.67
20880,1.02,Premium,F,VVS2,62.9,59.0,6.42,6.37,4.02
28554,0.3,Premium,H,VS1,60.1,61.0,4.32,4.3,2.59


In [14]:
print(X_train.shape, y_train.shape)

print(X_test.shape, y_test.shape)

(40455, 9) (40455,)
(13485, 9) (13485,)


### b. Separating Categorical and Numerical Columns: 

In [15]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
33169,0.3,Ideal,D,VS1,61.9,54.1,4.28,4.33,2.67
53170,0.71,Premium,E,SI1,59.5,62.0,5.85,5.74,3.45
27846,0.3,Ideal,D,VS1,62.2,56.0,4.27,4.31,2.67
20880,1.02,Premium,F,VVS2,62.9,59.0,6.42,6.37,4.02
28554,0.3,Premium,H,VS1,60.1,61.0,4.32,4.3,2.59


In [16]:
X_train.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [17]:
X_train_cat = X_train.select_dtypes(include=['object'])

X_train_cat.head()

Unnamed: 0,cut,color,clarity
33169,Ideal,D,VS1
53170,Premium,E,SI1
27846,Ideal,D,VS1
20880,Premium,F,VVS2
28554,Premium,H,VS1


In [18]:
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])

X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
33169,0.3,61.9,54.1,4.28,4.33,2.67
53170,0.71,59.5,62.0,5.85,5.74,3.45
27846,0.3,62.2,56.0,4.27,4.31,2.67
20880,1.02,62.9,59.0,6.42,6.37,4.02
28554,0.3,60.1,61.0,4.32,4.3,2.59


### c. Scaling the Numerical Features

In [19]:
X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
33169,0.3,61.9,54.1,4.28,4.33,2.67
53170,0.71,59.5,62.0,5.85,5.74,3.45
27846,0.3,62.2,56.0,4.27,4.31,2.67
20880,1.02,62.9,59.0,6.42,6.37,4.02
28554,0.3,60.1,61.0,4.32,4.3,2.59


In [20]:
# scaling the numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_num_rescaled = pd.DataFrame(scaler.fit_transform(X_train_num), 
                                    columns = X_train_num.columns, 
                                    index = X_train_num.index)

X_train_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
33169,-1.050294,0.099874,-1.495687,-1.293573,-1.220387,-1.249413
53170,-0.186302,-1.57225,2.031741,0.10509,0.003601,-0.128311
27846,-1.050294,0.30889,-0.647318,-1.302482,-1.237749,-1.249413
20880,0.46696,0.796592,0.692211,0.612885,0.550489,0.690955
28554,-1.050294,-1.154219,1.585231,-1.257938,-1.24643,-1.364397


In [21]:
X_train_num.describe()

Unnamed: 0,carat,depth,table,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.798408,61.756651,57.449729,5.732037,5.735852,3.539272
std,0.474547,1.435318,2.23962,1.122515,1.151986,0.695753
min,0.2,43.0,43.0,0.0,0.0,0.0
25%,0.4,61.1,56.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,5.7,5.71,3.52
75%,1.04,62.5,59.0,6.54,6.54,4.04
max,5.01,79.0,95.0,10.74,58.9,8.06


In [22]:
print("Number of Numerical Features:", scaler.n_features_in_)
print("Mean of each column:", scaler.mean_)
print("Std of each column:", np.sqrt(scaler.var_))

Number of Numerical Features: 6
Mean of each column: [ 0.79840811 61.7566506  57.44972933  5.73203658  5.73585218  3.53927178]
Std of each column: [0.47454133 1.43530039 2.23959225 1.1225007  1.15197211 0.6957443 ]


### d. Applying OneHotEncoding on Categorical Columns

In [23]:
X_train_cat['cut'].value_counts(normalize=True)

Ideal        0.400890
Premium      0.254233
Very Good    0.223804
Good         0.091633
Fair         0.029440
Name: cut, dtype: float64

In [24]:
X_train_cat['color'].value_counts(normalize=True)

G    0.209368
E    0.181560
F    0.179088
H    0.153207
D    0.123619
I    0.100556
J    0.052602
Name: color, dtype: float64

In [25]:
X_train_cat['clarity'].value_counts(normalize=True)

SI1     0.242640
VS2     0.226375
SI2     0.169596
VS1     0.153183
VVS2    0.094698
VVS1    0.067952
IF      0.032134
I1      0.013422
Name: clarity, dtype: float64

In [26]:
# OneHotEncoding the categorical features

from sklearn.preprocessing import OneHotEncoder

encoder_ = OneHotEncoder(drop='first', min_frequency=3000, sparse=False)

# column names are (annoyingly) lost after OneHotEncoding
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_cat_ohe = pd.DataFrame(encoder_.fit_transform(X_train_cat), 
                               columns=encoder_.get_feature_names_out(X_train_cat.columns), 
                               index = X_train_cat.index)

X_train_cat_ohe.head()

Unnamed: 0,cut_Ideal,cut_Premium,cut_Very Good,cut_infrequent_sklearn,color_E,color_F,color_G,color_H,color_I,color_infrequent_sklearn,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS2,clarity_infrequent_sklearn
33169,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
53170,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27846,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20880,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
28554,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [27]:
encoder_.categories_

[array(['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], dtype=object),
 array(['D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype=object),
 array(['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2'],
       dtype=object)]

In [28]:
encoder_.infrequent_categories_

[array(['Fair'], dtype=object),
 array(['J'], dtype=object),
 array(['I1', 'IF', 'VVS1'], dtype=object)]

In [29]:
# OneHotEncoding the categorical features

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse=False)

# column names are (annoyingly) lost after OneHotEncoding
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_cat_ohe = pd.DataFrame(encoder.fit_transform(X_train_cat), 
                               columns=encoder.get_feature_names_out(X_train_cat.columns), 
                               index = X_train_cat.index)

X_train_cat_ohe.head()

Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
33169,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
53170,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
27846,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20880,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
28554,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### e. Applying Label Encoding on Categorical Columns

In [30]:
X_train_cat_le = pd.DataFrame(index=X_train_cat.index)

X_train_cat_le.head()

33169
53170
27846
20880
28554


In [31]:
X_train_cat.cut.unique()

array(['Ideal', 'Premium', 'Very Good', 'Fair', 'Good'], dtype=object)

In [32]:
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

X_train_cat_le['cut'] = X_train_cat['cut'].apply(lambda x : cut_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut
33169,4
53170,5
27846,4
20880,5
28554,5


In [33]:
X_train_cat.color.unique()

array(['D', 'E', 'F', 'H', 'G', 'I', 'J'], dtype=object)

In [34]:
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

X_train_cat_le['color'] = X_train_cat['color'].apply(lambda x : color_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color
33169,4,7
53170,5,6
27846,4,7
20880,5,5
28554,5,3


In [35]:
X_train_cat.clarity.unique()

array(['VS1', 'SI1', 'VVS2', 'SI2', 'IF', 'VS2', 'I1', 'VVS1'],
      dtype=object)

In [36]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}

X_train_cat_le['clarity'] = X_train_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color,clarity
33169,4,7,5
53170,5,6,3
27846,4,7,5
20880,5,5,6
28554,5,3,5


### f. Concatinating the Encoded Categorical Features and Rescaled Numerical Features:

In [37]:
X_train_transformed = pd.concat([X_train_num_rescaled, X_train_cat_le], axis=1)

X_train_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
33169,-1.050294,0.099874,-1.495687,-1.293573,-1.220387,-1.249413,4,7,5
53170,-0.186302,-1.57225,2.031741,0.10509,0.003601,-0.128311,5,6,3
27846,-1.050294,0.30889,-0.647318,-1.302482,-1.237749,-1.249413,4,7,5
20880,0.46696,0.796592,0.692211,0.612885,0.550489,0.690955,5,5,6
28554,-1.050294,-1.154219,1.585231,-1.257938,-1.24643,-1.364397,5,3,5


### g. Preparing Test Data

In [38]:
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
52264,0.57,Ideal,E,VS2,61.5,57.0,5.35,5.32,3.28
21073,1.16,Ideal,G,VS1,61.5,55.0,6.75,6.81,4.17
42161,0.51,Ideal,G,SI1,63.2,58.0,5.05,5.08,3.2
35974,0.42,Ideal,F,VS1,60.6,56.0,4.83,4.87,2.94
7641,0.8,Premium,G,IF,62.6,58.0,5.89,5.93,3.7


In [39]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13485 entries, 52264 to 52186
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    13485 non-null  float64
 1   cut      13485 non-null  object 
 2   color    13485 non-null  object 
 3   clarity  13485 non-null  object 
 4   depth    13485 non-null  float64
 5   table    13485 non-null  float64
 6   x        13485 non-null  float64
 7   y        13485 non-null  float64
 8   z        13485 non-null  float64
dtypes: float64(6), object(3)
memory usage: 1.0+ MB


In [40]:
X_test_cat = X_test.select_dtypes(include=['object'])

X_test_cat.head()

Unnamed: 0,cut,color,clarity
52264,Ideal,E,VS2
21073,Ideal,G,VS1
42161,Ideal,G,SI1
35974,Ideal,F,VS1
7641,Premium,G,IF


In [41]:
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

X_test_num.head()

Unnamed: 0,carat,depth,table,x,y,z
52264,0.57,61.5,57.0,5.35,5.32,3.28
21073,1.16,61.5,55.0,6.75,6.81,4.17
42161,0.51,63.2,58.0,5.05,5.08,3.2
35974,0.42,60.6,56.0,4.83,4.87,2.94
7641,0.8,62.6,58.0,5.89,5.93,3.7


In [42]:
X_test_num_rescaled = pd.DataFrame(scaler.transform(X_test_num), 
                                   columns = X_test_num.columns, 
                                   index = X_test_num.index)

X_test_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
52264,-0.481324,-0.178813,-0.200809,-0.340344,-0.360992,-0.372654
21073,0.761982,-0.178813,-1.093828,0.906871,0.932443,0.906552
42161,-0.607762,1.005608,0.245701,-0.607605,-0.56933,-0.487639
35974,-0.797419,-0.80586,-0.647318,-0.803596,-0.751626,-0.861339
7641,0.003355,0.587577,0.245701,0.140725,0.168535,0.231016


In [43]:
X_test_cat_le = pd.DataFrame(index = X_test_cat.index)

X_test_cat_le.head()

52264
21073
42161
35974
7641


In [44]:
X_test_cat_le['cut'] = X_test_cat['cut'].apply(lambda x : cut_encoder[x])

X_test_cat_le['color'] = X_test_cat['color'].apply(lambda x : color_encoder[x])

X_test_cat_le['clarity'] = X_test_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_test_cat_le.head()

Unnamed: 0,cut,color,clarity
52264,4,6,4
21073,4,4,5
42161,4,4,3
35974,4,5,5
7641,5,4,8


In [45]:
X_test_transformed = pd.concat([X_test_num_rescaled, X_test_cat_le], axis=1)

X_test_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
52264,-0.481324,-0.178813,-0.200809,-0.340344,-0.360992,-0.372654,4,6,4
21073,0.761982,-0.178813,-1.093828,0.906871,0.932443,0.906552,4,4,5
42161,-0.607762,1.005608,0.245701,-0.607605,-0.56933,-0.487639,4,4,3
35974,-0.797419,-0.80586,-0.647318,-0.803596,-0.751626,-0.861339,4,5,5
7641,0.003355,0.587577,0.245701,0.140725,0.168535,0.231016,5,4,8


### Without using sklearn

In [46]:
%%time
final_val=[]
k=10
for i in range(0,X_test_transformed.shape[0]):
    dist=np.power(np.sum((np.power((X_train_transformed.values - X_test_transformed.values[i]),2)),axis=1),1/2)
    ind=X_train_transformed.to_records().index
    knn=pd.DataFrame({"Distance":dist},index=ind)
    inde=knn.sort_values(by=["Distance"])[:k]
    sum=0
    for i in inde.index:
        sum+=y_train[i]
    print(sum/k)
    final_val.append(sum/k)

ind_test=X_test_transformed.to_records().index
pred_test=pd.Series(data=final_val,index=ind_test)

1800.3
8575.9
1199.5
1030.6
6036.6
5275.7
798.0
1877.6
4701.2
643.6
5508.6
3235.2
738.0
1743.3
2488.6
8375.5
973.0
2175.2
713.7
743.1
1997.3
709.3
1232.3
6628.0
13000.3
6806.0
2937.0
5364.7
7229.2
479.2
10346.7
4051.2
9221.6
702.7
4267.1
589.7
6609.2
5581.7
556.4
984.7
1522.7
2099.2
860.3
6257.9
738.7
750.8
2112.6
8853.2
1773.7
925.3
743.0
801.5
3446.9
5155.6
1303.2
8181.9
5381.0
15764.6
664.9
2128.5
1288.9
8448.1
964.9
4614.6
6784.0
1378.3
740.8
5363.0
2667.1
4413.5
971.7
9286.8
2331.5
1691.4
1092.8
3995.3
926.6
4635.9
737.3
2049.4
14020.8
759.2
818.4
1385.8
2688.8
814.7
2265.6
14208.6
4152.4
4475.9
4571.1
4086.7
10840.4
7384.1
3753.7
12088.8
625.1
6655.1
1290.3
2385.2
2196.7
2213.5
15384.9
775.9
1047.6
2310.3
2332.3
2299.5
4556.7
4296.9
4853.3
5842.5
11453.3
899.0
9341.7
801.0
4160.8
5874.2
10425.0
1156.3
1015.4
1159.3
8944.1
6121.5
1789.4
695.2
3828.9
761.1
2526.0
7445.5
12385.2
4924.4
10742.2
600.9
2757.0
5188.2
4858.4
7786.9
3409.6
1713.8
778.0
1555.9
1229.7
891.5
6413.7
1771.4
51

In [47]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': pred_test})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1800.3
21073,9248,8575.9
42161,1284,1199.5
35974,921,1030.6
7641,4268,6036.6


In [48]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, pred_test))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, pred_test))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, pred_test)))

Mean Absolute Error:  379.84436781609185
Mean Squared Error:  555349.8456025213
Root Mean Squared Error:  745.2179852919019


### With using SKLearn

In [49]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor()
regressor.fit(X_train_transformed, y_train)

In [50]:
y_test_pred = regressor.predict(X_test_transformed)

In [51]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1779.6
21073,9248,9000.6
42161,1284,1136.6
35974,921,960.4
7641,4268,5510.6


In [52]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  382.1343270300334
Mean Squared Error:  563397.9181401556
Root Mean Squared Error:  750.5983733929588
