# Diamonds price prediction

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

**Content:**
- **price** - price in US dollars (326-18,823) - **TARGER VALUE**
- **carat** - weight of the diamond (0.2--5.01)
- **cut** - quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- **color** - diamond colour, from J (worst) to D (best)
- **clarity** - a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
- **x** - length in mm (0--10.74)
- **y** - width in mm (0--58.9)
- **z** - depth in mm (0--31.8)
- **depth** - total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
- **table** - width of top of diamond relative to widest point (43--95)

</br>

Target: predict **price**

</br>

Source: https://www.kaggle.com/shivam2503/diamonds

Read data:

In [51]:
data = pd.read_csv('diamonds.csv')

### Data preparation

Drop useless column:

In [52]:
data.drop(data.columns[0], axis=1, inplace=True)

Convert '0' values in length, width or depth into NaN:

In [53]:
data[['x','y','z']] = data[['x','y','z']].replace(0,np.NaN)

Rename columns:

In [54]:
data.rename(columns={'x': 'length', 'y': 'width', 'z': 'depth'}, inplace=True)

Looking for NULL data:

In [55]:
data.isnull().any()

carat      False
cut        False
color      False
clarity    False
depth      False
table      False
price      False
length      True
width       True
depth       True
dtype: bool

Delete NULL data rows:

In [56]:
data.dropna(axis=0, inplace=True)

Check again:

In [57]:
data.isnull().any()

carat      False
cut        False
color      False
clarity    False
depth      False
table      False
price      False
length     False
width      False
depth      False
dtype: bool

Reset indexes:

In [58]:
data.reset_index(drop=True, inplace=True)

Print data:

In [59]:
data

Unnamed: 0,carat,cut,color,clarity,depth,table,price,length,width,depth.1
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53915,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53916,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53917,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53918,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


## 1. EDA

**Data Description**

In [60]:
data.describe()[1:].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
carat,0.797698,0.473795,0.2,0.4,0.7,1.04,5.01
depth,61.749514,1.432331,43.0,61.0,61.8,62.5,79.0
table,57.456834,2.234064,43.0,56.0,57.0,59.0,95.0
price,3930.993231,3987.280446,326.0,949.0,2401.0,5323.25,18823.0
length,5.731627,1.119423,3.73,4.71,5.7,6.54,10.74
width,5.734887,1.140126,3.68,4.72,5.71,6.54,58.9
depth,3.540046,0.70253,1.07,2.91,3.53,4.04,31.8


**Explore average price by cut and color**

In [61]:
data.pivot_table(values='price', 
                 index='color', 
                 columns='cut', aggfunc=np.mean)

cut,Fair,Good,Ideal,Premium,Very Good
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D,4291.06135,3405.382175,2629.094566,3623.76779,3470.467284
E,3682.3125,3423.644159,2597.55009,3538.91442,3214.652083
F,3827.003205,3498.761852,3374.526536,4325.099571,3778.82024
G,4232.412141,4105.90794,3718.46907,4502.207806,3872.753806
H,5135.683168,4276.254986,3889.334831,5198.35414,4535.059243
I,4685.445714,5078.532567,4451.970377,5939.557814,5255.879568
J,4975.655462,4574.172638,4918.186384,6294.591584,5103.513274


**Explore max and min price by cut and color**

In [62]:
data.pivot_table(values='price', 
                 index='color', 
                 columns='cut', aggfunc=[np.min, np.max])

Unnamed: 0_level_0,amin,amin,amin,amin,amin,amax,amax,amax,amax,amax
cut,Fair,Good,Ideal,Premium,Very Good,Fair,Good,Ideal,Premium,Very Good
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
D,536,361,367,367,357,16386,18468,18693,18575,18542
E,337,327,326,326,352,15584,18236,18729,18477,18731
F,496,357,408,342,357,17995,18686,18780,18791,18777
G,369,394,361,382,354,18574,18625,18806,18741,18818
H,659,368,357,368,337,18565,18640,18760,18795,18803
I,735,351,348,334,336,18242,18707,18779,18823,18500
J,416,335,340,363,336,18531,18325,18508,18710,18430


**Count values of specified cut and color**

In [72]:
data.groupby(['cut', 'color'])['price'].count()

cut        color
Fair       D         163
           E         224
           F         312
           G         313
           H         303
           I         175
           J         119
Good       D         662
           E         933
           F         907
           G         869
           H         702
           I         522
           J         307
Ideal      D        2834
           E        3903
           F        3825
           G        4882
           H        3115
           I        2093
           J         896
Premium    D        1602
           E        2337
           F        2330
           G        2921
           H        2355
           I        1427
           J         808
Very Good  D        1513
           E        2400
           F        2164
           G        2299
           H        1823
           I        1204
           J         678
Name: price, dtype: int64

## 2. Visualization

## 3. Prediction

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53920 entries, 0 to 53919
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53920 non-null  float64
 1   cut      53920 non-null  object 
 2   color    53920 non-null  object 
 3   clarity  53920 non-null  object 
 4   depth    53920 non-null  float64
 5   table    53920 non-null  float64
 6   price    53920 non-null  int64  
 7   length   53920 non-null  float64
 8   width    53920 non-null  float64
 9   depth    53920 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


### Preprocessing

#### Split data into X and Y

In [13]:
data_X = data.drop('price', axis=1)
data_Y = data.price

#### Convert categorical data to numerics

In [14]:
label_encoder = LabelEncoder()

In [15]:
data_X.cut = label_encoder.fit_transform(data_X.cut)
cut_map = {index: label for index, label in enumerate(label_encoder.classes_)}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [16]:
data_X.color = label_encoder.fit_transform(data_X.color)
color_map = {index: label for index, label in enumerate(label_encoder.classes_)}

In [17]:
data_X.clarity = label_encoder.fit_transform(data_X.clarity)
clarity_map = {index: label for index, label in enumerate(label_encoder.classes_)}

In [18]:
print(cut_map, color_map, clarity_map, sep='\n\n')

{0: 'Fair', 1: 'Good', 2: 'Ideal', 3: 'Premium', 4: 'Very Good'}

{0: 'D', 1: 'E', 2: 'F', 3: 'G', 4: 'H', 5: 'I', 6: 'J'}

{0: 'I1', 1: 'IF', 2: 'SI1', 3: 'SI2', 4: 'VS1', 5: 'VS2', 6: 'VVS1', 7: 'VVS2'}


In [19]:
data_X

Unnamed: 0,carat,cut,color,clarity,depth,table,length,width,depth.1
0,0.23,2,1,3,61.5,55.0,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,4.20,4.23,2.63
4,0.31,1,6,3,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53915,0.72,2,0,2,60.8,57.0,5.75,5.76,3.50
53916,0.72,1,0,2,63.1,55.0,5.69,5.75,3.61
53917,0.70,4,0,2,62.8,60.0,5.66,5.68,3.56
53918,0.86,3,4,3,61.0,58.0,6.15,6.12,3.74


#### Scale data between 0 and 1

In [20]:
scaler = MinMaxScaler()

In [21]:
data_X = scaler.fit_transform(data_X)

In [22]:
pd.DataFrame(data_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.006237,0.50,0.166667,0.428571,0.513889,0.230769,0.031384,0.005433,0.044256
1,0.002079,0.75,0.166667,0.285714,0.466667,0.346154,0.022825,0.002898,0.040351
2,0.006237,0.25,0.166667,0.571429,0.386111,0.423077,0.045649,0.007063,0.040351
3,0.018711,0.75,0.833333,0.714286,0.538889,0.288462,0.067047,0.009960,0.050765
4,0.022869,0.25,1.000000,0.428571,0.563889,0.288462,0.087019,0.012133,0.054670
...,...,...,...,...,...,...,...,...,...
53915,0.108108,0.50,0.000000,0.285714,0.494444,0.269231,0.288160,0.037668,0.079076
53916,0.108108,0.25,0.000000,0.285714,0.558333,0.230769,0.279601,0.037486,0.082655
53917,0.103950,1.00,0.000000,0.285714,0.550000,0.326923,0.275321,0.036219,0.081028
53918,0.137214,0.75,0.666667,0.428571,0.500000,0.288462,0.345221,0.044187,0.086886


#### Split data to train and test data

In [23]:
X_train, X_test, y_train, y_test = train_test_split(data_X, 
                                                    data_Y, 
                                                    test_size=0.2)

In [24]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(43136, 9) (10784, 9) (43136,) (10784,)


### Train a model

#### Linear regression

In [25]:
lin_model = LinearRegression()

In [26]:
lin_model.fit(X_train, y_train)

LinearRegression()

In [27]:
print(f'Score without regularization equals {lin_model.score(X_test, y_test)}')

Score without regularization equals 0.8711802404122251


#### L1 and L2 regularization (Lasso and Ridge Regression)

In [28]:
l1_model = Lasso(alpha=1)
l2_model = Ridge(alpha=1)

In [29]:
l1_model.fit(X_train, y_train)
l2_model.fit(X_train, y_train)

Ridge(alpha=1)

In [30]:
print(f'Score with Lasso regularization equals {l1_model.score(X_test, y_test)}')
print(f'Score with Ridge regularization equals {l2_model.score(X_test, y_test)}')

Score with Lasso regularization equals 0.8786569229499357
Score with Ridge regularization equals 0.8786160417219371


In [31]:
l2_model = Ridge(alpha=0.5)
l2_model.fit(X_train, y_train)
print(f'Score with Ridge regularization equals {l2_model.score(X_test, y_test)}')

Score with Ridge regularization equals 0.8783922318198165


In [32]:
l2_pred = pd.DataFrame(l2_model.predict(X_test))
y_test.reset_index(drop=True, inplace=True)
l2_pred['Y test'] = y_test
l2_pred['diff'] = abs(l2_pred[0] - l2_pred['Y test'])
l2_pred.columns = ['Predicted', 'Expected', 'Difference']


In [33]:
l2_pred[:10]

Unnamed: 0,Predicted,Expected,Difference
0,-154.0619,605,759.0619
1,864.340104,781,83.340104
2,1185.619222,1105,80.619222
3,1420.923269,800,620.923269
4,5570.251925,5331,239.251925
5,2045.978957,3785,1739.021043
6,6574.085819,6810,235.914181
7,445.999561,776,330.000439
8,9858.190124,11190,1331.809876
9,1932.145066,1115,817.145066


In [34]:
print(f'Average error equals {l2_pred.Difference.mean()}')

Average error equals 859.2905878988761
