# **Diamond Price Prediction** 

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('diamonds.csv')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df = df [['carat','cut','color','clarity','x','y','z','depth','table','price']]

In [4]:
df.shape

(53940, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   x        53940 non-null  float64
 5   y        53940 non-null  float64
 6   z        53940 non-null  float64
 7   depth    53940 non-null  float64
 8   table    53940 non-null  float64
 9   price    53940 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [6]:
df.describe().round(3)

Unnamed: 0,carat,x,y,z,depth,table,price
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.798,5.731,5.735,3.539,61.749,57.457,3932.8
std,0.474,1.122,1.142,0.706,1.433,2.234,3989.44
min,0.2,0.0,0.0,0.0,43.0,43.0,326.0
25%,0.4,4.71,4.72,2.91,61.0,56.0,950.0
50%,0.7,5.7,5.71,3.53,61.8,57.0,2401.0
75%,1.04,6.54,6.54,4.04,62.5,59.0,5324.25
max,5.01,10.74,58.9,31.8,79.0,95.0,18823.0


In [7]:
#Dropping the outliers. 
df = df[(df["depth"]<75)&(df["depth"]>45)]
df = df[(df["table"]<80)&(df["table"]>40)]
df = df[(df["x"]<30)]
df = df[(df["y"]<30)]
df = df[(df["z"]<30)&(df["z"]>2)]

In [8]:
#Function to change clarity
def clarity_fn(clarity):
    if clarity=='I1':
        return 0.0
    elif clarity=='SI2':
        return 1.0
    elif clarity=='SI1':
        return 2.0
    elif clarity=='VS2':
        return 3.0
    elif clarity=='VS1':
        return 4.0
    elif clarity=='VVS2':
        return 5.0
    elif clarity=='VVS1':
        return 6.0  
    elif clarity=='IF':
        return 7.0
df['clarity']=df['clarity'].apply(clarity_fn)

#Function to change cut quality
def cut_qlty(cut):
    if cut=='Fair':
        return 0.0
    elif cut=='Good':
        return 1.0
    elif cut=='Ideal':
        return 2.0
    elif cut =='Premium':
        return 3.0
    elif cut =='Very Good':
        return 4.0
df['cut']=df['cut'].apply(cut_qlty)

#Function to change cut quality
def color_fn(color):
    if color=='D':
        return 0.0
    elif color=='E':
        return 1.0
    elif color=='F':
        return 2.0
    elif color =='G':
        return 3.0
    elif color =='H':
        return 4.0
    elif color =='I':
        return 5.0
    elif color =='J':
        return 6.0
df['color']=df['color'].apply(color_fn)

In [9]:
df.head()

Unnamed: 0,carat,cut,color,clarity,x,y,z,depth,table,price
0,0.23,2.0,1.0,1.0,3.95,3.98,2.43,61.5,55.0,326
1,0.21,3.0,1.0,2.0,3.89,3.84,2.31,59.8,61.0,326
2,0.23,1.0,1.0,4.0,4.05,4.07,2.31,56.9,65.0,327
3,0.29,3.0,5.0,3.0,4.2,4.23,2.63,62.4,58.0,334
4,0.31,1.0,6.0,1.0,4.34,4.35,2.75,63.3,58.0,335


In [10]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse,r2_score
from sklearn import metrics

## **MODEL BUILDING**

In [11]:
X = df.drop(['price'],axis = 1)
y = df['price']

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, train_size = 0.75, random_state = 0)

**Feature Transformation on Train data**

In [13]:
# Standard Scaler for Numerical Features
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train),columns = x_train.columns, index = x_train.index)
x_train.head()

Unnamed: 0,carat,cut,color,clarity,x,y,z,depth,table
40272,-1.196848,-0.538618,-0.348023,0.573992,-1.600088,-1.595362,-1.590144,0.102661,-0.653988
44865,-0.626274,1.406218,-1.523658,-0.032523,-0.526609,-0.51397,-0.561769,-0.389715,-0.653988
24022,2.754906,1.406218,2.003247,-1.245554,2.183925,2.252591,2.233673,0.173,2.03783
33311,-0.94326,-0.538618,-1.523658,-0.639039,-1.045457,-1.090712,-1.025262,0.384019,-1.102625
19209,0.874125,-0.538618,0.827612,1.180507,1.047827,1.081083,0.930099,-0.95243,-0.205352


**Feature Transformation on Test data**

In [14]:
# Standard Scaler for Numerical Features
x_test= pd.DataFrame(scaler.transform(x_test),columns = x_test.columns,index = x_test.index)
x_test.head()

Unnamed: 0,carat,cut,color,clarity,x,y,z,depth,table
11576,0.979787,0.4338,2.003247,-1.245554,1.092555,1.054048,1.161845,0.665376,0.243285
45207,-0.605142,-0.538618,0.239794,0.573992,-0.526609,-0.504958,-0.518317,-0.038018,-1.102625
52940,-0.203627,0.4338,-0.935841,-0.639039,-0.043544,-0.081413,-0.069308,-0.038018,2.03783
212,-0.182494,-0.538618,-0.348023,-0.032523,-0.043544,-0.009321,0.003113,0.24334,-1.551261
983,-0.119097,1.406218,0.239794,0.573992,0.108533,0.143877,-0.04034,-1.374466,0.243285


In [15]:
# Building pipelins of standard scaler and model for various regressors.
lr = LinearRegression()
pipeline_lr=Pipeline([("scalar",StandardScaler()),
                     ("lr",LinearRegression())])

dt = DecisionTreeRegressor()
pipeline_dt=Pipeline([("scalar",StandardScaler()),
                     ("dt",DecisionTreeRegressor())])

rf = RandomForestRegressor()
pipeline_rf=Pipeline([("scalar",StandardScaler()),
                     ("rf",RandomForestRegressor())])

knn = KNeighborsRegressor()
pipeline_kn=Pipeline([("scalar",StandardScaler()),
                     ("knn",KNeighborsRegressor())])

xgb = XGBRegressor()
pipeline_xgb=Pipeline([("scalar",StandardScaler()),
                     ("xgb",XGBRegressor())])

# List of all the pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_kn, pipeline_xgb]

In [16]:
# Dictionary of pipelines and model types for ease of reference
pipe_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest",3: "KNNeighbors", 4: "XGBRegressor"}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(x_train, y_train)
    
cv_results_rms = []
for i, model in enumerate(pipelines):
    cv_score = cross_val_score(model, x_train,y_train,scoring="neg_root_mean_squared_error", cv=10)
    cv_results_rms.append(cv_score)
    print("%s: %f " % (pipe_dict[i], cv_score.mean()))
    

LinearRegression: -1216.324841 
DecisionTree: -739.146037 
RandomForest: -539.965047 
KNNeighbors: -727.112291 
XGBRegressor: -541.927121 


From the above scores, XGB Regressor appears to be the model with the best score on the negative mean squared error. Let’s test this model on the test set and evaluate with different parameters:



In [17]:
from sklearn.metrics import mean_squared_error as mse, r2_score

pred = pipeline_xgb.predict(x_test)
XGB=pd.DataFrame({"Model":["XGB Regressor"], "MSE":[(mse(pred,y_test)).round(2)], 
                      "RMSE":[(np.sqrt(mse(pred,y_test)).round(2))], "R2 Score":[((r2_score(pred, y_test))*100).round(3)]})

pred = pipeline_lr.predict(x_test)
LR=pd.DataFrame({"Model":["Linear Regression"], "MSE":[(mse(pred,y_test)).round(2)], 
                      "RMSE":[(np.sqrt(mse(pred,y_test)).round(2))], "R2 Score":[((r2_score(pred, y_test))*100).round(3)]})

pred = pipeline_dt.predict(x_test)
DT=pd.DataFrame({"Model":["Decision Tree"], "MSE":[(mse(pred,y_test)).round(2)], 
                      "RMSE":[(np.sqrt(mse(pred,y_test)).round(2))], "R2 Score":[((r2_score(pred, y_test))*100).round(3)]})

pred = pipeline_rf.predict(x_test)
RF=pd.DataFrame({"Model":["Random Forest"], "MSE":[(mse(pred,y_test)).round(2)], 
                      "RMSE":[(np.sqrt(mse(pred,y_test)).round(2))], "R2 Score":[((r2_score(pred, y_test))*100).round(3)]})

pred = pipeline_lr.predict(x_test)
KNN=pd.DataFrame({"Model":["KNN Regressor"], "MSE":[(mse(pred,y_test)).round(2)], 
                      "RMSE":[(np.sqrt(mse(pred,y_test)).round(2))], "R2 Score":[((r2_score(pred, y_test))*100).round(3)]})
results = pd.concat([XGB,RF,DT,LR,KNN])
results


Unnamed: 0,Model,MSE,RMSE,R2 Score
0,XGB Regressor,310962.49,557.64,98.027
0,Random Forest,306284.44,553.43,98.047
0,Decision Tree,562748.68,750.17,96.447
0,Linear Regression,1437335.95,1198.89,90.021
0,KNN Regressor,1437335.95,1198.89,90.021


## **Model Serialization**

In [18]:
from pickle import dump
dump(scaler, open('models/standard_scaler.pkl', 'wb'))
dump(lr, open('models/lr_model.pkl', 'wb'))
dump(dt, open('models/dt_model.pkl', 'wb'))
dump(rf, open('models/rf_model.pkl', 'wb'))
dump(knn, open('models/knn_model.pkl', 'wb'))
dump(xgb, open('models/xgb_model.pkl', 'wb'))

In [19]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z', 'depth', 'table',
       'price'],
      dtype='object')

In [20]:
# Exporting the model
import pickle
pickle.dump(xgb, open('diamond.pkl', 'wb'))