In [1]:
# packages for data prep
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, normalize, StandardScaler

# packages for predictions
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# package for time keeping
from tqdm import tqdm

# calculating the accuracies
from sklearn.metrics import mean_squared_error



In [2]:
# Reding the csv file
diamonds_df = pd.read_csv("diamonds.csv")

diamonds_df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Changing the name of the columns
diamonds_df.columns = ['index', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']

In [4]:
# Setting the index
diamonds_df.set_index("index", inplace=True)


In [5]:
diamonds_df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,x,y,z
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [6]:
# creating an ordinal encoder
ord_enc = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],
                                      ['J','I','H','G','F','E','D'],
                                      ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']])

In [7]:
# encoding the text columns
diamonds_df[['cut','color','clarity']] = ord_enc.fit_transform(diamonds_df[['cut','color','clarity']])

In [8]:
diamonds_df[['carat', 'depth', 'table']] = normalize(diamonds_df[['carat', 'depth', 'table']], norm='l2')

In [9]:
diamonds_df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,x,y,z
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.002788,4.0,5.0,1.0,0.745397,0.666615,326,3.95,3.98,2.43
2,0.002458,3.0,5.0,2.0,0.700046,0.714094,326,3.89,3.84,2.31
3,0.002662,1.0,5.0,4.0,0.658666,0.752431,327,4.05,4.07,2.31
4,0.003404,3.0,1.0,3.0,0.732455,0.680807,334,4.2,4.23,2.63
5,0.003611,1.0,0.0,1.0,0.737294,0.675562,335,4.34,4.35,2.75


In [10]:
# checking the encoding
diamonds_df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,x,y,z
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.002788,4.0,5.0,1.0,0.745397,0.666615,326,3.95,3.98,2.43
2,0.002458,3.0,5.0,2.0,0.700046,0.714094,326,3.89,3.84,2.31
3,0.002662,1.0,5.0,4.0,0.658666,0.752431,327,4.05,4.07,2.31
4,0.003404,3.0,1.0,3.0,0.732455,0.680807,334,4.2,4.23,2.63
5,0.003611,1.0,0.0,1.0,0.737294,0.675562,335,4.34,4.35,2.75


In [31]:
# columns
cols = list(diamonds_df.columns)

# input and output
y = diamonds_df['price']
cols.remove('price')
cols.remove('x')
cols.remove('y')
cols.remove('z')
X = diamonds_df[cols]

In [12]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
X_train.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
19498,0.014454,4.0,2.0,5.0,0.732249,0.680884
31230,0.00371,4.0,5.0,3.0,0.742098,0.670282
22312,0.014316,4.0,5.0,4.0,0.738256,0.674369
279,0.00972,4.0,4.0,1.0,0.751202,0.660001
6647,0.009481,4.0,1.0,5.0,0.74045,0.672045


<h3><b>Decision Tree Regressor</b></h3>

In [14]:
# decision tree

# defining the decision tree
dec_reg = DecisionTreeRegressor(max_depth=15)

In [15]:
# training the regressor
dec_reg.fit(X_train, y_train)

In [16]:
y_1 = dec_reg.predict(X_train)
y_2 = dec_reg.predict(X_test)

In [17]:
# mean squared error
print("mean squared error for train data: ",mean_squared_error(y_true=y_train, y_pred=y_1))

print("mean squared error for test data: ",mean_squared_error(y_true=y_test, y_pred=y_2))

mean squared error for train data:  78593.77159222092
mean squared error for test data:  462077.8284258269


<h3><b>Random Forest Regression</b></h3>

In [18]:
# random forest
forest_reg = RandomForestRegressor(n_estimators=250)

In [19]:
# training random forest
forest_reg.fit(X_train, y_train)

In [20]:
yf_1 = forest_reg.predict(X_train)
yf_2 = forest_reg.predict(X_test)

In [21]:
# mean squared error
print("mean squared error for train data: ",mean_squared_error(y_true=y_train, y_pred=yf_1))

print("mean squared error for test data: ",mean_squared_error(y_true=y_test, y_pred=yf_2))

mean squared error for train data:  45635.529471460315
mean squared error for test data:  305741.3968840369


<h3><b>Support Vector Regression</b></h3>

In [22]:
# support vector regression
# scaling
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X = X_scaler.fit_transform(X)
y = y_scaler.fit_transform(y.values.reshape(-1,1))

In [23]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
# creating svr regressor
svr_reg = SVR(kernel='rbf')

In [25]:
# training the svr_reg
svr_reg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [26]:
ys_1 = svr_reg.predict(X_train)
ys_2 = svr_reg.predict(X_test)

In [27]:
# mean squared error
print("mean squared error for train data: ",mean_squared_error(y_true=y_train, y_pred=ys_1))

print("mean squared error for test data: ",mean_squared_error(y_true=y_test, y_pred=ys_2))

mean squared error for train data:  0.021932206908570506
mean squared error for test data:  0.02232603385503696


In [28]:
#    from testing between SVR, Decision Tree and Random forest model, the SVR achieved the best results
# by a huge margin.

<h3><b>Making a pipeline</b></h3>

In [29]:
# importing package for pipeline creation
from sklearn.pipeline import Pipeline

In [30]:
# creating a pipeline for SVR
pipe = Pipeline([('scaler', StandardScaler()), ('sv_reg', SVR(kernel='rbf'))])

In [None]:
# splitting the data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [32]:
# training the pipeline
pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [33]:
pipe.score(X_train, y_train)

0.9782409674488787

In [34]:
pipe.score(X_test, y_test)

0.9772052245539022