In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
def model_performance(prediction):
    mse = mean_squared_error(y_test, prediction)
    rmse = np.sqrt(mse)
    r2 = np.sqrt(r2_score((y_test),(prediction)))
    print(f'RMSE: {rmse}')
    print(f'R2: {r2}')

In [3]:
diamonds = pd.read_csv('diamonds_renamed_cols_removed_nulls.csv')
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,lenght_x,width_y,depth_z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


### Remove 'depth' and 'table' as they are not going to very useful in estimating diamonds price

In [4]:
diamonds.drop(['depth', 'table'], axis=1, inplace=True)

In [5]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,price,lenght_x,width_y,depth_z
0,0.23,Ideal,E,SI2,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,335,4.34,4.35,2.75


### Remove outliers from 'width_y' and 'depth_z'

In [6]:
outliers = diamonds[(diamonds['width_y'] > 30) | (diamonds['depth_z'] > 30)]
outliers.shape

(3, 8)

In [7]:
diamonds.drop(outliers.index, inplace=True)
diamonds.shape

(53917, 8)

In [60]:
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'price', 'lenght_x', 'width_y',
       'depth_z'],
      dtype='object')

In [67]:
dimensions = diamonds[['lenght_x', 'width_y',
       'depth_z']]

In [70]:
diamonds.drop(dimensions, inplace=True, axis=1)

### split dataset to train and test set

Create X and y

In [72]:
X = diamonds.drop('price', axis=1)
y = diamonds['price']

In [73]:
X.shape, y.shape

((53917, 4), (53917,))

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [75]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity
31746,0.35,Ideal,H,VVS1
20164,1.02,Very Good,F,VVS2
24521,1.52,Premium,D,SI1
42436,0.5,Ideal,E,SI1
10254,1.01,Premium,E,SI2


In [76]:
X_train.shape, y_train.shape

((43133, 4), (43133,))

### run Ordinal encoder on cut, color, clarity

In [77]:
categories = [['Fair', 'Good', 'Very Good', 'Ideal', 'Premium'],
             ['D', 'E', 'F', 'G', 'H', 'I', 'J'],
             ['I1','SI2', 'SI1', 'VS2',  'VS1', 'VVS2', 'VVS1', 'IF']]

In [78]:
obj_cols = ['cut', 'color', 'clarity']

In [79]:
ordinal_encoder = OrdinalEncoder(categories=categories)

In [80]:
X_train_cat = X_train[obj_cols]
X_test_cat = X_test[obj_cols]

In [81]:
ordinal_encoder.fit(X_train_cat)
X_train_oe = pd.DataFrame(ordinal_encoder.transform(X_train_cat))
X_test_oe = pd.DataFrame(ordinal_encoder.transform(X_test_cat))

In [82]:
X_train_oe.index = X_train.index
X_train_oe.columns = X_train_cat.columns

X_test_oe.index = X_test.index
X_test_oe.columns = X_test_cat.columns

### Normalise numerical variables with StandardScaler

In [83]:
num_X_train = X_train.drop(obj_cols, axis=1)
num_X_test = X_test.drop(obj_cols, axis=1)

In [84]:
scaler = StandardScaler()
num_X_train_scaled = pd.DataFrame(scaler.fit_transform(num_X_train))
num_X_train_scaled.columns = num_X_train.columns
num_X_train_scaled.index = num_X_train.index

num_X_test_scaled = pd.DataFrame(scaler.transform(num_X_test))
num_X_test_scaled.columns = num_X_test.columns
num_X_test_scaled.index = num_X_test.index

### Merge categorical and numerical variables

In [85]:
X_train = pd.concat([num_X_train_scaled, X_train_oe], axis=1)
X_test = pd.concat([num_X_test_scaled, X_test_oe], axis=1)

## Models

### Linear Regression

In [86]:
lm = LinearRegression()
lm_model = lm.fit(X_train, y_train)
y_pred_lm = lm_model.predict(X_test)

In [87]:
model_performance(y_pred_lm)

RMSE: 1199.195242716034
R2: 0.9523037641984099


### Polynomial regression

In [88]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)


In [89]:
X_train_poly = poly.fit_transform(X_train)
print(X_train_poly.shape)

(43133, 35)


In [90]:
pr = LinearRegression()

In [91]:
pr.fit(X_train_poly, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [92]:
X_test_poly = poly.fit_transform(X_test)

In [93]:
y_pred_pr = pr.predict(X_test_poly)

In [94]:
model_performance(y_pred_pr)

RMSE: 650.6561195344484
R2: 0.9861983143372905


### Random Forest Regressor

In [107]:
rr  = RandomForestRegressor()
rr.fit(X_train,y_train)
y_pred = rr.predict(X_test)

In [108]:
rr_mse = mean_squared_error(y_test, y_pred)
rr_rmse = np.sqrt(rr_mse)
rr_r2 = np.sqrt(r2_score((y_test),(y_pred)))

In [109]:
print(f'RMSE: {rr_rmse}')
print(f'R2: {rr_r2}')

RMSE: 566.1974646874742
R2: 0.9895665226038675


### Batch gradient descent

In [104]:
from sklearn.linear_model import SGDRegressor
sgd_reg  = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.01)
sgd_reg.fit(X_train,y_train)
y_pred = sgd_reg.predict(X_test)

In [105]:
sgd_mse = mean_squared_error(y_test, y_pred)
sgd_rmse = np.sqrt(sgd_mse)
sgd_r2 = np.sqrt(r2_score((y_test),(y_pred)))

In [106]:
print(f'RMSE: {sgd_rmse}')
print(f'R2: {sgd_r2}')

RMSE: 1199.1592799025793
R2: 0.9523066965266234
