In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [32]:
def model_performance(prediction):
    """Function calculating RMSE and R2"""
    mse = mean_squared_error(y_train, prediction)
    rmse = np.sqrt(mse)
    r2 = np.sqrt(r2_score((y_train),(prediction)))
    print(f'RMSE: {rmse}')
    print(f'R2: {r2}')

In [33]:
def cross_validation_scores(model):
    scores = cross_val_score(lm_model, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
    scores = np.sqrt(-scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("SD:", scores.std())

### Import the dataset saved after EDA

In [34]:
diamonds = pd.read_csv('diamonds_renamed_cols_removed_nulls.csv')
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,lenght_x,width_y,depth_z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


### Remove 'depth' and 'table' as they are not going to very useful in estimating diamonds price

In [35]:
diamonds.drop(['depth', 'table'], axis=1, inplace=True)

In [36]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,price,lenght_x,width_y,depth_z
0,0.23,Ideal,E,SI2,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,335,4.34,4.35,2.75


### Remove outliers from 'width_y' and 'depth_z'

In [37]:
outliers = diamonds[(diamonds['width_y'] > 30) | (diamonds['depth_z'] > 30)]
outliers.shape

(3, 8)

In [38]:
diamonds.drop(outliers.index, inplace=True)
diamonds.shape

(53917, 8)

In [39]:
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'price', 'lenght_x', 'width_y',
       'depth_z'],
      dtype='object')

In [40]:
dimensions = diamonds[['lenght_x', 'width_y',
       'depth_z']]

In [41]:
diamonds.drop(dimensions, inplace=True, axis=1)

### split dataset to train and test set

Create X and y

In [42]:
X = diamonds.drop('price', axis=1)
y = diamonds['price']

In [43]:
X.shape, y.shape

((53917, 4), (53917,))

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [45]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity
31746,0.35,Ideal,H,VVS1
20164,1.02,Very Good,F,VVS2
24521,1.52,Premium,D,SI1
42436,0.5,Ideal,E,SI1
10254,1.01,Premium,E,SI2


In [46]:
X_train.shape, y_train.shape

((43133, 4), (43133,))

### run Ordinal encoder on cut, color, clarity

In [47]:
categories = [['Fair', 'Good', 'Very Good', 'Ideal', 'Premium'],
             ['D', 'E', 'F', 'G', 'H', 'I', 'J'],
             ['I1','SI2', 'SI1', 'VS2',  'VS1', 'VVS2', 'VVS1', 'IF']]

In [48]:
obj_cols = ['cut', 'color', 'clarity']

In [49]:
ordinal_encoder = OrdinalEncoder(categories=categories)

In [50]:
X_train_cat = X_train[obj_cols]
X_test_cat = X_test[obj_cols]

In [51]:
ordinal_encoder.fit(X_train_cat)
X_train_oe = pd.DataFrame(ordinal_encoder.transform(X_train_cat))
X_test_oe = pd.DataFrame(ordinal_encoder.transform(X_test_cat))

In [52]:
X_train_oe.index = X_train.index
X_train_oe.columns = X_train_cat.columns

X_test_oe.index = X_test.index
X_test_oe.columns = X_test_cat.columns

### Normalise numerical variables with StandardScaler

In [53]:
num_X_train = X_train.drop(obj_cols, axis=1)
num_X_test = X_test.drop(obj_cols, axis=1)

from sklearn.ensemble import IsolationForest
iso = IsolationForest(contamination=0.05)
yhat = iso.fit_predict(num_X_train)

num_X_train['anomaly'] = yhat

outliers = num_X_train.loc[num_X_train['anomaly'] == -1]

num_X_train['anomaly'].value_counts()

In [54]:
scaler = StandardScaler()
num_X_train_scaled = pd.DataFrame(scaler.fit_transform(num_X_train))
num_X_train_scaled.columns = num_X_train.columns
num_X_train_scaled.index = num_X_train.index

num_X_test_scaled = pd.DataFrame(scaler.transform(num_X_test))
num_X_test_scaled.columns = num_X_test.columns
num_X_test_scaled.index = num_X_test.index

### Merge categorical and numerical variables

In [55]:
X_train = pd.concat([num_X_train_scaled, X_train_oe], axis=1)
X_test = pd.concat([num_X_test_scaled, X_test_oe], axis=1)

## Models

### Linear Regression

In [56]:
lm = LinearRegression()
lm_model = lm.fit(X_train, y_train)
pred_lm = lm_model.predict(X_train)

In [57]:
model_performance(pred_lm)

RMSE: 1247.355297291225
R2: 0.9501698583709396


In [58]:
cross_validation_scores(lm_model)

Scores: [1242.40776787 1261.36725906 1261.64915597 1240.43633245 1261.79709008
 1298.06043152 1172.19660381 1249.45733614 1248.99998064 1237.08181476]
Mean: 1247.345377230406
SD: 30.02344387881066


In [59]:
print(X_train.shape, y_train.shape)

(43133, 4) (43133,)


### Polynomial regression

In [60]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)


In [61]:
X_train_poly = poly.fit_transform(X_train)
print(X_train_poly.shape)

(43133, 35)


In [62]:
pr = LinearRegression()

In [63]:
pr.fit(X_train_poly, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [64]:
X_train_poly = poly.fit_transform(X_train)

In [65]:
pred_pr = pr.predict(X_train_poly)

In [66]:
model_performance(pred_pr)

RMSE: 672.962472045801
R2: 0.9857557339337831


### Batch gradient descent

In [67]:
from sklearn.linear_model import SGDRegressor
sgd_reg  = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.01)
sgd_reg.fit(X_train,y_train)
pred_bg = sgd_reg.predict(X_train)

In [68]:
model_performance(pred_bg)

RMSE: 1253.2421174975966
R2: 0.9496859221542275


### Random Forest Regressor

In [69]:
rr  = RandomForestRegressor()
rr.fit(X_train,y_train)
pred_rr = rr.predict(X_train)

In [78]:
X_train.columns

Index(['carat', 'cut', 'color', 'clarity'], dtype='object')

In [79]:
for name, score in zip(X_train.columns, rr.feature_importances_):
    print(name, score)


carat 0.8924265144931934
cut 0.00527534326581697
color 0.03440417833392357
clarity 0.06789396390706592


In [70]:
model_performance(pred_rr)

RMSE: 386.8554339088219
R2: 0.9953154265521508


In [71]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rr, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
rr_rmse_scores = np.sqrt(-scores)

In [72]:
display_scores(rr_rmse_scores)

Scores: [580.37372872 567.66323808 594.17640496 590.96501367 600.36669623
 594.40403794 575.09833078 571.53291557 606.75623018 625.53532795]
Mean: 590.6871924090699
SD: 16.84854588182465


{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}

In [73]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30, 50, 100, 500], 'max_features': ['auto', 'sqrt', 'log2']}    
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, 
                          scoring='neg_mean_squared_error',
                          return_train_score=True,
                          n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [74]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [75]:
cvres = grid_search.cv_results_

In [76]:
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

637.5271250961597 {'max_features': 'auto', 'n_estimators': 3}
610.0751310002712 {'max_features': 'auto', 'n_estimators': 10}
601.9481186272553 {'max_features': 'auto', 'n_estimators': 30}
599.8725456706911 {'max_features': 'auto', 'n_estimators': 50}
597.3059888576843 {'max_features': 'auto', 'n_estimators': 100}
596.5287003046673 {'max_features': 'auto', 'n_estimators': 500}
641.5186901377323 {'max_features': 'sqrt', 'n_estimators': 3}
606.4320884521677 {'max_features': 'sqrt', 'n_estimators': 10}
595.5685110275928 {'max_features': 'sqrt', 'n_estimators': 30}
594.8087241408857 {'max_features': 'sqrt', 'n_estimators': 50}
593.5721997643434 {'max_features': 'sqrt', 'n_estimators': 100}
592.061094347972 {'max_features': 'sqrt', 'n_estimators': 500}
640.9703425749484 {'max_features': 'log2', 'n_estimators': 3}
610.5164393695574 {'max_features': 'log2', 'n_estimators': 10}
595.5998318834471 {'max_features': 'log2', 'n_estimators': 30}
595.3192851838484 {'max_features': 'log2', 'n_estimator

In [80]:
import xgboost

ModuleNotFoundError: No module named 'xgboost'