## Build Random Forest Regressor Model for App

### 1. Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import joblib
import pickle
import os
import warnings
# Silence every warning so that library chatter does not clutter the
# notebook output.
# NOTE(review): this is a blanket filter, so it also hides warnings that
# could point at real problems - consider narrowing it by category.
warnings.filterwarnings(action='ignore')

# Never elide columns when a wide DataFrame is displayed.
pd.options.display.max_columns = None

### 2. Load the dataset

In [2]:
# Location of the raw data; the relative directory is resolved against the
# notebook's current working directory.
DATA_DIR = '../data'
FILE_NAME = 'diamonds.csv'

# Join directory and file name with the host OS separator, then read the
# CSV into a DataFrame.
data_path = os.path.join(DATA_DIR, FILE_NAME)
diamonds = pd.read_csv(filepath_or_buffer=data_path)

### 3. Prepare the dataset

In [3]:
# Keep only the rows where at least one of the x / y dimensions is positive.
has_positive_xy = diamonds['x'].gt(0) | diamonds['y'].gt(0)
diamonds = diamonds.loc[has_positive_xy]

# Overwrite the x and z values of one specific row (index label 11182) with
# the respective column medians. The medians are computed on the filtered
# data, before the row itself is patched.
# NOTE(review): presumably this row has unusable x/z measurements but a
# valid y, which is why it survives the filter above - verify against the
# data. The label is hard-coded, so a different CSV would silently patch
# (or create) the wrong row.
PATCHED_ROW = 11182
for dimension in ('x', 'z'):
    diamonds.loc[PATCHED_ROW, dimension] = diamonds[dimension].median()

# Discard rows whose y or z value exceeds 30.
oversized = diamonds['y'].gt(30) | diamonds['z'].gt(30)
diamonds = diamonds.loc[~oversized]

# One-hot encode the categorical columns. drop_first=True leaves out the
# first level of each variable so the indicator columns are not redundant.
# The original categorical columns are kept here; they are dropped later
# when the feature matrix is built.
for categorical in ('cut', 'color', 'clarity'):
    indicators = pd.get_dummies(diamonds[categorical], prefix=categorical, drop_first=True)
    diamonds = pd.concat([diamonds, indicators], axis=1)

### 4. Dimensionality reduction using PCA

In [4]:
from sklearn.decomposition import PCA

# The three physical dimensions are collapsed into a single component,
# stored as 'dim_index'.
dimension_columns = ['x', 'y', 'z']

# The fitted `pca` object is kept at module level because it is serialized
# further down and is needed to transform new inputs the same way.
pca = PCA(n_components=1, random_state=123)
first_component = pca.fit_transform(diamonds[dimension_columns])
diamonds['dim_index'] = first_component

# The raw dimensions are now represented by 'dim_index'; remove them.
diamonds.drop(columns=dimension_columns, inplace=True)

### 5. Produce the objects to train the model

In [5]:
from sklearn.preprocessing import StandardScaler

## The model choice has already been made, so the whole dataset is used for
## training (no train/test split here).

# Features: everything except the raw categorical columns (already one-hot
# encoded) and the target.
non_feature_columns = ['cut', 'color', 'clarity', 'price']
X = diamonds.drop(columns=non_feature_columns)

# Target: natural log of the price, so predictions are on the log scale.
y = np.log(diamonds['price'])

## Standardization: center and scale the continuous features only; the
## one-hot indicator columns are left untouched.
numerical_features = ['carat', 'depth', 'table', 'dim_index']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[numerical_features])
X.loc[:, numerical_features] = scaled_values

### 5. Build Random Forest Regressor

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Train a random forest on the full, preprocessed feature matrix X against
# the log-price target y.
# The seed is fixed so that re-running the notebook rebuilds the same forest
# (bootstrap samples and feature sub-sampling are otherwise random), which
# keeps the serialized model reproducible. The value matches the seed used
# for the PCA step.
RF = RandomForestRegressor(random_state=123)
RF.fit(X, y)

RandomForestRegressor()

### 6. Serialize the objects created

In [7]:
## Serializing: write the fitted preprocessing objects and the trained model
## to the current working directory with joblib.
PCA_FILE = 'pca.joblib'
SCALER_FILE = 'scaler.joblib'
MODEL_FILE = 'diamond-prices-model.joblib'

# Fitted PCA (x, y, z -> dim_index)
joblib.dump(pca, PCA_FILE)

# Fitted StandardScaler for the numerical features
joblib.dump(scaler, SCALER_FILE)

# Trained model; compress=9 is the strongest joblib compression level,
# trading slower dump/load for a smaller file.
# (joblib is used here instead of a plain pickle.dump of the model.)
joblib.dump(RF, MODEL_FILE, compress=9)


In [8]:
# Display the standardized feature matrix that the model was trained on.
X

Unnamed: 0,carat,depth,table,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,dim_index
0,-1.198145,-0.174093,-1.099851,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,-1.590285
1,-1.240342,-1.360850,1.585871,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,-1.693422
2,-1.198145,-3.385318,3.376352,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,-1.546543
3,-1.071552,0.454190,0.243010,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,-1.355394
4,-1.029354,1.082473,0.243010,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,-1.229507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,-0.164302,-0.662758,-0.204611,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.007466
53936,-0.164302,0.942855,-1.099851,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.006569
53937,-0.206500,0.733427,1.138250,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,-0.042676
53938,0.131081,-0.523140,0.243010,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0.349688
