## Build Random Forest Regressor Model for App

### 1. Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import joblib
import os
import warnings
# Notebook-wide settings
warnings.simplefilter('ignore')        # suppress every warning category for the whole session
pd.options.display.max_columns = None  # no limit on the number of columns shown when a frame is rendered

### 2. Load the dataset

In [2]:
# Location of the raw CSV, given relative to the notebook's working directory
DATA_DIR = '../data'
FILE_NAME = 'diamonds.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
# Load the CSV into a DataFrame using pandas' default parsing options
diamonds = pd.read_csv(data_path)

### 3. Preparing the dataset

In [3]:
# --- Clean the physical dimensions -------------------------------------------
# Keep rows where at least one of x / y is positive; rows where both are
# non-positive are dropped. The explicit .copy() makes the result an independent
# frame, so the label-based assignments below write to it directly rather than
# to a slice of the loaded data (which pandas would flag with a
# SettingWithCopyWarning).
diamonds = diamonds.loc[(diamonds['x'] > 0) | (diamonds['y'] > 0)].copy()

# Hand-patch row label 11182: overwrite its x and z with the column medians.
# NOTE(review): presumably this row has x == 0 and z == 0 but a valid y — confirm against the raw data.
diamonds.loc[11182, 'x'] = diamonds['x'].median()
diamonds.loc[11182, 'z'] = diamonds['z'].median()

# Drop outliers: any row whose y or z exceeds 30.
diamonds = diamonds.loc[~((diamonds['y'] > 30) | (diamonds['z'] > 30))]

# --- One-hot encode the categorical columns ----------------------------------
# A single get_dummies call over the three columns yields the same
# '<column>_<level>' names, in the same order, as encoding them one by one;
# drop_first removes the first level of each column. dtype is pinned to uint8 so
# the indicators are 0/1 regardless of the installed pandas version (the default
# dtype became bool in pandas 2.0). The original string columns are kept.
diamonds = pd.concat(
    [
        diamonds,
        pd.get_dummies(
            diamonds[['cut', 'color', 'clarity']],
            drop_first=True,
            dtype='uint8',
        ),
    ],
    axis=1,
)

### 4. Dimensionality reduction using PCA

In [4]:
from sklearn.decomposition import PCA

# Collapse the three dimension columns into one principal component.
# The fitted `pca` object is kept because it is serialized later in the notebook.
pca = PCA(random_state=123, n_components=1)

# fit_transform returns an (n_rows, 1) array; take its only column as the new feature
diamonds['dim_index'] = pca.fit_transform(diamonds[['x', 'y', 'z']])[:, 0]

# The raw dimensions are now represented by dim_index, so remove them
diamonds.drop(columns=['x', 'y', 'z'], inplace=True)

### 5. Produce the objects to train the model

In [5]:
from sklearn.preprocessing import StandardScaler

## The model choice is already settled, so every row is used for fitting (no hold-out split)

# Feature matrix: everything except the un-encoded categorical columns and the target
X = diamonds.drop(columns=['cut', 'color', 'clarity', 'price'])

# Target: natural logarithm of price
y = np.log(diamonds['price'])

## Standardization: centre and scale the continuous features
numerical_features = ['carat', 'depth', 'table', 'dim_index']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[numerical_features])

# Write the scaled values back over the same columns of X
X.loc[:, numerical_features] = scaled_values

### 5. Build Random Forest Regressor

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Fit the forest on the full feature matrix X and the log-price target y.
# random_state is fixed (the same seed the PCA step uses) so that re-running the
# notebook reproduces the same fitted model; without it the bootstrap samples and
# feature sub-sampling differ on every run, and so does the serialized artifact.
RF = RandomForestRegressor(random_state=123)
RF.fit(X, y)

RandomForestRegressor()

### 6. Serialize the objects created

In [7]:
## Serializing:
# Persist the fitted preprocessing objects (PCA, then scaler) under relative
# filenames, i.e. into the notebook's current working directory.
for fitted_obj, target_file in ((pca, 'pca.joblib'), (scaler, 'scaler.joblib')):
    joblib.dump(fitted_obj, target_file)

# Trained model, kept as the cell's last expression so the written path is displayed.
# NOTE(review): the recorded cell output shows '../App/diamond-prices-model1.joblib',
# which does not match the filename written here — confirm the path/name the app loads.
joblib.dump(RF, 'diamond-prices-mode.joblib')

['../App/diamond-prices-model1.joblib']

In [8]:
# Display the prepared frame (original categoricals, dummy columns, dim_index) as the cell output
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,dim_index
0,0.23,Ideal,E,SI2,61.5,55.0,326,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,-2.735311
1,0.21,Premium,E,SI1,59.8,61.0,326,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,-2.912708
2,0.23,Good,E,VS1,56.9,65.0,327,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,-2.660075
3,0.29,Premium,I,VS2,62.4,58.0,334,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,-2.331296
4,0.31,Good,J,SI2,63.3,58.0,335,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,-2.114768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.012841
53936,0.72,Good,D,SI1,63.1,55.0,2757,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.011299
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,-0.073404
53938,0.86,Premium,H,SI2,61.0,58.0,2757,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0.601467
