# Pre-Processing and Training Data Development for Capstone 3 - Diamond Price Data

In [1]:
# Import Libraries

from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics

### Import Files from EDA

In [2]:
# Read Files into Dataframes
# Loads the diamonds CSV handed over from the EDA step; the path is relative,
# so it resolves against the kernel's working directory.

diamonds_df = pd.read_csv('../Data/diamonds_4preproc.csv')

In [3]:
# Preview 10 randomly sampled rows (no seed is set, so the rows differ on each run)
diamonds_df.sample(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,pricePerCarat,carat_bins,carat_cats,cut_vals,color_vals,clarity_vals,symmetry_val
22788,1.74,Very Good,I,SI1,58.7,60.0,7.84,7.9,4.62,10823,6220.114943,"(1.49, 1.99]",1.50-1.99,3,2,3,0.992405
41211,0.4,Very Good,E,IF,62.9,58.0,4.68,4.77,2.97,1207,3017.5,"(0.0, 0.49]",<0.49,3,6,8,0.981132
29126,0.33,Ideal,E,VS1,61.5,56.0,4.45,4.5,2.75,692,2096.969697,"(0.0, 0.49]",<0.49,5,6,5,0.988889
40149,0.38,Ideal,F,VVS2,61.0,57.0,4.69,4.72,2.87,1117,2939.473684,"(0.0, 0.49]",<0.49,5,5,6,0.993644
14377,0.3,Premium,H,VVS2,62.4,58.0,4.26,4.3,2.67,605,2016.666667,"(0.0, 0.49]",<0.49,4,3,6,0.990698
16218,1.01,Premium,E,VS2,62.4,60.0,6.39,6.43,4.0,6488,6423.762376,"(0.99, 1.49]",1.00-1.49,4,6,4,0.993779
30338,0.3,Very Good,D,VVS2,59.0,59.0,4.4,4.42,2.6,729,2430.0,"(0.0, 0.49]",<0.49,3,7,6,0.995475
15151,1.04,Very Good,G,VS2,62.3,54.0,6.49,6.55,4.06,6095,5860.576923,"(0.99, 1.49]",1.00-1.49,3,4,4,0.99084
39633,0.4,Ideal,G,VS1,61.4,55.0,4.77,4.78,2.93,1087,2717.5,"(0.0, 0.49]",<0.49,5,4,5,0.997908
1794,0.7,Very Good,G,VVS2,60.2,61.0,5.66,5.74,3.43,3052,4360.0,"(0.49, 0.99]",0.50-0.99,3,4,6,0.986063


In [4]:
# Snapshot the frame as loaded, before the column drops and dummy encoding below.
# NOTE(review): diamonds_df2 is not referenced again in the visible cells.
diamonds_df2 = diamonds_df.copy()

### Drop Encoded Columns and Get Dummies

In [5]:
# Drop Encoded Columns

# The carat bin intervals and the integer-encoded cut/color/clarity columns that
# came in with the EDA file are removed; cut/color/clarity are one-hot encoded
# in a later cell instead.
cols = ['carat_bins', 'cut_vals', 'color_vals', 'clarity_vals']

# Pass the labels directly via `columns=` rather than building a sub-DataFrame
# and relying on its iteration yielding column names.
diamonds_df = diamonds_df.drop(columns=cols)

diamonds_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,pricePerCarat,carat_cats,symmetry_val
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326,1417.391304,<0.49,0.992462
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326,1552.380952,<0.49,1.013021
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327,1421.739130,<0.49,0.995086
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63,334,1151.724138,<0.49,0.992908
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335,1080.645161,<0.49,0.997701
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53912,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50,2757,3829.166667,0.50-0.99,0.998264
53913,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61,2757,3829.166667,0.50-0.99,0.989565
53914,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56,2757,3938.571429,0.50-0.99,0.996479
53915,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74,2757,3205.813953,0.50-0.99,1.004902


In [6]:
# Get Dummies for Categorical columns

# One-hot encode each categorical feature. drop_first=True yields k-1 dummy
# columns per feature by omitting its first level.
# dtype='uint8' pins 0/1 integer output: pandas >= 2.0 defaults to bool, which
# would write True/False into the exported CSVs instead of 0/1.
dummyCut = pd.get_dummies(diamonds_df['cut'], drop_first=True, dtype='uint8')
dummyColor = pd.get_dummies(diamonds_df['color'], drop_first=True, dtype='uint8')
dummyClarity = pd.get_dummies(diamonds_df['clarity'], drop_first=True, dtype='uint8')
# NOTE(review): dummyCaratCats is built but not included in the concat below, and
# 'carat_cats' is dropped in a later cell; confirm whether these dummies were
# meant to be model features.
dummyCaratCats = pd.get_dummies(diamonds_df['carat_cats'], drop_first=True, dtype='uint8')

# Append the dummy columns; the original string columns are removed in a later cell.
diamonds_df = pd.concat([diamonds_df, dummyCut, dummyColor, dummyClarity], axis=1)

diamonds_df.sample(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,...,H,I,J,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
18372,0.32,Very Good,E,VS1,61.6,56.0,4.4,4.46,2.73,618,...,0,0,0,0,0,0,1,0,0,0
47061,0.52,Ideal,D,VS2,62.7,53.0,5.14,5.17,3.23,1822,...,0,0,0,0,0,0,0,1,0,0
39911,0.31,Ideal,E,VVS1,61.5,56.0,4.34,4.37,2.68,1105,...,0,0,0,0,0,0,0,0,1,0
28611,0.26,Ideal,H,VS2,61.3,56.0,4.09,4.13,2.52,434,...,1,0,0,0,0,0,0,1,0,0
15131,1.43,Fair,E,SI2,66.0,57.0,7.02,7.0,4.63,6086,...,0,0,0,0,0,1,0,0,0,0
9696,0.32,Very Good,D,SI1,63.0,58.0,4.36,4.4,2.76,589,...,0,0,0,0,1,0,0,0,0,0
35740,0.41,Very Good,D,VS2,62.1,53.0,4.78,4.82,2.98,912,...,0,0,0,0,0,0,0,1,0,0
44848,0.73,Good,E,I1,56.9,60.0,5.98,5.93,3.39,1628,...,0,0,0,0,0,0,0,0,0,0
28818,0.41,Very Good,H,SI1,62.9,58.0,4.72,4.73,2.97,683,...,1,0,0,0,1,0,0,0,0,0
21326,1.57,Premium,I,VS2,62.2,59.0,7.44,7.42,4.62,9478,...,0,1,0,0,0,0,0,1,0,0


In [8]:
# Remove the original string-valued categorical columns, plus pricePerCarat

# NOTE(review): pricePerCarat looks like price / carat in the sampled rows, so
# keeping it would presumably leak the target into the features; confirm.
unwanted_columns = ['cut', 'color', 'clarity', 'carat_cats', 'pricePerCarat']
diamonds_df = diamonds_df.drop(columns=unwanted_columns)

diamonds_df.sample(5)

Unnamed: 0,carat,depth,table,x,y,z,price,symmetry_val,Good,Ideal,...,H,I,J,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
36344,0.33,62.5,56.0,4.43,4.44,2.77,939,0.997748,0,1,...,0,0,0,0,0,0,0,0,1,0
4597,1.02,62.9,57.0,6.41,6.37,4.02,3656,1.006279,0,1,...,0,1,0,0,1,0,0,0,0,0
16094,1.01,59.4,58.0,6.46,6.54,3.86,6439,0.987768,0,0,...,0,0,0,0,0,0,1,0,0,0
30816,0.33,60.2,57.0,4.54,4.5,2.72,743,1.008889,0,1,...,1,0,0,0,0,0,1,0,0,0
4410,1.01,58.8,58.0,6.52,6.57,3.85,3610,0.99239,0,0,...,0,1,0,0,1,0,0,0,0,0


### Create Training and Test Splits

In [9]:
# Separate the target column (price) from the feature matrix of diamonds_df

y = diamonds_df['price']
X = diamonds_df.drop(columns='price')

In [10]:
# Hold out 30% of the rows for testing. random_state fixes the shuffle so the
# split (and the files exported from it) is the same on every fresh run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# Display both feature splits (as a tuple) to inspect the result of the split
X_train, X_test

(       carat  depth  table     x     y     z  symmetry_val  Good  Ideal  \
 3470    0.76   61.5   59.0  5.83  5.91  3.61      0.986464     0      0   
 40313   0.50   61.4   63.0  5.06  5.10  3.12      0.992157     0      0   
 22103   1.24   60.9   54.0  6.98  6.95  4.24      1.004317     0      1   
 29329   0.31   62.4   54.0  4.39  4.36  2.73      1.006881     0      1   
 25145   2.07   61.7   60.0  8.23  8.15  5.05      1.009816     0      0   
 ...      ...    ...    ...   ...   ...   ...           ...   ...    ...   
 40820   0.50   62.9   58.0  5.04  5.01  3.16      1.005988     0      0   
 52543   0.70   62.9   58.0  5.67  5.65  3.56      1.003540     0      0   
 29668   0.40   61.5   60.0  4.77  4.70  2.91      1.014894     0      0   
 27430   2.01   61.8   60.0  7.99  8.04  4.95      0.993781     0      0   
 45707   0.72   63.3   57.0  5.70  5.67  3.60      1.005291     0      0   
 
        Premium  ...  H  I  J  IF  SI1  SI2  VS1  VS2  VVS1  VVS2  
 3470         0  .

### Export Train/Test Files for Modeling

In [None]:
# Export Files

# Write the four splits as CSVs. index=False omits the shuffled row labels, so
# matching X/y files line up by row position.
proc_dir = Path('../Data/Proc')
# to_csv does not create missing folders, so make sure the target directory exists.
proc_dir.mkdir(parents=True, exist_ok=True)

X_train.to_csv(proc_dir / 'X_train.csv', index=False)
X_test.to_csv(proc_dir / 'X_test.csv', index=False)
y_train.to_csv(proc_dir / 'y_train.csv', index=False)
y_test.to_csv(proc_dir / 'y_test.csv', index=False)
