# Pre-Processing and Training Data Development for Capstone 3 - Diamond Price Data

In [1]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics

### Import Files from EDA

In [2]:
# Load the diamonds CSV prepared for pre-processing into a DataFrame

diamonds_df = pd.read_csv(filepath_or_buffer='../Data/diamonds_4preproc.csv')

In [3]:
# Preview ten randomly selected rows of the loaded frame
diamonds_df.sample(n=10)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,pricePerCarat,carat_bins,carat_cats,cut_vals,color_vals,clarity_vals,symmetry_val
11485,1.13,Very Good,F,SI1,61.0,58.0,6.69,6.75,4.1,5017,4439.823009,"(0.99, 1.49]",1.00-1.49,3,5,3,0.991111
39090,0.41,Premium,H,VS1,62.6,60.0,4.77,4.75,2.98,1061,2587.804878,"(0.0, 0.49]",<0.49,4,3,5,1.004211
18092,1.21,Premium,G,VS2,62.2,58.0,6.78,6.88,4.25,7320,6049.586777,"(0.99, 1.49]",1.00-1.49,4,4,4,0.985465
13776,1.03,Very Good,H,VS2,59.9,59.0,6.5,6.52,3.9,5620,5456.31068,"(0.99, 1.49]",1.00-1.49,3,3,4,0.996933
6932,0.93,Premium,D,SI2,62.6,57.0,6.26,6.2,3.9,4140,4451.612903,"(0.49, 0.99]",0.50-0.99,4,7,2,1.009677
11976,1.06,Very Good,F,SI1,61.2,57.0,6.58,6.68,4.06,5142,4850.943396,"(0.99, 1.49]",1.00-1.49,3,5,3,0.98503
27933,0.3,Very Good,G,SI1,62.1,58.0,4.26,4.28,2.65,432,1440.0,"(0.0, 0.49]",<0.49,3,4,3,0.995327
10565,1.11,Good,I,VS1,59.2,64.0,6.8,6.77,4.02,4817,4339.63964,"(0.99, 1.49]",1.00-1.49,2,2,5,1.004431
28175,0.4,Ideal,H,SI1,62.0,56.0,4.68,4.74,2.92,666,1665.0,"(0.0, 0.49]",<0.49,5,3,3,0.987342
8926,1.1,Good,I,SI1,63.7,59.0,6.56,6.5,4.16,4497,4088.181818,"(0.99, 1.49]",1.00-1.49,2,2,3,1.009231


In [4]:
# Keep an independent copy of the frame as loaded, before any columns are dropped
diamonds_df2 = diamonds_df.copy(deep=True)

### Drop Encoded Columns and Get Dummies

In [5]:
# Drop the pre-encoded / binned columns; the categorical columns are
# one-hot encoded with get_dummies in a later cell instead.

cols = ['carat_bins', 'cut_vals', 'color_vals', 'clarity_vals']

# Pass the label list itself via `columns=` rather than a sub-DataFrame,
# which only worked because iterating a DataFrame yields its column names.
diamonds_df = diamonds_df.drop(columns=cols)

diamonds_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,pricePerCarat,carat_cats,symmetry_val
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326,1417.391304,<0.49,0.992462
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326,1552.380952,<0.49,1.013021
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327,1421.739130,<0.49,0.995086
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63,334,1151.724138,<0.49,0.992908
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335,1080.645161,<0.49,0.997701
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53912,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50,2757,3829.166667,0.50-0.99,0.998264
53913,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61,2757,3829.166667,0.50-0.99,0.989565
53914,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56,2757,3938.571429,0.50-0.99,0.996479
53915,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74,2757,3205.813953,0.50-0.99,1.004902


In [6]:
# One-hot encode the categorical columns (first level of each is dropped)
# and append the indicator columns to diamonds_df.
#
# dtype=np.uint8 pins the indicators to 0/1 integers. pandas >= 2.0 returns
# boolean columns by default, which would change both the displayed frame
# and the values written to the exported CSV files.

dummyCut = pd.get_dummies(diamonds_df['cut'], drop_first=True, dtype=np.uint8)
dummyColor = pd.get_dummies(diamonds_df['color'], drop_first=True, dtype=np.uint8)
dummyClarity = pd.get_dummies(diamonds_df['clarity'], drop_first=True, dtype=np.uint8)
# NOTE(review): dummyCaratCats is built but not included in the concat below,
# and 'carat_cats' is dropped in a later cell -- confirm this is intentional.
dummyCaratCats = pd.get_dummies(diamonds_df['carat_cats'], drop_first=True, dtype=np.uint8)

diamonds_df = pd.concat([diamonds_df, dummyCut, dummyColor, dummyClarity], axis=1)

diamonds_df.sample(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,...,H,I,J,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
38612,0.31,Premium,I,VS1,62.3,59.0,4.29,4.35,2.69,489,...,0,1,0,0,0,0,1,0,0,0
9437,0.9,Ideal,H,VS1,61.6,56.0,6.17,6.13,3.79,4592,...,1,0,0,0,0,0,1,0,0,0
40319,0.54,Very Good,J,VS2,63.0,57.0,5.17,5.21,3.27,1129,...,0,0,1,0,0,0,0,1,0,0
50773,0.71,Good,I,VS1,63.3,56.0,5.65,5.69,3.59,2306,...,0,1,0,0,0,0,1,0,0,0
1677,1.02,Premium,G,SI2,61.7,58.0,6.46,6.41,3.97,3027,...,0,0,0,0,0,1,0,0,0,0
29168,0.3,Ideal,E,VS1,61.3,58.0,4.29,4.32,2.64,694,...,0,0,0,0,0,0,1,0,0,0
4699,1.13,Ideal,H,I1,62.2,55.0,6.65,6.69,4.15,3678,...,1,0,0,0,0,0,0,0,0,0
30241,0.4,Very Good,E,SI1,63.0,57.0,4.68,4.71,2.96,725,...,0,0,0,0,1,0,0,0,0,0
51618,0.31,Good,G,VS2,63.1,58.0,4.3,4.35,2.73,544,...,0,0,0,0,0,0,0,1,0,0
51025,0.73,Ideal,H,SI1,62.8,57.0,5.77,5.72,3.61,2330,...,1,0,0,0,1,0,0,0,0,0


In [7]:
# Remove the original categorical columns, 'carat_cats' and 'pricePerCarat'

diamonds_df = diamonds_df.drop(
    columns=['cut', 'color', 'clarity', 'carat_cats', 'pricePerCarat']
)

diamonds_df.sample(n=5)

Unnamed: 0,carat,depth,table,x,y,z,price,symmetry_val,Good,Ideal,...,H,I,J,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
41029,0.51,62.0,56.0,5.13,5.16,3.19,1188,0.994186,0,1,...,0,0,0,0,0,1,0,0,0,0
12843,1.34,62.0,55.0,7.06,7.03,4.37,5358,1.004267,0,1,...,0,0,0,0,0,1,0,0,0,0
22430,1.53,61.0,61.0,7.4,7.45,4.53,10468,0.993289,0,0,...,1,0,0,0,1,0,0,0,0,0
38842,0.4,61.1,56.0,4.78,4.75,2.91,1050,1.006316,0,1,...,0,0,0,0,0,0,1,0,0,0
20807,1.55,62.0,58.0,7.44,7.39,4.6,9044,1.006766,0,1,...,1,0,0,0,0,1,0,0,0,0


### Create Training and Test Splits

In [8]:
# Separate the target ('price') from the feature columns of diamonds_df

y = diamonds_df['price']
X = diamonds_df.drop(columns='price')

In [9]:
# Hold out 30% of the rows for testing. random_state is fixed so that
# Restart & Run All reproduces the same split (and the same exported files).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [10]:
# Show the training and test feature frames as a tuple
(X_train, X_test)

(       carat  depth  table     x     y     z  symmetry_val  Good  Ideal  \
 23432   1.07   62.0   57.0  6.52  6.57  4.06      0.992390     0      1   
 16925   1.02   59.1   60.0  6.61  6.51  3.88      1.015361     0      0   
 45086   0.59   62.2   59.0  5.38  5.34  3.33      1.007491     0      1   
 53541   1.00   61.3   60.0  6.39  6.43  3.93      0.993779     0      0   
 14854   1.44   63.2   54.8  7.18  7.21  4.54      0.995839     1      0   
 ...      ...    ...    ...   ...   ...   ...           ...   ...    ...   
 44233   0.51   62.3   61.0  5.11  5.07  3.17      1.007890     0      0   
 48575   0.70   61.0   66.0  5.68  5.64  3.45      1.007092     0      0   
 50874   0.79   61.8   60.0  5.96  5.92  3.67      1.006757     0      0   
 9605    1.02   59.2   57.0  6.52  6.56  3.87      0.993902     0      1   
 14626   1.00   64.4   56.0  6.31  6.26  4.05      1.007987     0      0   
 
        Premium  ...  H  I  J  IF  SI1  SI2  VS1  VS2  VVS1  VVS2  
 23432        0  .

### Export Train/Test Files for Modeling

In [11]:
# Write the train/test features and targets to CSV, without the index column


X_train.to_csv(path_or_buf='../Data/Proc/X_train.csv', index=False)
X_test.to_csv(path_or_buf='../Data/Proc/X_test.csv', index=False)
y_train.to_csv(path_or_buf='../Data/Proc/y_train.csv', index=False)
y_test.to_csv(path_or_buf='../Data/Proc/y_test.csv', index=False)
