# Pre-Processing and Training Data Dev for Capstone 3 - Diamond Price Data

In [1]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics

### Import Files from EDA

In [2]:
# Read Files into Dataframes

diamonds_df = pd.read_csv('../Data/diamonds_4preproc.csv')

In [3]:
diamonds_df.sample(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,pricePerCarat,carat_bins,carat_cats,cut_vals,color_vals,clarity_vals,symmetry_val
44240,0.54,Ideal,G,SI1,59.6,59.0,5.34,5.3,3.17,1574,2914.814815,"(0.49, 0.99]",0.50-0.99,5,4,3,1.007547
50190,0.8,Good,D,SI2,64.0,60.0,5.91,5.81,3.75,2229,2786.25,"(0.49, 0.99]",0.50-0.99,2,7,2,1.017212
22461,1.51,Premium,G,SI1,62.2,58.0,7.26,7.3,4.53,10497,6951.655629,"(1.49, 1.99]",1.50-1.99,4,4,3,0.994521
8571,1.0,Fair,G,VS2,69.8,54.0,6.03,5.94,4.18,4435,4435.0,"(0.99, 1.49]",1.00-1.49,1,4,4,1.015152
14633,1.33,Very Good,J,VS2,63.9,57.0,6.91,6.96,4.43,5913,4445.864662,"(0.99, 1.49]",1.00-1.49,3,1,4,0.992816
48187,0.7,Premium,H,SI1,61.1,58.0,5.74,5.79,3.52,1948,2782.857143,"(0.49, 0.99]",0.50-0.99,4,3,3,0.991364
4934,0.9,Premium,H,SI1,60.7,59.0,6.26,6.19,3.78,3730,4144.444444,"(0.49, 0.99]",0.50-0.99,4,3,3,1.011309
25823,2.23,Premium,J,VS2,61.0,58.0,8.39,8.36,5.11,14867,6666.816143,"(1.99, 2.49]",2.00-2.49,4,1,4,1.003589
5851,0.7,Very Good,D,VVS1,61.5,63.0,5.78,5.64,3.51,3920,5600.0,"(0.49, 0.99]",0.50-0.99,3,7,7,1.024823
2087,0.79,Premium,D,SI1,61.4,59.0,5.89,5.96,3.64,3112,3939.240506,"(0.49, 0.99]",0.50-0.99,4,7,3,0.988255


In [None]:
diamonds_df2 = diamonds_df.copy()

### Drop Encoded Columns and Get Dummies

In [5]:
# Drop Encoded Columns

cols = ['carat_bins', 'cut_vals', 'color_vals', 'clarity_vals']

diamonds_df = diamonds_df.drop(diamonds_df[cols], axis=1)

diamonds_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,pricePerCarat,carat_bins,carat_cats,symmetry_val
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326,1417.391304,"(0.0, 0.49]",<0.49,0.992462
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326,1552.380952,"(0.0, 0.49]",<0.49,1.013021
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327,1421.739130,"(0.0, 0.49]",<0.49,0.995086
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63,334,1151.724138,"(0.0, 0.49]",<0.49,0.992908
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335,1080.645161,"(0.0, 0.49]",<0.49,0.997701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53912,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50,2757,3829.166667,"(0.49, 0.99]",0.50-0.99,0.998264
53913,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61,2757,3829.166667,"(0.49, 0.99]",0.50-0.99,0.989565
53914,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56,2757,3938.571429,"(0.49, 0.99]",0.50-0.99,0.996479
53915,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74,2757,3205.813953,"(0.49, 0.99]",0.50-0.99,1.004902


In [6]:
# Get Dummies For, and Drop Categorical columns

dummyCut = pd.get_dummies(diamonds_df['cut'],drop_first=True)
dummyColor = pd.get_dummies(diamonds_df['color'],drop_first=True)
dummyClarity = pd.get_dummies(diamonds_df['clarity'],drop_first=True)
dummyCaratCats = pd.get_dummies(diamonds_df['carat_cats'], drop_first=True)

diamonds_df = pd.concat([diamonds_df,dummyCut,dummyColor,dummyClarity],axis=1)

diamonds_df.sample(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,...,H,I,J,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
648,0.72,Ideal,E,VS2,62.0,57.0,5.71,5.74,3.55,2843,...,0,0,0,0,0,0,0,1,0,0
18933,1.21,Very Good,H,VS1,62.8,54.4,6.78,6.81,4.26,7786,...,1,0,0,0,0,0,1,0,0,0
6904,0.9,Very Good,H,VS1,61.8,56.0,6.16,6.18,3.81,4136,...,1,0,0,0,0,0,1,0,0,0
7576,1.0,Premium,F,SI2,61.5,58.0,6.38,6.41,3.93,4255,...,0,0,0,0,0,1,0,0,0,0
41147,0.38,Very Good,D,VVS2,60.0,62.0,4.7,4.73,2.83,1200,...,0,0,0,0,0,0,0,0,0,1
17868,1.41,Very Good,F,SI2,61.4,59.0,7.17,7.2,4.41,7215,...,0,0,0,0,0,1,0,0,0,0
48003,0.7,Good,J,VVS2,63.2,55.0,5.63,5.57,3.54,1922,...,0,0,1,0,0,0,0,0,0,1
18361,0.3,Ideal,D,VS2,62.4,55.0,4.3,4.32,2.69,616,...,0,0,0,0,0,0,0,1,0,0
29844,0.3,Ideal,D,VS2,62.2,55.0,4.29,4.33,2.68,710,...,0,0,0,0,0,0,0,1,0,0
36464,0.32,Ideal,F,IF,60.9,57.0,4.42,4.45,2.7,943,...,0,0,0,1,0,0,0,0,0,0


In [8]:
# Drop Categorical columns

diamonds_df = diamonds_df.drop(['cut','color','clarity','carat_bins','carat_cats', 'pricePerCarat'], axis=1)

diamonds_df.sample(5)

Unnamed: 0,carat,depth,table,x,y,z,price,pricePerCarat,carat_bins,carat_cats,...,H,I,J,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
36406,0.31,61.2,55.0,4.42,4.37,2.69,942,3038.709677,"(0.0, 0.49]",<0.49,...,0,0,0,0,0,0,0,1,0,0
6806,1.01,63.0,56.0,6.34,6.3,3.98,4118,4077.227723,"(0.99, 1.49]",1.00-1.49,...,0,0,0,0,0,1,0,0,0,0
50006,0.51,61.7,56.0,5.11,5.13,3.16,2197,4307.843137,"(0.49, 0.99]",0.50-0.99,...,0,0,0,0,0,0,0,0,0,1
53832,0.51,61.3,57.0,5.17,5.14,3.16,2742,5376.470588,"(0.49, 0.99]",0.50-0.99,...,0,0,0,0,0,0,0,0,0,1
23077,1.75,62.9,57.0,7.71,7.59,4.81,11113,6350.285714,"(1.49, 1.99]",1.50-1.99,...,0,0,0,0,0,1,0,0,0,0


### Create Training and Test Splits

In [None]:
# Create Training and Test sets for diamonds_df

X = diamonds_df.drop('price',axis=1)
y = diamonds_df['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
# Create Training and Test sets for diamonds_df

X2 = diamonds_df2.drop('price',axis=1)
y2 = diamonds_df2['price']

In [None]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3)

### Export Train/Test Files for Modeling

In [None]:
# Export Files

'''
sp_df.to_csv('../Data/Proc/diamonds_train.csv', index=False)
p1_dt.to_csv('../Data/Proc/p1_dt.csv', index=False)
p2_dt.to_csv('../Data/Proc/p2_dt.csv', index=False)
'''