In [65]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

In [66]:
diamond_filepath = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQi-lpl1Sw68Lk5-O2u1dTkJG2ejS0-1oMm5X301d9SNtyWDI2aiupkELkSEphW5qda4OvU7C2ztJwS/pub?output=csv'

In [67]:
# Load in the data
df = pd.read_csv(diamond_filepath)

In [68]:
# Take a look at the first five rows of the dataframe
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326.0,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326.0,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327.0,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334.0,4.2,4.23,2.63
4,5,,Good,J,SI2,63.3,58.0,335.0,4.34,4.35,2.75


In [69]:
# Look at the info from the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53920 non-null  float64
 2   cut         53919 non-null  object 
 3   color       53932 non-null  object 
 4   clarity     53936 non-null  object 
 5   depth       53934 non-null  float64
 6   table       53935 non-null  float64
 7   price       53934 non-null  float64
 8   x           53935 non-null  float64
 9   y           53934 non-null  float64
 10  z           53934 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 4.5+ MB


In [70]:
# Check to see if there are any duplicates
df.duplicated().sum()

0

In [71]:
# split X and y, you are predicting price
X = df.drop(columns= ['Unnamed: 0', 'price'])
y = df['price']
# split training and test
# set random_state to 42 for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

In [72]:
# instantiate the column selectors
num_selector = make_column_selector(dtype_include= 'number')
cat_selector = make_column_selector(dtype_include= 'object')

mean_imputer = SimpleImputer(strategy = 'mean')
freq_imputer = SimpleImputer(strategy = 'most_frequent')

In [73]:
#instantiate the standard scaler, OneHotEncoder and Imputers
scaler = StandardScaler()
encoder = OneHotEncoder(sparse = False, handle_unknown='ignore')

In [74]:
# Setup the pipelines for the numeric and categorical data
num_processor = make_pipeline(mean_imputer, scaler)
cat_processor = make_pipeline(freq_imputer, encoder)


In [75]:
# Setup the tuples to pair the processors with the make column selecotrs
num_tuple = (num_processor, num_selector)
cat_tuple = (cat_processor, cat_selector)


In [76]:
# Instantiate the make column transformer
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')


In [77]:
# Fit the column transformer on the X_train
X_train_processed = col_transformer.fit(X_train)


In [78]:
# Get out the column transformer steps
col_transformer.named_transformers_

{'pipeline-1': Pipeline(steps=[('simpleimputer', SimpleImputer()),
                 ('standardscaler', StandardScaler())]),
 'pipeline-2': Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse=False))])}

In [79]:
# Reference the pipeline that has the one hot encoder
col_transformer.named_transformers_['pipeline-2']

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder',
                 OneHotEncoder(handle_unknown='ignore', sparse=False))])

In [80]:
# Display the feature names out from the one hot encoder
cat_pipe.named_steps['onehotencoder'].get_feature_names_out(cat_selector(X_train))

array(['cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium',
       'cut_Very Good', 'color_D', 'color_E', 'color_F', 'color_G',
       'color_H', 'color_I', 'color_J', 'clarity_I1', 'clarity_IF',
       'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2',
       'clarity_VVS1', 'clarity_VVS2'], dtype=object)

In [83]:
# Extract the feature names from one hot encoder
cat_feature_names = col_transformer.named_transformers_['pipeline-2'].named_steps['onehotencoder'].get_feature_names_out(cat_selector(X_train))

cat_feature_names

array(['cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium',
       'cut_Very Good', 'color_D', 'color_E', 'color_F', 'color_G',
       'color_H', 'color_I', 'color_J', 'clarity_I1', 'clarity_IF',
       'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2',
       'clarity_VVS1', 'clarity_VVS2'], dtype=object)

In [84]:
## get final col names which are num features + cat features
final_cols = num_selector(X_train) + list(cat_feature_names)
final_cols

['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2']

In [85]:
# Transform the X_train and the X_test
X_train_transformed = col_transformer.transform(X_train)
X_test_transformed = col_transformer.transform(X_test)


In [86]:
# Change the X_train and X_test transformed columns to a dataframe
X_train_output = pd.DataFrame(X_train_transformed, columns = final_cols)
X_test_output = pd.DataFrame(X_test_transformed, columns = final_cols)

In [87]:
# View the first five rows of the X_train transformed dataframe
X_train_output.head()


Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,-1.156764,2.207835,0.242456,-1.590143,-1.544522,-1.365942,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.086861,0.038503,-0.654927,0.273488,0.291459,0.282087,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.529507,-0.451347,0.242456,0.737166,0.676141,0.63423,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.466271,-0.731261,-0.654927,0.710416,0.667398,0.577887,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.397942,0.038503,-0.206236,-0.270443,-0.233107,-0.239085,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [88]:
# View the first five rows of the X_test transformed dataframe

X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
1388,0.24,Ideal,G,VVS1,62.1,56.0,3.97,4.0,2.47
50052,0.58,Very Good,F,VVS2,60.0,57.0,5.44,5.42,3.26
41645,0.4,Ideal,E,VVS2,62.1,55.0,4.76,4.74,2.95
42377,0.43,Premium,E,VVS2,60.8,57.0,4.92,4.89,2.98
17244,1.55,Ideal,E,SI2,62.3,55.0,7.44,7.37,4.61


In [89]:
# View the info from the X_train transformed dataframe
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 40455 entries, 35965 to 15795
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40443 non-null  float64
 1   cut      40438 non-null  object 
 2   color    40449 non-null  object 
 3   clarity  40451 non-null  object 
 4   depth    40452 non-null  float64
 5   table    40451 non-null  float64
 6   x        40450 non-null  float64
 7   y        40451 non-null  float64
 8   z        40451 non-null  float64
dtypes: float64(6), object(3)
memory usage: 3.1+ MB


In [90]:
# View the info from the X_test transformed dataframe
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13485 entries, 1388 to 4204
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    13477 non-null  float64
 1   cut      13481 non-null  object 
 2   color    13483 non-null  object 
 3   clarity  13485 non-null  object 
 4   depth    13482 non-null  float64
 5   table    13484 non-null  float64
 6   x        13485 non-null  float64
 7   y        13483 non-null  float64
 8   z        13483 non-null  float64
dtypes: float64(6), object(3)
memory usage: 1.0+ MB
