### Model training

In [1]:
import pandas as pd

In [2]:
# Read the dataset from a CSV file located in the current working directory.
df = pd.read_csv("cubic_zirconia.csv")
# Show the first three rows as a quick check of the parsed columns.
df.head(3)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289


In [3]:
# Remove the "Unnamed: 0" column (presumably a row index that was written out
# when the CSV was saved -- it carries no predictive information).
# errors="ignore" keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been dropped.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Preview the first two rows of the frame after the column drop above.
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984


In [5]:
## Separate the predictors (X) from the target (y)

# Everything except the price column is a feature; price is what we predict.
X = df.drop(columns="price")
y = df.loc[:, "price"]

In [6]:
# Split the feature columns by dtype: object columns are the ones that get
# ordinal-encoded, every remaining column is treated as numeric and scaled.

categorical_feature = X.select_dtypes(include=["object"]).columns
numerical_feature = X.select_dtypes(exclude=["object"]).columns

In [7]:
# Display the column labels that were selected as object-dtype features.
categorical_feature

Index(['cut', 'color', 'clarity'], dtype='object')

In [8]:
# Explicit rank order for each ordinal feature. These lists are handed to
# OrdinalEncoder, which encodes a label as its position in the list, so the
# order here is what defines the numeric ranking the model sees.

# Worst -> best.
cut_categories = ["Fair", "Good", "Very Good", "Premium", "Ideal"]

# Grade-letter order. D is the top colour grade, so this axis runs
# best -> worst -- the opposite direction to cut and clarity. That is still a
# valid monotonic encoding; just keep the sign in mind when reading results.
color_categories = ["D", "E", "F", "G", "H", "I", "J"]

# Worst -> best. Within each grade pair the "2" grade is the lower one
# (SI2 < SI1, VS2 < VS1), so VVS2 must come before VVS1.
clarity_categories = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

### Pipeline creation

In [9]:
from sklearn.impute import SimpleImputer ## For handling missing values
from sklearn.preprocessing import StandardScaler ## For handling feature scaling
from sklearn.preprocessing import OrdinalEncoder ## For Categorical Ordinal Encoding

# Pipelines

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
## Numeric branch: fill missing values with the column median, then standardise

numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)


## Categorical branch: fill missing values with the most frequent label,
## replace each label by its rank, then standardise the resulting codes

categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinal_encoder",
            # One category list per input column, given in the same order as
            # the categorical columns: cut, color, clarity.
            OrdinalEncoder(
                categories=[cut_categories, color_categories, clarity_categories]
            ),
        ),
        ("scaler", StandardScaler()),
    ]
)

In [11]:
# Send each column group through its own pipeline; the transformed outputs
# are concatenated in the order the branches are listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", numerical_pipeline, numerical_feature),
        ("categorical_pipeline", categorical_pipeline, categorical_feature),
    ]
)