In [2]:
import pandas as pd

## Model Training

In [3]:
# Read the gemstone dataset from the project-relative CSV and preview the first rows.
df = pd.read_csv('data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
# Drop the 'id' column from the frame.
# errors='ignore' keeps this cell idempotent: `df` is rebound from itself, so
# re-running the cell without reloading the CSV would otherwise raise KeyError
# because 'id' has already been removed.
df = df.drop(columns=['id'], errors='ignore')


In [5]:
## Independent and Dependent Features
# X holds every column except the target; Y keeps the target as a
# one-column DataFrame (list selector) rather than a Series.
X = df.drop(columns=['price'])
Y = df.loc[:, ['price']]

In [6]:
# Display the target frame Y as this cell's output.
Y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [7]:
# Group the feature columns by dtype: object-typed columns go to the ordinal
# encoder, every other column goes to the scaler.
numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns


In [8]:
# Defining the custom ranking for each ordinal variable; a label's position in
# its list is the rank it is given.
# NOTE(review): on the usual diamond colour scale 'D' is the top grade, so the
# colour list appears to run best -> worst while cut and clarity run
# worst -> best. Confirm this direction is intended.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = list('DEFGHIJ')  # ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [9]:
from sklearn.impute import SimpleImputer # Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding

# Pipelines
from sklearn. pipeline import Pipeline # connecting imputer,scaler and encoder
from sklearn.compose import ColumnTransformer # grouping them

In [12]:
# Numerical Pipeline: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical Pipeline: fill missing values with the most frequent label,
# replace each label by its rank, then standardise the resulting codes.
# The `categories` lists are matched to the input columns by position, so
# their order has to follow the column order of `categorical_cols`.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories],
        ),
    ),
    ('scaler', StandardScaler()),
])

# Send each column group through its own pipeline and combine the results.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
])