In [1]:
# 1. Load dataset

import seaborn as sns

# Load seaborn's "diamonds" example dataset.
df = sns.load_dataset("diamonds")

# Preview the last five rows.
df.tail()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.5
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.7,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53939,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64


In [2]:
# Distinct values found in the 'cut' column.
df["cut"].unique()

['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [3]:
# 2. Making dependent and independent features
# Independent features: every column except 'cut'.
X = df.drop(columns="cut")

In [4]:
# Dependent feature (target): the 'cut' column.
y = df["cut"]

In [5]:
# 3. Split data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — confirm that is acceptable
# for this multi-class target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [13]:
# 4. Pipelining 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [7]:
# 5. Cat and Num columns
# Column names routed to the categorical preprocessing branch.
categorical_columns = [
    "clarity",
    "color",
]
# Column names routed to the numerical preprocessing branch.
numerical_columns = [
    "carat",
    "depth",
    "table",
    "price",
    "x",
    "y",
    "z",
]

In [8]:
# 6. Create Transformers

# Numerical branch: fill missing values with the column mean, then standardize.
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # The step keeps its original name 'scaler' so any existing parameter
        # paths still resolve, even though it encodes rather than scales.
        # handle_unknown='ignore' encodes a category that was not seen during
        # fit as an all-zero row instead of raising at transform/predict time.
        ('scaler', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [9]:
# 7. Combine Transformers
# Send each column group through its own pipeline; the numerical output comes
# first in the combined feature matrix, followed by the categorical output.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', numerical_pipeline, numerical_columns),
        ('cat_pipeline', categorical_pipeline, categorical_columns),
    ]
)

In [10]:
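# Left disabled: the preprocessor is fitted as the first step of the pipeline
# below, which selects columns by name and so needs X_train/X_test to remain DataFrames.
# X_train = preprocessor.fit_transform(X_train)
# X_test = preprocessor.transform(X_test)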

In [11]:
# 8. Create a pipeline with feature selection and model training steps
# Both forests are seeded (same seed as the train/test split) so the selected
# features, the fitted model and the resulting accuracy are repeatable across runs.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Keep only the features whose importance in a fitted random forest passes
    # SelectFromModel's default threshold.
    ('feature_selection', SelectFromModel(RandomForestClassifier(random_state=42))),
    # Final estimator, trained on the selected features.
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [12]:
# 9. Training the model
# Fits the preprocessing, feature-selection and classifier steps on the training split.
pipeline.fit(X_train, y_train)

In [14]:
# Predict on the held-out rows and report the fraction predicted correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7820347171507218