<a href="https://colab.research.google.com/github/muskanalirizvi/Practice-sklearn/blob/main/ColumnTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np

# Fix the global NumPy seed so the sampled rows are the same on every run.
np.random.seed(42)

# Allowed values for each categorical column. The insertion order of this
# mapping is also the order in which the random draws happen, so it must not
# be reordered if the generated values are to stay the same.
category_options = {
    'Country': ['USA', 'Canada', 'UK', 'Germany', 'France'],
    'Gender': ['Male', 'Female', 'Other'],
    'Job_Role': ['Engineer', 'Doctor', 'Artist', 'Lawyer', 'Scientist'],
    'Education_Level': ['High School', 'Bachelors', 'Masters', 'PhD'],
}

# Draw 100 values per column from its list of options.
data = {
    column: np.random.choice(options, 100)
    for column, options in category_options.items()
}

# Assemble the sample frame and show its (rows, columns) shape.
df = pd.DataFrame(data)
df.shape

(100, 4)

In [None]:
# Preview the first rows of the generated sample frame.
df.head()

Unnamed: 0,Country,Gender,Job_Role,Education_Level
0,Germany,Other,Engineer,Bachelors
1,France,Male,Doctor,High School
2,UK,Female,Doctor,PhD
3,France,Male,Lawyer,Masters
4,France,Other,Scientist,PhD


In [3]:
# Group the rows by education level, then count the entries of every other
# column within each group.
by_education = df.groupby('Education_Level')
by_education.count()

Unnamed: 0_level_0,Country,Gender,Job_Role
Education_Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bachelors,23,23,23
High School,27,27,27
Masters,21,21,21
PhD,29,29,29


In [4]:
# How many rows fall into each country, most frequent first.
country_column = df['Country']
country_column.value_counts()

Country
Germany    26
Canada     21
France     19
USA        18
UK         16
Name: count, dtype: int64

In [5]:
# How many rows fall into each job role, most frequent first.
job_role_column = df['Job_Role']
job_role_column.value_counts()

Job_Role
Artist       26
Engineer     25
Lawyer       23
Doctor       15
Scientist    11
Name: count, dtype: int64

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [25]:
# Education level is ordered, so it is encoded as a rank using this explicit
# low-to-high category order (High School -> 0 ... PhD -> 3).
education_order = ['High School', 'Bachelors', 'Masters', 'PhD']
ordinal_step = (
    'tnf1',
    OrdinalEncoder(categories=[education_order]),
    ['Education_Level'],
)

# The remaining columns have no natural order, so they are one-hot encoded.
# drop='first' removes one indicator per column and sparse_output=False makes
# the encoder return a dense array.
onehot_step = (
    'tnf2',
    OneHotEncoder(drop='first', sparse_output=False),
    ['Country', 'Gender', 'Job_Role'],
)

# Combine both steps; any column not listed above is passed through unchanged.
transformed = ColumnTransformer(
    transformers=[ordinal_step, onehot_step],
    remainder='passthrough',
)



In [26]:
# Fit the encoders on the sample frame and encode it in the same step.
transformedData = transformed.fit_transform(df)

In [27]:
# (rows, encoded columns) of the transformed array.
transformedData.shape

(100, 11)

In [28]:
# Display the encoded array.
transformedData

array([[1., 0., 1., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.],
       [2., 0., 0., ..., 0., 0., 1.]])

In [29]:
# Ask the fitted transformer for the names of its output columns; each name is
# prefixed with the step name it came from ('tnf1__' / 'tnf2__').
feature_names = transformed.get_feature_names_out()
feature_names

array(['tnf1__Education_Level', 'tnf2__Country_France',
       'tnf2__Country_Germany', 'tnf2__Country_UK', 'tnf2__Country_USA',
       'tnf2__Gender_Male', 'tnf2__Gender_Other', 'tnf2__Job_Role_Doctor',
       'tnf2__Job_Role_Engineer', 'tnf2__Job_Role_Lawyer',
       'tnf2__Job_Role_Scientist'], dtype=object)

In [30]:
# Wrap the encoded array in a DataFrame labelled with the transformer's output
# column names. The source frame's index is reused so that every encoded row
# stays aligned with the row of `df` it was computed from, even if `df` does
# not carry the default 0..n-1 index.
# NOTE: the variable name (sic) is kept as-is because the following display
# cell refers to it.
trasnformed_Df = pd.DataFrame(transformedData, columns=feature_names, index=df.index)

In [31]:
# Display the encoded frame with its generated column names.
trasnformed_Df

Unnamed: 0,tnf1__Education_Level,tnf2__Country_France,tnf2__Country_Germany,tnf2__Country_UK,tnf2__Country_USA,tnf2__Gender_Male,tnf2__Gender_Other,tnf2__Job_Role_Doctor,tnf2__Job_Role_Engineer,tnf2__Job_Role_Lawyer,tnf2__Job_Role_Scientist
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
95,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
96,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
97,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
98,3.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
