<a href="https://colab.research.google.com/github/muskanalirizvi/Practice-sklearn/blob/main/Column_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np

# Fix the global NumPy RNG so every run draws the same synthetic data.
np.random.seed(42)

# Number of synthetic employee records to generate.
n_samples = 1000

# Allowed labels for each categorical column.
genders = ['Male', 'Female']
educations = ['High School', 'Bachelor', 'Master', 'PhD']
departments = ['HR', 'Engineering', 'Marketing', 'Sales']

# Draw the columns one at a time, in the order they appear in the frame.
# The order matters: every draw advances the shared RNG stream.
data = {}
data['Gender'] = np.random.choice(genders, n_samples, p=[0.5, 0.5])
data['Age'] = np.random.randint(22, 60, n_samples).astype(float)
data['Education'] = np.random.choice(educations, n_samples, p=[0.2, 0.4, 0.3, 0.1])
data['Salary'] = np.random.randint(30000, 120000, n_samples).astype(float)
data['Department'] = np.random.choice(departments, n_samples, p=[0.25, 0.35, 0.2, 0.2])

# Blank out a random number (0-49) of distinct positions in every column.
# NOTE: the float columns receive a real NaN, but the categorical columns are
# NumPy string arrays, so the assignment stores the literal text 'nan' there.
for col, values in data.items():
    n_missing = np.random.randint(0, 50)
    missing_indices = np.random.choice(n_samples, n_missing, replace=False)
    values[missing_indices] = np.nan

df = pd.DataFrame(data)


In [None]:
# Preview the first five rows of the generated frame.
df.head(n=5)

Unnamed: 0,Gender,Age,Education,Salary,Department
0,Male,33.0,High School,117893.0,Engineering
1,Female,37.0,Bachelor,85388.0,HR
2,Female,45.0,Master,57691.0,Sales
3,Female,40.0,Master,107960.0,Marketing
4,Male,29.0,Bachelor,114388.0,Marketing


In [5]:
# Per-column count of true NaN entries (string 'nan' labels are not counted).
df.isna().sum()

Gender         0
Age           38
Education      0
Salary        23
Department     0
dtype: int64

In [8]:
# Frequency of each Gender label; missing entries show up as the string 'nan'.
df.loc[:, 'Gender'].value_counts()

Gender
Male      487
Female    483
nan        30
Name: count, dtype: int64

In [9]:
# Frequency of each Education label; missing entries show up as the string 'nan'.
df.loc[:, 'Education'].value_counts()

Education
Bachelor       384
Master         315
High School    205
PhD             95
nan              1
Name: count, dtype: int64

In [10]:
# Frequency of each Department label; missing entries show up as the string 'nan'.
df.loc[:, 'Department'].value_counts()

Department
Engineering    351
HR             260
Marketing      204
Sales          177
nan              8
Name: count, dtype: int64

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [34]:
# Preprocess each group of columns with its own transformer.  Columns are
# selected by name rather than by position, so the mapping stays correct even
# if the column order of the input frame changes.
transformed = ColumnTransformer(
    transformers=[
        # Numeric columns: fill NaN using SimpleImputer's default strategy (column mean).
        ('tnf1', SimpleImputer(), ['Age', 'Salary']),
        # Education is ordered.  The literal string 'nan' (how missing entries
        # appear in this column) is listed first, so it is encoded as 0.
        ('tnf2', OrdinalEncoder(categories=[['nan', 'High School', 'Bachelor', 'Master', 'PhD']]), ['Education']),
        # Nominal columns: one-hot encode, dropping the first category of each.
        ('tnf3', OneHotEncoder(drop='first', sparse_output=False), ['Gender', 'Department'])
    ],
    # All five columns are assigned above, so nothing is left to pass through.
    remainder='passthrough'
)


In [37]:
# Fit every sub-transformer on df and apply it in a single step; output
# columns follow the order in which the transformers are listed.
transformed_data = transformed.fit_transform(df)

In [41]:
# Names of the output columns, each prefixed with its transformer's name
# (e.g. 'tnf1__Age', 'tnf3__Gender_Male').
column_name = transformed.get_feature_names_out()

In [42]:
# Wrap the transformed values in a DataFrame labelled with the generated feature names.
Transformed_df = pd.DataFrame(data=transformed_data, columns=column_name)

In [44]:
# Display the transformed frame (imputed numeric columns plus encoded categoricals).
Transformed_df

Unnamed: 0,tnf1__Age,tnf1__Salary,tnf2__Education,tnf3__Gender_Male,tnf3__Gender_nan,tnf3__Department_HR,tnf3__Department_Marketing,tnf3__Department_Sales,tnf3__Department_nan
0,33.0,117893.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,37.0,85388.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
2,45.0,57691.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0
3,40.0,107960.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0
4,29.0,114388.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
995,51.0,68225.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0
996,53.0,42453.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0
997,29.0,48814.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
998,31.0,48263.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
