1. Imports 

In [1]:
import pandas as pd 
import numpy as np


2. Load DataSet 

In [2]:
df = pd.read_csv('employees.csv')
df

Unnamed: 0,Name,Age,Gender,Salary,Department
0,Ali,25.0,Male,50000.0,IT
1,Zara,30.0,Female,,HR
2,Usman,,Male,60000.0,Finance
3,Maria,28.0,Female,52000.0,
4,Ahmed,35.0,,58000.0,IT
5,Ayesha,22.0,Female,48000.0,Marketing


3. Handle Missing Values 

Detect Missing values

In [3]:
# missing values each column has
print(df.isnull().sum)

<bound method DataFrame.sum of     Name    Age  Gender  Salary  Department
0  False  False   False   False       False
1  False  False   False    True       False
2  False   True   False   False       False
3  False  False   False   False        True
4  False  False    True   False       False
5  False  False   False   False       False>


 Handle Missing Numerical Columns

In [4]:
# e.g Age, Salary with Mean and median
df['Age'] = df['Age'].fillna(df['Age'].mean())

df['Salary'] = df['Salary'].fillna(df['Salary'].median())

Handle Missing Categorical Columns

In [5]:
# Gender, Department with mode
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])

df['Department'] = df['Department'].fillna(df['Department'].mode()[0])

Drop Remaining Columns

In [6]:
df.dropna(inplace=True)
print('\n Missing values after Handling:')
print(df.isnull().sum())


 Missing values after Handling:
Name          0
Age           0
Gender        0
Salary        0
Department    0
dtype: int64


In [7]:
df

Unnamed: 0,Name,Age,Gender,Salary,Department
0,Ali,25.0,Male,50000.0,IT
1,Zara,30.0,Female,52000.0,HR
2,Usman,28.0,Male,60000.0,Finance
3,Maria,28.0,Female,52000.0,IT
4,Ahmed,35.0,Female,58000.0,IT
5,Ayesha,22.0,Female,48000.0,Marketing


4. Encoding Categorical Variables

In [8]:
# convert gender and department into numeric format
# convert text to numbers

df_encoded = pd.get_dummies(df, columns=['Gender', 'Department'], drop_first=True)
print(df_encoded)

     Name   Age   Salary  Gender_Male  Department_HR  Department_IT  \
0     Ali  25.0  50000.0         True          False           True   
1    Zara  30.0  52000.0        False           True          False   
2   Usman  28.0  60000.0         True          False          False   
3   Maria  28.0  52000.0        False          False           True   
4   Ahmed  35.0  58000.0        False          False           True   
5  Ayesha  22.0  48000.0        False          False          False   

   Department_Marketing  
0                 False  
1                 False  
2                 False  
3                 False  
4                 False  
5                  True  


5. Normalize or Standardize Numeric Features 

In [9]:
from sklearn.preprocessing import StandardScaler

# Select numeric columns only
numeric_cols = ['Age', 'Salary']
scaler = StandardScaler()

# Replace age and salary with scaled values
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

6. Train-Test Split 

In [10]:
from sklearn.model_selection import train_test_split

# X and y Features
X = df_encoded.drop('Salary', axis=1)  #input columns (everything except salary)
y = df_encoded['Salary']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

# Show Shapes
print("Training Set size", X_train.shape)
print("Test set size", X_test.shape)

Training Set size (4, 6)
Test set size (2, 6)


In [11]:
df_encoded.to_csv('cleaned_data.csv', index=False)