In [176]:
import pandas as pd
data = {
    'Age': [25, 30, None, 40, 45, None, 30, 55],
    'Salary': [50000, 60000, 70000, None, 90000, 100000, 55000, None],
    'Department': ['HR', 'Tech', 'Tech', None, 'Finance', 'HR', 'Tech', 'Finance'],
    'Target': [0, 1, 1, 0, 0, 0, 1, 1]
}
df = pd.DataFrame(data)

Drop rows with missing values in the Department column.

Fill missing Age values with the median and Salary with the mean.

In [177]:
df

Unnamed: 0,Age,Salary,Department,Target
0,25.0,50000.0,HR,0
1,30.0,60000.0,Tech,1
2,,70000.0,Tech,1
3,40.0,,,0
4,45.0,90000.0,Finance,0
5,,100000.0,HR,0
6,30.0,55000.0,Tech,1
7,55.0,,Finance,1


In [178]:
df.isna().sum()

Age           2
Salary        2
Department    1
Target        0
dtype: int64

In [179]:
df.dropna(subset='Department', inplace=True)

In [180]:
df

Unnamed: 0,Age,Salary,Department,Target
0,25.0,50000.0,HR,0
1,30.0,60000.0,Tech,1
2,,70000.0,Tech,1
4,45.0,90000.0,Finance,0
5,,100000.0,HR,0
6,30.0,55000.0,Tech,1
7,55.0,,Finance,1


In [181]:
df['Age']=df['Age'].fillna(df['Age'].median())

In [182]:
df

Unnamed: 0,Age,Salary,Department,Target
0,25.0,50000.0,HR,0
1,30.0,60000.0,Tech,1
2,30.0,70000.0,Tech,1
4,45.0,90000.0,Finance,0
5,30.0,100000.0,HR,0
6,30.0,55000.0,Tech,1
7,55.0,,Finance,1


In [183]:
df['Salary']=df['Salary'].fillna(round(df['Salary'].mean(),2))

In [184]:
df

Unnamed: 0,Age,Salary,Department,Target
0,25.0,50000.0,HR,0
1,30.0,60000.0,Tech,1
2,30.0,70000.0,Tech,1
4,45.0,90000.0,Finance,0
5,30.0,100000.0,HR,0
6,30.0,55000.0,Tech,1
7,55.0,70833.33,Finance,1


Encode Categorical Features

Perform One-Hot Encoding on the Department column.

In [185]:
df

Unnamed: 0,Age,Salary,Department,Target
0,25.0,50000.0,HR,0
1,30.0,60000.0,Tech,1
2,30.0,70000.0,Tech,1
4,45.0,90000.0,Finance,0
5,30.0,100000.0,HR,0
6,30.0,55000.0,Tech,1
7,55.0,70833.33,Finance,1


In [186]:
df['Department'].value_counts()

Department
Tech       3
HR         2
Finance    2
Name: count, dtype: int64

In [187]:
df=pd.concat([df, pd.get_dummies(df['Department'], drop_first=True).astype(int)], axis=1)

In [188]:
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder()
encoded=encoder.fit_transform(df[['Department']]).toarray()
pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

Unnamed: 0,Department_Finance,Department_HR,Department_Tech
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
5,0.0,0.0,1.0
6,1.0,0.0,0.0


In [189]:
df

Unnamed: 0,Age,Salary,Department,Target,HR,Tech
0,25.0,50000.0,HR,0,1,0
1,30.0,60000.0,Tech,1,0,1
2,30.0,70000.0,Tech,1,0,1
4,45.0,90000.0,Finance,0,0,0
5,30.0,100000.0,HR,0,1,0
6,30.0,55000.0,Tech,1,0,1
7,55.0,70833.33,Finance,1,0,0


In [190]:
df.drop(columns='Department', inplace=True)

In [191]:
df

Unnamed: 0,Age,Salary,Target,HR,Tech
0,25.0,50000.0,0,1,0
1,30.0,60000.0,1,0,1
2,30.0,70000.0,1,0,1
4,45.0,90000.0,0,0,0
5,30.0,100000.0,0,1,0
6,30.0,55000.0,1,0,1
7,55.0,70833.33,1,0,0


Outlier Treatment

Use the IQR method to detect and cap outliers in the Salary column.

In [192]:
Q1=df['Salary'].quantile(0.25)
Q3=df['Salary'].quantile(0.75)

IQR=Q3-Q1

lower_fence=Q1-1.5*IQR
upper_fence=Q3+1.5*IQR

In [193]:
lower_fence, upper_fence

(23125.002499999988, 114791.66250000002)

In [194]:
import numpy as np

In [195]:
df['salary_capped']=np.where(df['Salary']<lower_fence, lower_fence, np.where(df['Salary']>upper_fence, upper_fence, df.Salary))

In [196]:
df

Unnamed: 0,Age,Salary,Target,HR,Tech,salary_capped
0,25.0,50000.0,0,1,0,50000.0
1,30.0,60000.0,1,0,1,60000.0
2,30.0,70000.0,1,0,1,70000.0
4,45.0,90000.0,0,0,0,90000.0
5,30.0,100000.0,0,1,0,100000.0
6,30.0,55000.0,1,0,1,55000.0
7,55.0,70833.33,1,0,0,70833.33


4. Data Interpolation

Use linear interpolation to fill missing values in the Age column.

Handling Imbalanced Data

Apply SMOTE to balance the Target column (assume Target is the class label).

In [197]:
df

Unnamed: 0,Age,Salary,Target,HR,Tech,salary_capped
0,25.0,50000.0,0,1,0,50000.0
1,30.0,60000.0,1,0,1,60000.0
2,30.0,70000.0,1,0,1,70000.0
4,45.0,90000.0,0,0,0,90000.0
5,30.0,100000.0,0,1,0,100000.0
6,30.0,55000.0,1,0,1,55000.0
7,55.0,70833.33,1,0,0,70833.33


In [198]:
import pandas as pd
data = {
    'Age': [25, 30, None, 40, 45, None, 30, 55],
    'Salary': [50000, 60000, 70000, None, 90000, 100000, 55000, None],
    'Department': ['HR', 'Tech', 'Tech', None, 'Finance', 'HR', 'Tech', 'Finance'],
    'Target': [0, 1, 1, 0, 0, 0, 1, 1]
}
df = pd.DataFrame(data)

In [199]:
df

Unnamed: 0,Age,Salary,Department,Target
0,25.0,50000.0,HR,0
1,30.0,60000.0,Tech,1
2,,70000.0,Tech,1
3,40.0,,,0
4,45.0,90000.0,Finance,0
5,,100000.0,HR,0
6,30.0,55000.0,Tech,1
7,55.0,,Finance,1


In [200]:
x=df['Age']

In [201]:
df['Age']=df['Age'].interpolate(method='linear')

In [202]:
df

Unnamed: 0,Age,Salary,Department,Target
0,25.0,50000.0,HR,0
1,30.0,60000.0,Tech,1
2,35.0,70000.0,Tech,1
3,40.0,,,0
4,45.0,90000.0,Finance,0
5,37.5,100000.0,HR,0
6,30.0,55000.0,Tech,1
7,55.0,,Finance,1


In [203]:
df['Target'].value_counts()

Target
0    4
1    4
Name: count, dtype: int64

In [204]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: C:\Users\91830\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [205]:
from imblearn.over_sampling import SMOTE
X = df.drop('Target', axis=1)
y = df['Target']
smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)

ValueError: could not convert string to float: 'HR'