Using the Z-score (Standard Score)

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import zscore

# Sample dataset: eight small readings plus two obviously extreme ones (120, 200).
data = {'values': [10, 12, 15, 14, 13, 120, 11, 9, 8, 200]}
df = pd.DataFrame(data)

# Classic z-score (scipy's default uses the population std, ddof=0). Kept for reference.
df['z_score'] = zscore(df['values'])

# Why the classic rule "|z| > 3" is not used for flagging here:
# for n observations the population z-score is bounded by sqrt(n - 1), which is
# exactly 3 for n = 10, so a strict "> 3" test can never be true on this dataset.
# The extreme points also inflate the mean and std they are measured against
# (200 only reaches |z| of about 2.56).
#
# The modified z-score replaces mean/std with median/MAD, which the extreme points
# barely influence. 0.6745 rescales the MAD so the score is comparable to a z-score
# for normally distributed data; 3.5 is the commonly recommended cut-off.
MODIFIED_Z_THRESHOLD = 3.5
values_median = df['values'].median()
values_mad = (df['values'] - values_median).abs().median()

# A MAD of 0 (more than half of the values identical) would divide by zero;
# use NaN as the scale in that case so every score is NaN and nothing is flagged.
mad_scale = values_mad if values_mad > 0 else np.nan
df['modified_z_score'] = 0.6745 * (df['values'] - values_median) / mad_scale

# Identify outliers (absolute modified z-score above the threshold)
outliers = df[df['modified_z_score'].abs() > MODIFIED_Z_THRESHOLD]
print(outliers)


Using the IQR (Interquartile Range)

In [None]:
# Quartiles of the 'values' column and the spread between them
Q1, Q3 = (df['values'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences sit 1.5 * IQR below the first quartile and above the third
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows strictly outside the fences are reported as outliers
outliers = df.loc[df['values'].lt(lower_bound) | df['values'].gt(upper_bound)]
print(outliers)

Using Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the 'values' column, laid out along the x-axis
sns.boxplot(data=df, x='values')
plt.show()

Handling Outliers

Removing Outliers

In [None]:
# Keep only the rows whose value lies within the fences (both ends inclusive)
df_no_outliers = df.loc[df['values'].between(lower_bound, upper_bound)]
print(df_no_outliers)

Replacing Outliers (Winsorization)

In [None]:
# Winsorize: cap values at the IQR fences instead of dropping the rows.
# The capped data goes into its own column rather than overwriting 'values',
# so the raw readings stay available to the other outlier-handling cells
# (overwriting them would leave the later imputation cell with nothing to impute).
# clip() leaves in-range values and NaN untouched.
df['values_winsorized'] = df['values'].clip(lower=lower_bound, upper=upper_bound)

Transforming Data (Log, Square Root, Box-Cox)

In [None]:
# Natural log of (value + 1); the +1 shift keeps a zero value from hitting log(0)
df['log_values'] = np.log(df['values'].add(1))

Imputing Missing Values for Outliers

In [None]:
# Treat the rows flagged in `outliers` as missing, then fill them with the median
# of the remaining values.
#
# Series.replace() raises ValueError when `to_replace` is dict-like (a Series is)
# and the replacement is a scalar, so the flagged rows are blanked out by index
# label instead. Matching on labels also still finds the rows if their values were
# modified after `outliers` was computed.
# Assumes df's index is unchanged since `outliers` was sliced from it.
is_flagged = df.index.isin(outliers.index)
values_with_gaps = df['values'].mask(is_flagged)  # flagged outliers -> NaN

# median() skips NaN, so the fill value is computed from the non-flagged rows only.
# The result is stored in a new column so the raw 'values' column is preserved.
df['values_imputed'] = values_with_gaps.fillna(values_with_gaps.median())