Import Required Libraries & Load the Dataset

In [None]:
import pandas as pd
import numpy as np
from tabulate import tabulate

# Location of the raw employee CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists; consider a path relative to the notebook.
file_path = r"C:\Users\Mohamed Salah\Desktop\Final_Project\Employee.csv"

# Load the dataset, decoding the file as UTF-8
df = pd.read_csv(file_path, encoding="utf-8")

# Display basic information about the dataset
print("Dataset Overview:")
# DataFrame.info() writes its report (columns, non-null counts, dtypes) to stdout
# itself and returns None, so it is called directly: wrapping it in print() would
# append a stray "None" line. It is also called only once.
df.info()
print(df.head())  # Shows the first few rows of the dataset

# Summary report helper: row/column totals, null counts, and column dtypes
def data_summary(df):
    """Print a short structural report about ``df`` to standard output.

    The report lists the number of rows and columns, the count of null
    entries in each column, and the dtype of each column, framed by
    40-character dashed rules.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to describe. It is only read, never modified.

    Returns
    -------
    None
    """
    n_rows, n_cols = df.shape
    rule = "-" * 40
    null_counts = df.isnull().sum()
    column_types = df.dtypes

    print("Dataset Summary Report")
    print(rule)
    print(f"Total Rows: {n_rows}")
    print(f"Total Columns: {n_cols}")
    # Two positional arguments: print() joins them with its default single-space separator
    print("\nMissing Values per Column:\n", null_counts)
    print("\nData Types:\n", column_types)
    print(rule)

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   EmployeeID               1470 non-null   object
 1   FirstName                1470 non-null   object
 2   LastName                 1470 non-null   object
 3   Gender                   1470 non-null   object
 4   Age                      1470 non-null   int64 
 5   BusinessTravel           1470 non-null   object
 6   Department               1470 non-null   object
 7   DistanceFromHome (KM)    1470 non-null   int64 
 8   State                    1470 non-null   object
 9   Ethnicity                1470 non-null   object
 10  Education                1470 non-null   int64 
 11  EducationField           1470 non-null   object
 12  JobRole                  1470 non-null   object
 13  MaritalStatus            1470 non-null   object
 14  Salary                

Check for Missing Values

In [5]:
# Count the null entries in every column once; the result is reused below
missing_values = df.isnull().sum()

# Show the per-column null counts before any handling takes place
print("Missing values before handling:")
print(missing_values)

# Summarise: a grand total of zero means no column contains a null entry
print("Checking for missing values")
if missing_values.sum() != 0:
    print("Missing values found:")
    print(missing_values)
else:
    print("No missing values found in the dataset.")

Missing values before handling:
EmployeeID                 0
FirstName                  0
LastName                   0
Gender                     0
Age                        0
BusinessTravel             0
Department                 0
DistanceFromHome (KM)      0
State                      0
Ethnicity                  0
Education                  0
EducationField             0
JobRole                    0
MaritalStatus              0
Salary                     0
StockOptionLevel           0
OverTime                   0
HireDate                   0
Attrition                  0
YearsAtCompany             0
YearsInMostRecentRole      0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64
Checking for missing values
No missing values found in the dataset.


Remove Duplicate Rows & Standardize Column Names

In [None]:
# Check for fully duplicated rows (rows equal to an earlier row in every column)
print("\nChecking for duplicate rows...")
duplicate_rows = df.duplicated().sum()

if duplicate_rows == 0:
    print("No duplicate rows found.")
else:
    print(f"Number of duplicate rows found: {duplicate_rows}")
    # Rebind to the de-duplicated frame rather than mutating in place. This is the
    # only place duplicates are dropped; a second unconditional drop is not needed
    # because both outcomes are already handled by this if/else.
    df = df.drop_duplicates()
    print("Duplicate rows removed.")

# Standardize column names: trim surrounding whitespace, lowercase, and replace
# spaces with underscores. Only spaces are replaced, so other punctuation is kept
# (e.g. "DistanceFromHome (KM)" becomes "distancefromhome_(km)").
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

print("Duplicates removed & column names standardized.")


Checking for duplicate rows...
No duplicate rows found.
✅ Duplicates removed & column names standardized.


Remove Outliers in Salary Column Using IQR Method

In [10]:
# Remove outliers in Salary column using IQR method

# Multiplier applied to the IQR to place the lower and upper outlier fences
IQR_MULTIPLIER = 1.5

# Step 1: Check if the 'salary' column exists in the DataFrame.
# The lowercase name only exists after the column names have been standardized.
if 'salary' in df.columns:

    # Step 2: Calculate the first quartile (Q1) and third quartile (Q3) for the 'salary' column
    Q1 = df['salary'].quantile(0.25)  # 25th percentile (lower quartile)
    Q3 = df['salary'].quantile(0.75)  # 75th percentile (upper quartile)

    # Step 3: Compute the Interquartile Range (IQR), the spread of the middle 50% of the data
    IQR = Q3 - Q1

    # Step 4: Set the lower and upper boundaries for acceptable values;
    # values outside these boundaries are treated as outliers
    lower_bound = Q1 - IQR_MULTIPLIER * IQR
    upper_bound = Q3 + IQR_MULTIPLIER * IQR

    # Step 5: Keep only rows whose 'salary' lies within the boundaries.
    # Series.between is inclusive on both ends by default, matching >= and <=.
    # NOTE(review): the quartiles are recomputed from the current df, so running
    # this cell again on already-filtered data can remove further rows.
    df = df[df['salary'].between(lower_bound, upper_bound)]

    # Step 6: Print the range for reference
    print(f"Salary outliers removed. Valid range: {lower_bound} to {upper_bound}")
else:
    # Make the skip visible: without this branch a missing column (for example
    # when the column names were not standardized first) leaves outliers in
    # place with no indication that nothing happened.
    print("Column 'salary' not found; salary outlier removal skipped.")

Salary outliers removed. Valid range: -104132.375 to 289768.625


In [11]:
# Write the current DataFrame to a new CSV file in the working directory;
# index=False leaves the row index out of the written file
cleaned_csv_name = 'Cleaned_Employee.csv'
df.to_csv(cleaned_csv_name, index=False)