In [1]:
import pandas as pd
import numpy as np

# Sample dataset with missing values, duplicates, and outliers
data = {
    'Age': [25, 30, np.nan, 40, 150, 60, 22, 1000, 30, 25],
    'Salary': [50000, 60000, 55000, 70000, 1000000, 75000, 80000, 100000, 60000, 50000],
    'Experience': [1, 2, 3, 4, 50, 6, 1, 100, np.nan, 2]
}

# Convert the dictionary into a pandas DataFrame
df = pd.DataFrame(data)

# Display the original dataset
print("Original DataFrame:")
print(df)

# Step 1: Handle Missing Values
def handle_missing_values(df):
    # Fill numeric columns with the mean
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Experience'] = df['Experience'].fillna(df['Experience'].mean())
    
    # Fill categorical columns (if any) with the mode (not applicable here)
    # Example: df['Column_Name'] = df['Column_Name'].fillna(df['Column_Name'].mode()[0])
    
    return df

# Step 2: Remove Duplicates
def remove_duplicates(df):
    df = df.drop_duplicates()  # Remove duplicate rows based on all columns
    return df

# Step 3: Handle Outliers using IQR method
def handle_outliers(df):
    for column in df.select_dtypes(include=[np.number]).columns:  # Process only numeric columns
        # Calculate Q1 (25th percentile) and Q3 (75th percentile)
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        
        # Calculate the IQR
        IQR = Q3 - Q1
        
        # Calculate the lower and upper bounds
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Replace outliers with the median of the respective column
        df[column] = df[column].apply(lambda x: df[column].median() if x < lower_bound or x > upper_bound else x)
    
    return df

# Step 4: Apply all data cleaning steps
def clean_data(df):
    # Handle missing values
    df = handle_missing_values(df)
    
    # Remove duplicates
    df = remove_duplicates(df)
    
    # Handle outliers
    df = handle_outliers(df)
    
    return df

# Clean the dataset
cleaned_df = clean_data(df)

# Display the cleaned dataset
print("\nCleaned DataFrame:")
print(cleaned_df)

Original DataFrame:
      Age   Salary  Experience
0    25.0    50000         1.0
1    30.0    60000         2.0
2     NaN    55000         3.0
3    40.0    70000         4.0
4   150.0  1000000        50.0
5    60.0    75000         6.0
6    22.0    80000         1.0
7  1000.0   100000       100.0
8    30.0    60000         NaN
9    25.0    50000         2.0

Cleaned DataFrame:
          Age    Salary  Experience
0   25.000000   50000.0    1.000000
1   30.000000   60000.0    2.000000
2  153.555556   55000.0    3.000000
3   40.000000   70000.0    4.000000
4  150.000000   65000.0    3.500000
5   60.000000   75000.0    6.000000
6   22.000000   80000.0    1.000000
7   35.000000  100000.0    3.500000
8   30.000000   60000.0   18.777778
9   25.000000   50000.0    2.000000


In [None]:
# Question: Data Transformation Techniques
# Objective: Transform skewed data using log transformation.
# Description: Perform a log transformation to handle skewness in a dataset, which is particularly useful for
# certain machine learning models.



In [None]:
# Question: Feature Engineering by Creating New Features
# Objective: Create a new feature based on existing features to add predictive power.
# Description: Generate additional features from existing data to potentially improve the performance of
# prediction models.




In [None]:
# Question: Handling Complex Outliers with Z-Scores
# Objective: Detect and handle outliers using Z-score method.
# Description: Use the Z-score method to identify outliers which significantly differ from the rest of the data points.




In [None]:
# Question: Data Imputation with K-Nearest Neighbors (KNN)
# Objective: Impute missing numerical values using the KNN method.
# Description: Use the K-nearest neighbors algorithm to fill in missing values, which considers the values of
# nearest neighbors for imputation.


