In [1]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.
import pandas as pd
import numpy as np

# Example DataFrame
data = {
    'Age': [25, 30, np.nan, 40, np.nan, 35, np.nan],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male']
}

df = pd.DataFrame(data)

# Step 1: Calculate mean age for each gender group
mean_ages = df.groupby('Gender')['Age'].mean()

# Step 2: Fill missing 'Age' based on 'Gender'
def fill_age(row):
    if pd.isna(row['Age']):
        return mean_ages[row['Gender']]  # Use the mean age based on gender
    else:
        return row['Age']

df['Age'] = df.apply(fill_age, axis=1)

# Display the DataFrame after filling missing values
print(df)


    Age  Gender
0  25.0    Male
1  30.0  Female
2  25.0    Male
3  40.0  Female
4  25.0    Male
5  35.0  Female
6  25.0    Male


In [3]:
# Question 2: Removing Outliers by Rescaling
# Descripimport pandas as pd
import numpy as np

# Example DataFrame with numerical data
data = {
    'Value': [10, 12, 13, 15, 14, 100, 16, 14, 13, 12, 15, 11, 200]
}

df = pd.DataFrame(data)

# Step 1: Calculate Z-scores
mean = df['Value'].mean()  # Mean of the 'Value' column
std_dev = df['Value'].std()  # Standard deviation of the 'Value' column

df['Z-score'] = (df['Value'] - mean) / std_dev  # Z-score formula

# Step 2: Remove outliers (Z-score > 3 or Z-score < -3)
threshold = 3
df_no_outliers = df[abs(df['Z-score']) <= threshold]

# Display the DataFrame before and after removing outliers
print("Original DataFrame:")
print(df)

print("\nDataFrame after removing outliers:")
print(df_no_outliers)



Original DataFrame:
    Value   Z-score
0      10 -0.438079
1      12 -0.401920
2      13 -0.383840
3      15 -0.347681
4      14 -0.365761
5     100  1.189070
6      16 -0.329602
7      14 -0.365761
8      13 -0.383840
9      12 -0.401920
10     15 -0.347681
11     11 -0.419999
12    200  2.997014

DataFrame after removing outliers:
    Value   Z-score
0      10 -0.438079
1      12 -0.401920
2      13 -0.383840
3      15 -0.347681
4      14 -0.365761
5     100  1.189070
6      16 -0.329602
7      14 -0.365761
8      13 -0.383840
9      12 -0.401920
10     15 -0.347681
11     11 -0.419999
12    200  2.997014


In [4]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.

import pandas as pd
import numpy as np

# Sample DataFrame with missing 'Age' values
data = {
    'Age': [25, 30, np.nan, 40, np.nan, 35, np.nan],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male']
}

df = pd.DataFrame(data)

# Step 1: Fill missing values in the 'Age' column with the mean age
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)

# Step 2: Convert the 'Age' column to integers
df['Age'] = df['Age'].astype(int)

# Display the DataFrame after filling missing values and converting to integers
print(df)

   Age  Gender
0   25    Male
1   30  Female
2   32    Male
3   40  Female
4   32    Male
5   35  Female
6   32    Male


In [None]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.



In [5]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.
import pandas as pd
import numpy as np

# Sample DataFrame with missing values and duplicates
data = {
    'First Name': ['Alice', 'Bob', 'Alice', 'Charlie', np.nan],
    'Age': [25, np.nan, 25, 30, 22],
    'Gender': ['Female', 'Male', 'Female', 'Male', 'Female']
}

df = pd.DataFrame(data)

# Function to clean the DataFrame
def clean_data(df, fill_value=None, columns_to_fill=None):
    """
    Cleans the DataFrame by filling missing values, removing duplicates, and standardizing column names.
    
    Parameters:
    df (DataFrame): The DataFrame to clean.
    fill_value (any): Value to fill missing values (optional). Default is None.
    columns_to_fill (list): List of column names to apply filling (optional).
    
    Returns:
    DataFrame: The cleaned DataFrame.
    """
    
    # Step 1: Standardize column names (convert to lowercase and replace spaces with underscores)
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    
    # Step 2: Fill missing values (for specified columns or all columns)
    if columns_to_fill is None:
        df.fillna(fill_value, inplace=True)  # Fill all missing values with fill_value
    else:
        for col in columns_to_fill:
            df[col].fillna(fill_value, inplace=True)
    
    # Step 3: Remove duplicates
    df.drop_duplicates(inplace=True)
    
    return df

# Use the function to clean the DataFrame
df_cleaned = clean_data(df, fill_value=30, columns_to_fill=['Age'])

# Display the cleaned DataFrame
print(df_cleaned)


KeyError: 'Age'