# Task 1 Part 2: Data preparation
In this task we will use functions to clean and fix the data!


In [None]:
from Validations import *
import pandas as pd
from ydata_profiling import ProfileReport
import warnings

# Ignore FutureWarning messages raised from modules matching 'seaborn' or 'pandas'
# so they do not clutter the notebook output.
warnings.filterwarnings(action='ignore', category=FutureWarning, module=r'seaborn|pandas')
# NOTE(review): `os` is not imported in this cell — presumably it is made available by
# `from Validations import *`; confirm, or add an explicit `import os`.
# Sets the TF_CPP_MIN_LOG_LEVEL environment variable to '2' (presumably to quiet
# TensorFlow's native logging — verify that TensorFlow is actually used downstream).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
def remove_bad_samples(df, column_name):
    """
    Drop, in place, every row whose value in the given column is null.

    Rows are selected with ``DataFrame.dropna`` (a row mask) rather than by
    index label, so only the rows that are actually null are removed even when
    the DataFrame index contains duplicate labels.

    A short summary of what happened is printed; nothing is returned.

    :param df: pandas DataFrame containing the dataset (modified in place).
    :param column_name: String name of the column to check for null values.
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist in the DataFrame.")
        return

    initial_count = len(df)
    # dropna filters by mask, not by label: rows that merely share an index
    # label with a null row are kept.
    df.dropna(subset=[column_name], inplace=True)

    num_removed = initial_count - len(df)
    if num_removed > 0:
        print(f"Removed {num_removed} samples with null values in '{column_name}'.")
    else:
        print(f"No null values found in '{column_name}'.")

def fix_vlues(df, fill_strategy):
    """
    Fill null values in the DataFrame in place, column by column.

    Supported strategies:

    * ``"mean"`` / ``"median"`` - each null is replaced by the mean/median of
      the rows that share the same ``sex`` and age group. Age groups come from
      ``pd.cut`` with right-closed bins, so rows whose age is null or outside
      (0, 120] belong to no group and are left unfilled.
    * ``"zero"`` - every null in the column is replaced by 0.

    Columns that do not exist and strategies that are not recognised are
    reported with a printed message and skipped. The helper ``age_group``
    column is always removed again, even if filling fails part-way.

    :param df: pandas DataFrame containing the dataset; must have ``sex`` and
        ``age`` columns (modified in place).
    :param fill_strategy: Dictionary mapping column name -> strategy name.
    """
    # Define age groups for segmentation
    age_bins = [0, 18, 30, 40, 50, 60, 70, 80, 120]  # Example age groups
    df['age_group'] = pd.cut(df['age'], bins=age_bins)

    try:
        for column, strategy in fill_strategy.items():
            if column not in df.columns:
                print(f"Column '{column}' does not exist in the DataFrame.")
                continue

            if strategy in ("mean", "median"):
                # Build the groups per column so the statistics are computed
                # from the current state of the frame.
                for (sex, age_group), group_df in df.groupby(['sex', 'age_group'], observed=True):
                    value_to_fill = group_df[column].agg(strategy)

                    # Only touch the nulls that belong to this sex/age group.
                    missing_in_group = (
                        (df['sex'] == sex)
                        & (df['age_group'] == age_group)
                        & df[column].isnull()
                    )
                    df.loc[missing_in_group, column] = value_to_fill
            elif strategy == "zero":
                # Assign back instead of `df[column].fillna(..., inplace=True)`:
                # the chained in-place form acts on a temporary object and does
                # not update `df` under pandas Copy-on-Write.
                df[column] = df[column].fillna(0)
            else:
                print(f"Unknown fill strategy '{strategy}' for column '{column}'; column left unchanged.")
    finally:
        # Clean up the temporary 'age_group' column even when an error occurred,
        # so the caller's DataFrame never keeps the helper column.
        df.drop(columns='age_group', inplace=True)

def remove_negative_samples(df, column_name):
    """
    Delete, in place, the rows whose value in ``column_name`` is below zero.

    Prints how many rows were dropped (or that none were found). If the column
    is not part of the DataFrame, a message is printed and nothing changes.

    :param df: pandas DataFrame containing the dataset (modified in place).
    :param column_name: String name of the column to check for negative values.
    """
    # Guard clause: nothing to do when the column is unknown.
    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist in the DataFrame.")
        return

    rows_before = len(df)

    # Collect the index labels of the negative rows, then drop them.
    is_negative = df[column_name] < 0
    negative_labels = df[is_negative].index
    df.drop(index=negative_labels, inplace=True)

    dropped = rows_before - len(df)
    if dropped > 0:
        print(f"Removed {dropped} samples with negative values in '{column_name}'.")
    else:
        print(f"No negative values found in '{column_name}'.")

def remove_samples_below_threshold(df, column_name, threshold):
    """
    Delete, in place, the rows whose value in ``column_name`` is strictly
    lower than ``threshold``.

    A one-line summary is printed: the number of rows removed, that no row
    was below the threshold, or that the column does not exist.

    :param df: pandas DataFrame containing the dataset (modified in place).
    :param column_name: String name of the column to check values in.
    :param threshold: Numeric limit; rows with a smaller value are removed.
    """
    if column_name not in df.columns:
        message = f"Column '{column_name}' does not exist in the DataFrame."
    else:
        size_before = df.shape[0]

        # Index labels of the rows that fall below the threshold.
        too_low = df[df[column_name] < threshold].index
        df.drop(too_low, inplace=True)

        dropped = size_before - df.shape[0]
        if dropped > 0:
            message = f"Removed {dropped} samples with values in '{column_name}' below {threshold}."
        else:
            message = f"No samples with values in '{column_name}' below {threshold} found."

    print(message)

def remove_outliers(df, column, method="IQR"):
    """
    Cap outliers of a column in place using the interquartile-range rule.

    Despite the name, no rows are removed: values below ``Q1 - 1.5 * IQR`` are
    raised to that bound and values above ``Q3 + 1.5 * IQR`` are lowered to
    that bound (``Series.clip``), so the number of samples is unchanged.

    If the column does not exist, or ``method`` is not supported, a message is
    printed and the DataFrame is left untouched.

    :param df: pandas DataFrame containing the dataset (modified in place).
    :param column: Column to check for outliers.
    :param method: Outlier detection method; only ``"IQR"`` is implemented.
    """
    if column not in df.columns:
        print(f"Column '{column}' does not exist in the DataFrame.")
        return

    if method != "IQR":
        # Previously a silent no-op; tell the user nothing was changed.
        print(f"Outlier method '{method}' is not supported; column '{column}' left unchanged.")
        return

    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df[column] = df[column].clip(lower_bound, upper_bound)


def remove_feature(df, column_name):
    """
    Drop a feature (column) from the DataFrame in place, when it is present.

    Prints whether the column was removed or was not found.

    :param df: pandas DataFrame from which to remove the column.
    :param column_name: String name of the column to be removed.
    """
    # Guard clause: report unknown columns and leave the frame untouched.
    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist in the DataFrame.")
        return

    df.drop(columns=column_name, inplace=True)
    print(f"Column '{column_name}' has been removed.")

def convert_feature_to_numeric(df, column_name):
    """
    Convert a boolean feature, or a feature with exactly two distinct non-null
    values, into 0 and 1 (in place).

    * ``bool`` dtype columns are cast directly (False -> 0, True -> 1).
    * Columns whose two values are already boolean-like (True/False or 0/1,
      e.g. booleans stored with object dtype) keep their natural encoding
      instead of being numbered by order of appearance.
    * Any other two-valued column is numbered by order of first appearance
      (first value seen -> 0, second -> 1).

    Null entries are not counted as a category and remain null after mapping.
    Columns that are missing or do not have exactly two distinct values are
    reported with a printed message and left unchanged.

    :param df: pandas DataFrame containing the dataset (modified in place).
    :param column_name: String name of the column to convert.
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist in the DataFrame.")
        return

    series = df[column_name]

    if series.dtype == 'bool':
        # Directly map boolean to 0 and 1
        df[column_name] = series.astype(int)
        print(f"Column '{column_name}' has been converted to numeric values.")
        return

    # Build the categories from non-null values only: `unique()` on the raw
    # column would include NaN/None and produce a third code (0/1/2).
    categories = series.dropna().unique()
    if len(categories) != 2:
        print(f"Column '{column_name}' is not of a type or does not have a value count that allows for direct conversion.")
        return

    if set(categories) == {0, 1}:
        # True == 1 and False == 0, so this also covers object-dtype booleans;
        # keep the natural meaning rather than the order of appearance.
        mapping = {value: int(value) for value in categories}
    else:
        mapping = {value: code for code, value in enumerate(categories)}

    df[column_name] = series.map(mapping)
    print(f"Column '{column_name}' has been converted to numeric values using mapping: {mapping}")

In [None]:
# Load the salary dataset from the CSV file (path is relative to the notebook's working directory).
df = pd.read_csv('data/salary-dataset.csv')
# Preview 10 randomly sampled rows to get a feel for the columns and values.
df.sample(10)

## Remove bad samples
Bad samples are usually the ones that are missing data in the "label" that we want to predict, making them useless for us.

There should be one feature that is what we wish to predict.</br>
Your task:</br>
Fill in the column name and run the function.</br>


<details>
    <summary>Hint</summary>
    <p>
         What are we trying to do? What do we wish to predict?
     </p>
</details>

<details>
    <summary>Solution</summary>
    <p>
        remove_bad_samples(df,'income')
     </p>
</details>

In [None]:
# Print column dtypes and non-null counts before removing the bad samples.
df.info()

In [None]:
# Exercise placeholder: replace '---' with the name of the column we want to predict.
remove_bad_samples(df,'---')

In [None]:
# Print the summary again to compare the row / non-null counts with the previous output.
df.info()

## Removing outliers
Outliers are data points that significantly differ from other observations. They can skew the results of data analysis and lead to misleading conclusions because they may indicate variability in the data, experimental errors, or a novelty not accounted for by the model. In machine learning, they can affect the training process, resulting in a model that doesn’t perform well with the typical data it encounters.

There are many ways to handle outliers. In this case, we will remove low/negative values and then apply the IQR method. </br>
The IQR method uses the middle 50% range of data to create a “normal zone” and flags any points outside this zone as outliers. It's like drawing lines in the sand; anything outside those lines is considered unusual.</br>

There should be two features that have outlier data.</br>
Your task:</br>
Fill in the columns and run the functions.</br>

<details>
    <summary>Hint</summary>
    <p>
         One of the features shouldn't have negative values, and the other shouldn't have such low values.
     </p>
</details>

<details>
    <summary>Solution</summary>
    <p>
        remove_negative_samples(df, 'income')</br>
        remove_samples_below_threshold(df,'weight',20)</br>
        remove_outliers(df, 'income', method="IQR")</br>
        remove_outliers(df, 'weight', method="IQR")</br>
     </p>
</details>

In [None]:
# Exercise placeholders: replace each '---' with one of the two features suspected of having outliers.
# NOTE(review): plot_feature_distribution is not defined in this notebook — presumably it comes
# from `from Validations import *`; confirm.
plot_feature_distribution(df, '---')
plot_feature_distribution(df, '---')

In [None]:
# Exercise placeholders: replace each '---' with the relevant feature name.
# Drop rows with negative values in the chosen feature.
remove_negative_samples(df, '---')
# Drop rows whose value in the chosen feature is below 20.
remove_samples_below_threshold(df,'---',20)
# Clip the remaining extreme values of both features to the IQR bounds.
remove_outliers(df, '---', method="IQR")
remove_outliers(df, '---', method="IQR")

In [None]:
# Plot the same two features again (replace each '---') to compare with the plots made before cleaning.
plot_feature_distribution(df, '---')
plot_feature_distribution(df, '---')

## Fix features with null

Null values in data can be problematic because they represent missing, unknown, or unrecorded information, which can lead to inaccurate analyses, biased results, and ultimately poor decision-making. They can also disrupt the performance of many machine learning algorithms, which require complete data sets to function correctly. </br></br>
There should be 4 features with null values that need to be fixed. To fix them, we need to find these features and pick, for each one, how to handle its missing data.</br>
There are many ways to handle null values; in our case, because the null values are in numerical fields, we will use math to fill them in.

The strategies:
1. mean - put in the mean value of the feature based on sex and age.
2. median - put in the median value of the feature based on sex and age.
3. zero - put zero as the value.</br>

Your task:</br>
Add to the "fill_strategy" map the name of the features with null values as key, and the strategy as value.</br>
fill_strategy = {'feature_name':'strategy',...} </br>
<details>
    <summary>Hint</summary>
    <p>
        Think about what is more reasonable and logical to put, based on the feature!</br>
        You can't have 1.2 kids right?
     </p>
</details>

<details>
    <summary>Solution</summary>
    <p>
        fill_strategy = {'income': 'mean', "childrens":'median', "weight":'mean', "height":'mean'}
     </p>
</details>

In [None]:
# Use the non-null counts to spot which features still contain null values.
df.info()

In [None]:
# Exercise placeholders: replace each '---' with a distinct feature name.
# As written, the identical '---' keys collapse into a single dictionary entry.
fill_strategy = {'---':'median', '---':'mean', '---':'mean'}
# Fill the nulls of every listed feature using its chosen strategy.
fix_vlues(df, fill_strategy)
# NOTE(review): check_nulls_and_info is not defined in this notebook — presumably it comes
# from `from Validations import *`; confirm.
check_nulls_and_info(df)

## Feature selection
Removing features, also known as feature selection, can improve a model's performance by eliminating irrelevant or redundant data, reducing overfitting, and making the model simpler and faster to run.

There should be 3 features that are irrelevant to our goal.
Your task:</br>
Fill in the columns and run the functions.</br>

<details>
    <summary>Hint</summary>
    <p>
         These are features that will not help us with the task. Imagine someone asked you "Guess my salary!" - which of the features would be useless information?
     </p>
</details>

<details>
    <summary>Solution</summary>
    <p>
        remove_feature(df, 'name')</br>
        remove_feature(df, 'last_name')</br>
        remove_feature(df, 'id')</br>
     </p>
</details>


In [None]:
# Exercise placeholders: replace each '---' with the name of an irrelevant feature to drop.
remove_feature(df, '---')
remove_feature(df, '---')
remove_feature(df, '---')
# Print the summary to confirm which columns remain.
df.info()

## Features to numbers
Changing booleans (True/False values) to numeric values (like 0 and 1) is important, why?</br>
Some ML algorithms can only handle numerical values. Transforming booleans into numbers ensures that these algorithms can process the data without errors.

There should be 1 boolean feature.
<details>
    <summary>Solution</summary>
    <p>
        convert_feature_to_numeric(df, 'married')
     </p>
</details>

In [None]:
# Exercise placeholder: replace '---' with the name of the boolean feature to encode as 0/1.
convert_feature_to_numeric(df, '---')