# 2.0 Data Preprocessing
This notebook cleans and preprocesses the raw dataset.

## Imports and loading
Import necessary packages and load the raw data.

In [None]:
import numpy as np
import pandas as pd
from dateutil import parser
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Read the raw dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/raw/<your_data>.csv')

## Delete Unwanted Features
If you already know you don't need certain features, remove them before the preprocessing.

In [None]:
# All variants below are commented out; uncomment the one you need and
# replace the placeholder names/indices with your own.

# Drop columns by names
#columns_to_drop = ['<Column1>', '<Column3>']
#df = df.drop(columns=columns_to_drop, axis=1)

# Drop columns by index (position of a single column, starting at 0)
#index = 0
#df = df.drop(df.columns[index], axis=1)

# Drop columns by index range (end_index is inclusive, hence the "+ 1" in the slice)
#start_index = 0
#end_index = 1
#df = df.drop(df.columns[start_index:end_index + 1], axis=1)

## Convert Categorical Features
The subsequent preprocessing steps require numeric features only (no strings), so convert all categorical features to numeric ones before continuing.

### Label Encoding
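Label Encoding is suitable when the categorical values have an ordinal relationship, meaning there is a meaningful order among the categories. Each category is assigned a unique numerical label. The labels are assigned in ascending order based on the alphabetical or numerical order of the category values; note that this order does not necessarily match the meaningful order of the categories (e.g. "high" < "low" < "medium" alphabetically).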

In [None]:
# Define the columns which should be encoded
cols_cat = ['<Column 1>', '<Column 3>']


# Label-encode each selected column. The encoder is fitted on the non-missing
# values only, so missing values stay NaN and do not take up a label of their
# own (which would leave a gap in the label sequence).
for col in cols_cat:
    # Step 1: Mark the rows that hold an actual category (not NaN/None)
    present = df[col].notna().to_numpy()

    # Step 2: Start from an all-NaN result so missing values remain missing
    encoded = np.full(len(df), np.nan)

    # Step 3: Encode the non-missing values; labels follow the sorted order
    # of the string representation of the categories
    if present.any():
        encoded[present] = LabelEncoder().fit_transform(df.loc[present, col].astype(str))

    # Step 4: Store integer labels when nothing is missing; otherwise the
    # column must stay float because NaN cannot be stored in an integer column
    df[col] = encoded.astype(np.int64) if present.all() else encoded

### One Hot Encoding
One-Hot Encoding is suitable when the categorical values are nominal, meaning there is no inherent order among the categories. Each category is represented by a binary column (0 or 1) in a new matrix. The column corresponding to the category is marked with a 1, and others are marked with 0.

In [None]:
# Define and select the columns which should be encoded
cols_cat = ['<Column 1>', '<Column 3>']
df_cat = df[cols_cat]

# Define the One Hot Encoder
# drop='first' omits the first category of every feature
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Encode the selected columns
# (astype(str) turns missing values into the string 'nan', i.e. a category of their own)
df_cat_encoded = encoder.fit_transform(df_cat.astype(str)).toarray()

# Save the result in a dataframe that shares the row index of the original data
df_encoded = pd.DataFrame(
    df_cat_encoded,
    index=df_cat.index,
    columns=encoder.get_feature_names_out(df_cat.columns),
)

# Delete the old features
df = df.drop(columns=cols_cat)

# Concat the onehot features back to the data
df = pd.concat([df, df_encoded], axis=1)

### Date Encoding
Encode cyclic data using sine and cosine functions.

In [None]:
# Define and select the columns which should be encoded
date_cols = ['<Column 1>']

# Length of one full cycle for every cyclic component. Fixed periods are used
# instead of the maximum observed in the data: the observed maximum maps e.g.
# hour 0 and hour 23 onto the same point and is 0 for date-only values, which
# would cause a division by zero.
# 'day' uses 31 as an approximation, since month lengths vary.
periods = {
    'month': 12,
    'day': 31,
    'hour': 24,
    'minute': 60,
    'second': 60,
    'microsecond': 1_000_000,
}

for col in date_cols:
    # Parse the date format; values that are not strings are passed through
    parsed = df[col].apply(lambda x: parser.parse(x) if isinstance(x, str) else x)

    # Ensure a datetime dtype so that the .dt accessor is available
    dates = pd.to_datetime(parsed)

    # Encode year linearly
    df[col + ' year'] = dates.dt.year

    # Encode other components using sine and cosine functions
    for comp, period in periods.items():
        angle = 2 * np.pi * getattr(dates.dt, comp) / period
        df[col + ' ' + comp + ' sin'] = np.sin(angle)
        df[col + ' ' + comp + ' cos'] = np.cos(angle)

# Remove the original date columns
df = df.drop(columns=date_cols)

### Already Numeric
Some columns already contain numeric values that are stored as strings. Convert them to a numeric type. If the numbers use the German notation (decimal comma), replace the comma with a point before converting.

In [None]:
def convert_column_comma_and_set_type_float(col: pd.Series) -> pd.Series:
    """
    Converts a Pandas Series containing numeric strings with commas to float values.

    A comma is interpreted as the decimal separator (German notation). When a
    string contains a comma, points are treated as thousands separators and
    removed, e.g. '1.234,56' -> 1234.56. Strings without a comma are left
    untouched, so '1.5' is read as 1.5. Values that are not strings (e.g. NaN
    or numbers) are passed through unchanged.

    Parameters:
    - col: The input Pandas Series containing numeric strings.

    Returns:
    - The converted Pandas Series with values converted to float.

    Raises:
    - ValueError: If a value cannot be interpreted as a float.
    """
    def _normalize(value):
        # Only strings need rewriting; NaN, None and numbers are handled by astype
        if not isinstance(value, str):
            return value

        # NOTE(review): a lone '.' was mapped to 0.0 by the previous
        # implementation; kept for compatibility - confirm this is intended.
        if value.strip() == '.':
            return '0.0'

        # German notation: drop thousands separators, then use '.' as decimal point
        if ',' in value:
            return value.replace('.', '').replace(',', '.')

        return value

    # Normalize every element, then convert the column to the float type
    return col.map(_normalize).astype(float)

In [None]:
# Define the columns which should be converted to numeric values
cols_cat = ['<Column 1>', '<Column 3>']

# Loop through each selected column
for col in cols_cat:
    # Columns that are already numeric have no string values to inspect
    # (the .str accessor would raise on them), so nothing has to be done
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    # Check if the column contains a literal '.' or ',' in any of its values.
    # regex=False is required because '.' as a regular expression matches any
    # character; na=False makes missing values count as "not found".
    has_point = df[col].str.contains('.', regex=False, na=False).any()
    has_comma = df[col].str.contains(',', regex=False, na=False).any()

    if has_point or has_comma:
        # If yes, apply the custom function to convert the column to float
        df[col] = convert_column_comma_and_set_type_float(df[col])
    else:
        # If no '.', ',' found, use pd.to_numeric to convert the column to numeric
        df[col] = pd.to_numeric(df[col])

## Fill Missing Values
Further operations require a dataset without missing values. So fill all missing values before continuing.

In [None]:
# Define the imputer (replaces every NaN with the mean of its column)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Columns without a single observed value have no mean. Drop them explicitly
# so that the imputed array is guaranteed to have one column per remaining
# DataFrame column.
df = df.dropna(axis=1, how='all')

# Apply the imputation to the dataset. fit_transform returns a plain array,
# so the column names and the row index are restored afterwards.
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

## Detect and Remove Outliers
Outlier detection is important in various fields and applications because outliers, which are data points that significantly differ from the majority of the data, can have a significant impact on the analysis, interpretation and performance of statistical and machine learning models.

In [None]:
# Create an Isolation Forest outlier detector with 100 estimators
detector = IsolationForest(n_estimators=100)

# Fit the detector to the data and obtain outlier labels
# (fit_predict returns -1 for outliers and 1 for inliers)
out = pd.Series(detector.fit_predict(df), index=df.index)

# Identify outliers: True where the label is -1, False otherwise
is_outlier = out == -1

# Keep a copy of the data with an 'is_outlier' marker column for inspection;
# the marker is not added to df itself so it does not end up in the saved dataset
df_outlier = df.copy()
df_outlier["is_outlier"] = is_outlier

# Get the indices of the rows identified as outliers
indices = is_outlier.index[is_outlier]

# Remove the outlier rows from the original DataFrame. A positional boolean
# mask is used instead of dropping by label, so that inliers sharing an index
# label with an outlier are not removed by accident.
df = df.loc[~is_outlier.to_numpy()]

## Save Preprocessed Dataset
Save the processed data in a new file. Rename if you need multiple files.

In [None]:
# Write the cleaned DataFrame to disk as CSV, leaving out the row index
df.to_csv(path_or_buf='../data/processed/processed.csv', index=False)