In [None]:
import pandas as pd
import numpy as np
import sys
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

#Function that fills the missing values with the mean of the variables
def fill_missing_values(df):
    """Fill missing feature values with the mean of their column.

    Every column except the last is treated as a feature; the last column
    (the target) is passed through unchanged.  Only numeric feature columns
    are imputed: non-numeric columns have no mean, so their missing values
    are left as they are.

    Args:
        df: DataFrame whose last column is the target.

    Returns:
        A new DataFrame with the same columns in the same order; ``df``
        itself is not modified.
    """
    features_df = df.iloc[:, :-1]
    target_df = df.iloc[:, -1]
    # numeric_only=True keeps text columns out of the mean computation.
    # This function runs before categorical encoding, and recent pandas
    # versions raise TypeError when asked for the mean of a text column.
    column_means = features_df.mean(numeric_only=True)
    features_df = features_df.fillna(column_means)
    return pd.concat([features_df, target_df], axis=1)

#Function that removes the duplicate instances
def remove_duplicates(data):
    """Return ``data`` without repeated rows.

    Rows are compared across all columns and the first occurrence of each
    distinct row is kept.  The original index labels are preserved and the
    input frame is left unchanged.
    """
    return data.drop_duplicates(keep="first")

#Function that encodes categorical data into numerical
def encode_categorical_data(data):
    """Replace categorical feature columns with integer category codes.

    Every column except the last (the target) is inspected.  Columns holding
    text/object values, or already using the pandas ``category`` dtype, are
    replaced by the codes of ``pd.Categorical``; missing values therefore
    become ``-1``.  Numeric columns and the target are left untouched.

    Args:
        data: DataFrame whose last column is the target.

    Returns:
        A new, encoded DataFrame.  The input frame is not modified, so the
        function is safe to call on slices of other frames.
    """
    # Work on a copy: assigning columns on the caller's object would change
    # it behind the caller's back (and warns when that object is a slice).
    encoded = data.copy()
    for column in encoded.columns[:-1]:
        dtype = encoded[column].dtype
        # Text may be stored as plain ``object`` or as a dedicated string
        # dtype depending on the pandas version and how the data was read.
        is_text = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if is_text or isinstance(dtype, pd.CategoricalDtype):
            encoded[column] = pd.Categorical(encoded[column]).codes
    return encoded

#Function that removes the outliers
def remove_outliers_iqr(df, iqr_factor=1.5):
    """Drop rows that contain an outlier in any feature column.

    Uses the interquartile-range rule: for each feature (every column except
    the last, which is the target) a value is an outlier when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR``, where
    ``IQR = Q3 - Q1``.  Missing values are never flagged as outliers.

    Args:
        df: DataFrame with numeric feature columns followed by the target.
        iqr_factor: Width of the accepted band in multiples of the IQR.
            Larger values keep more rows.

    Returns:
        A new DataFrame holding only the rows without outliers; the original
        index labels are kept and ``df`` is not modified.
    """
    features = df.iloc[:, :-1]
    if features.shape[1] == 0:
        # Only the target column is present: nothing can be an outlier.
        return df.copy()

    first_quartile = features.quantile(0.25)
    third_quartile = features.quantile(0.75)
    margin = (third_quartile - first_quartile) * iqr_factor

    # Compare each column against its own bounds (aligned on column labels).
    too_low = features.lt(first_quartile - margin, axis=1)
    too_high = features.gt(third_quartile + margin, axis=1)
    has_outlier = (too_low | too_high).any(axis=1)
    return df[~has_outlier]

#Function that standardizes the data
def data_standardization(df):
    """Standardize every feature column with ``StandardScaler``.

    Each column except the last (the target) is fitted and transformed on
    its own, using only the values present in ``df``.

    NOTE(review): the scaler is fitted on whatever frame is passed in, so
    training and testing data are each scaled with their own statistics
    when this function is called on them separately - confirm that this is
    intended.

    Args:
        df: DataFrame with numeric feature columns followed by the target.

    Returns:
        A new DataFrame with standardized features and the untouched target.
        The input frame is not modified.
    """
    # Work on a copy: the frame handed in is often a filtered slice of
    # another frame, and assigning columns on it would modify the caller's
    # data (and trigger pandas chained-assignment warnings).
    standardized = df.copy()
    scaler = StandardScaler()
    for column in standardized.columns[:-1]:
        # fit_transform expects 2-D input, hence the double brackets; the
        # single resulting column is flattened back with [:, 0].
        standardized[column] = scaler.fit_transform(standardized[[column]])[:, 0]
    return standardized

#Function that balances the data using the Synthetic over sampling technique (SMOTE)
def balance_with_smote(df, random_state=None):
    """Balance the target classes by oversampling with SMOTE.

    The last column of ``df`` is used as the class label and all other
    columns as features.

    Args:
        df: DataFrame with numeric feature columns followed by the target.
        random_state: Seed forwarded to ``SMOTE``.  Pass an integer to get
            the same synthetic samples on every run; the default ``None``
            keeps the sampler unseeded.

    Returns:
        A new DataFrame with the same column names as ``df`` containing the
        resampled features and target.
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    df_balanced = pd.concat([pd.DataFrame(X_resampled), y_resampled], axis=1)
    # Restore the original labels regardless of how the sampler named them.
    df_balanced.columns = df.columns
    return df_balanced

#Function that performs filter-based feature selection based on the F-statistic and chooses the top 10 features
def feature_selection(training_data, k=10):
    """Keep the ``k`` features with the highest ANOVA F-statistic.

    The last column of ``training_data`` is the target.  Features holding a
    single distinct value are discarded first, the remaining ones are scored
    with ``f_classif`` against the target, and the ``k`` best-scoring
    features are kept.

    Args:
        training_data: DataFrame with numeric feature columns followed by
            the target.
        k: Number of features to keep.  If fewer candidates are available,
            all of them are returned.

    Returns:
        A new DataFrame with the selected features, ordered from highest to
        lowest score, followed by the target column.  ``training_data`` is
        not modified.
    """
    target_column = training_data.columns[-1]
    features = training_data.iloc[:, :-1]
    target = training_data.iloc[:, -1]

    # Only features are checked for constancy (the target must never be
    # dropped), and filtering builds a new frame instead of deleting
    # columns from the caller's DataFrame.
    features = features.loc[:, features.nunique() != 1]

    f_values, _ = f_classif(features, target)
    ranking = pd.Series(f_values, index=features.columns)
    ranking = ranking.sort_values(ascending=False)
    best_features = list(ranking.index[:k])
    return training_data[best_features + [target_column]]
        
#Function that fits the test features with the training features
def remove_unused_features(training_data, testing_data):
    """Restrict the testing data to the columns of the training data.

    Args:
        training_data: DataFrame whose columns define what to keep.
        testing_data: DataFrame to be reduced; it must contain every column
            of ``training_data``.

    Returns:
        A new DataFrame holding the columns of ``training_data``, in the
        same order, taken from ``testing_data``.

    Raises:
        KeyError: If ``testing_data`` lacks one of the training columns.
    """
    training_cols = training_data.columns.tolist()
    missing = [col for col in training_cols if col not in testing_data.columns]
    if missing:
        raise KeyError(f"testing_data is missing training columns: {missing}")
    # A single selection both filters and reorders the columns.
    return testing_data[training_cols]
    
#Function that preprocesses the data
def data_preprocess_for_the_supervised_algorithms(training_data, testing_data, flag):
    """Run the preprocessing pipeline on the training or the testing data.

    Both modes apply, in order: missing-value filling, duplicate removal,
    categorical encoding, outlier removal and standardization.

    Args:
        training_data: Training DataFrame.  When ``flag == 1`` it is the
            frame being processed; otherwise only its columns are used to
            select the columns of the processed testing data.
        testing_data: Testing DataFrame; only used when ``flag != 1``.
        flag: ``1`` to preprocess the training data (the shared steps
            followed by SMOTE balancing and feature selection); any other
            value to preprocess the testing data.

    Returns:
        The processed training DataFrame when ``flag == 1``, otherwise the
        processed testing DataFrame restricted to the training columns.
    """
    shared_steps = (
        fill_missing_values,
        remove_duplicates,
        encode_categorical_data,
        remove_outliers_iqr,
        data_standardization,
    )

    if flag == 1:
        prepared = training_data
        for step in shared_steps + (balance_with_smote, feature_selection):
            prepared = step(prepared)
        return prepared

    prepared = testing_data
    for step in shared_steps:
        prepared = step(prepared)
    return remove_unused_features(training_data, prepared)