### Load Libraries

In [17]:
import scipy.io
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools
import pyod
from pathlib import Path
from pyod.models.iforest import IForest
import string

### Load File

In [2]:
path_data = Path("C:/Users/Florentina/Documents/Uni CSS/3. Semester/kddm2/Data/letter-recognition.data")

header = list(range(1, 17))
header = ["letter"] + header

letter_df_complete = pd.read_csv(path_data, 
sep = ",", 
names = header)

### Data Preparation

In [3]:
from sklearn.model_selection import train_test_split

def prepare_dataset(letter: str, dataset: pd.DataFrame =letter_df_complete, test_prob: float = 0.2, 
    rand_state: int = 0) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Filters dataset for specific letter and splits into training and testing dataset.
    
    Args:
        letter (str): letter of interest.
        dataset (pd.DataFrame): dataframe containing the data.
        test_prob (float): percentage of data reserved for testing.
        rand_stat (int): random state for splitting the dataset.
        
    Returns:
        pd.DataFrame: dataframe filtered on letter.
        pd.DataFrame: dataframe for training.
        pd.DataFrame: dataframe for testing.
    """
    #Filter for the letter
    dataset_letter = dataset[dataset["letter"] == letter]

    #Create a copy of the dataset without the letter
    dataset_wo_let = dataset_letter.drop('letter', axis = 1)

    #Split the dataset into training and test data
    letter_train, letter_test = train_test_split(dataset_wo_let, test_size = test_prob, random_state = rand_state)

    return dataset_letter, letter_train, letter_test

### Statistics

In [7]:
def var_importance(iso_for: pyod.models.iforest, let_train: pd.DataFrame):
    """
    Determines the features' importance (https://towardsdatascience.com/use-the-isolated-forest-with-pyod-3818eea68f08).
    
    Args:
        iso_for(pyod.models.iforest): isolation forest.
        let_train(pd.DataFrame): training dataset.
        
    Returns:
        Any.
    """
    isft_vi = iso_for.feature_importances_

    for_plot = pd.DataFrame({'x_axis':let_train.columns,
              'y_axis':isft_vi}).sort_values(by='y_axis',ascending=True)
    for_plot['y_axis'].plot.barh()

def descriptive_stat_threshold(dataset: pd.DataFrame, pred_score: np.ndarray, threshold: float) -> pd.DataFrame:
    """
    Calculates the statistics (https://towardsdatascience.com/use-the-isolated-forest-with-pyod-3818eea68f08).
    
    Args:
        dataset (pd.DataFrame): (training/testing) dataset.
        pred_score (np.ndarray): predictive scores from the isolation forest.
        threshold (float): threshold for classifying a datapoint as an outlier.
        
    Returns:
        pd.DataFrame: dataframe containinf the statistics.
    """

    dataset['Anomaly_Score'] = pred_score
    dataset['Group'] = np.where(dataset['Anomaly_Score']< threshold, 'Normal', 'Outlier')

    # Calculate statistics:
    cnt = dataset.groupby('Group')['Anomaly_Score'].count().reset_index().rename(columns={'Anomaly_Score':'Count'})
    cnt['Count %'] = (cnt['Count'] / cnt['Count'].sum()) * 100
    stat = dataset.groupby('Group').mean().round(2).reset_index()
    stat = cnt.merge(stat, left_on='Group',right_on='Group')

    return (stat)


### Plotting

In [8]:
def plot_scores(dataset_scores: np.ndarray):
    """
    Plots scores (https://towardsdatascience.com/use-the-isolated-forest-with-pyod-3818eea68f08).
    
    Args:
        numpy.ndarray: numpy array containing scores.
        
    Returns:
        Any.
    """
    plt.hist(dataset_scores, bins='auto')
    plt.title("Outlier score")
    plt.show()

def plot_results(results_df: pd.DataFrame):
    """
    Creates a plot for the results.
    
    Args:
        results_df (pd.DataFrame): DataFrame containing the results.
        
    Returns:
        Any.
    """
    print(results_df.head())
    print(results_df['% Outliers'].describe())

    plt.hist(results_df['% Outliers'])
    plt.show()

    fig, ax = plt.subplots(figsize=(12, 6))

    x = np.arange(0, 900, 1)
    y = results_df['% Outliers']

    ax.plot(y, color='blue', label='Outliers')

    plt.ylim([9, 11])
    plt.show()

    plt.boxplot(results_df['% Outliers'])
    plt.show()

### Models

In [9]:
from pyod.models.iforest import IForest

def isolation_forest(train_data: pd.DataFrame, test_data: pd.DataFrame, cont: float = 0.05, max_feat: int = 1.0, 
    max_samp: int = 40, n_est: int = 100, random_state: int = 0) -> tuple[pyod.models.iforest, np.ndarray, np.ndarray]:
    """
    Creates a new isolation forest.
    
    Args:
        train_data (pd.DataFrame): dataframe for training.
        test_data (pd.DataFrame): dataframe for testing.
        cont (float): contamination (estimated % of outliers).
        max_feat (int): features to train the isolation forest.
        max_samp (int): samples to train the isolations forest (impacts the tree size).
        n_est (int): numbers of trees in the ensemble.
        rand_stat (int): random state.
        
    Returns:
        pyod.models.iforest: isolation forest.
        numpy.ndarray: numpy array containing training scores.
        numpy.ndarray: numpy array containing testing scores.
    """
    #Create a new iForest
    isft = IForest(behaviour='new', contamination=cont, max_features=max_feat, max_samples=max_samp, n_estimators=n_est)

    #Fit iForest
    isft.fit(train_data)

    #Training data
    y_train_scores = isft.decision_function(train_data)
    y_train_pred = isft.predict(train_data)

    #Test data
    y_test_scores = isft.decision_function(test_data)
    y_test_pred = isft.predict(test_data)

    return isft, y_train_scores, y_test_scores

### Algorithm

In [46]:
def algorithm(dataset_complete: pd.DataFrame = letter_df_complete, cont: float = 0.05, max_feat: int = 1.0, 
    max_samp: int = 40 , n_est: int = 100, random_state: int = 0, write_to_excel: bool = True):
    """
    Performs all necessary steps for determining outliers.
    
    Args:
        dataset_complete (pd.DataFrame): complete dataset.
        cont (float): contamination (estimated % of outliers).
        max_feat (int): features to train the isolation forest.
        max_samp (int): samples to train the isolations forest (impacts the tree size).
        n_est (int): numbers of trees in the ensemble.
        rand_state (int): random state.
        write_to_excel (bool): boolean variable if results should be written to excel.
        
    Returns:
        Any.
    """
    # Create dataframe for results
    alphabet = list(string.ascii_uppercase)
    results = pd.DataFrame(columns = ['letter', 'datapoints','outlier_abs', 'outlier_rel'])
    results['letter'] = alphabet

    # Loop though all letters
    for letter in alphabet:

        # Perform the isolation forest
        data_letter, letter_train, letter_test = prepare_dataset(letter, dataset_complete)
        iso_for, y_train, y_test = isolation_forest(letter_train, letter_test, cont, max_feat, 
            max_samp, n_est, random_state)
        threshold_outlier = iso_for.threshold_
        stats_df = descriptive_stat_threshold(letter_test, y_test, threshold_outlier)
        absolute = stats_df[stats_df['Group'] == 'Outlier']['Count']
        percentage = stats_df[stats_df['Group'] == 'Outlier']['Count %']
        total_data = stats_df['Count'].sum()

        results.loc[(results.letter == letter), 'datapoints'] = total_data
        results.loc[(results.letter == letter), 'outlier_abs'] = absolute
        results.loc[(results.letter == letter), 'outlier_rel'] = percentage

    exsheet_name = str(cont) + "_" + str(max_feat) + "_" + str(max_samp) + "_" + str(n_est) + "_" + str(random_state)
    if write_to_excel == True
        with pd.ExcelWriter("C:/Users/Florentina/Documents/Uni CSS/3. Semester/kddm2/Data/Results_Iso_For.xlsx", mode = 'a') as writer:
            results.to_excel(writer, sheet_name=exsheet_name)

    print(results)


### Incorporating Cross Validation

In [48]:
hyper_params = {
    'cont': [0.01, 0.025, 0.05, 0.075, 0.1],
    'max_feat': [1, 8, 16],
    'max_samp': [10, 50, 100],
    'max_est': [10, 25, 50, 100, 250],
    'random_state': [0, 1, 2, 3, 4, 5]
}

def cross_validation(params: dict = hyper_params, dataset_complete: pd.DataFrame = letter_df_complete, 
    write_to_excel = True):
    """
    Performs cross validation.
    
    Args:
        params (dict): dicitionary containing all hyperparamters.
        dataset_complete (pd.DataFrame): complete dataset.
        write_to_excel (bool): boolean variable if results should be written to excel.
        
    Returns:
        Any.
    """
    
    a = params.values()
    combinations = list(itertools.product(*a))
    for c in combinations:
        algorithm(letter_df_complete, cont=c[0], max_feat=c[1], max_samp=c[2], n_est=c[3], random_state=c[4])


In [49]:
cross_validation()

   letter datapoints outlier_abs outlier_rel
0       A        158         NaN         NaN
1       B        154         NaN         NaN
2       C        148         NaN         NaN
3       D        161         NaN         NaN
4       E        154         NaN         NaN
5       F        155         NaN         NaN
6       G        155         NaN         NaN
7       H        147         NaN         NaN
8       I        151         NaN         NaN
9       J        150         NaN         NaN
10      K        148         NaN         NaN
11      L        153         NaN         NaN
12      M        159         NaN         NaN
13      N        157         NaN         NaN
14      O        151         NaN         NaN
15      P        161         NaN         NaN
16      Q        157         NaN         NaN
17      R        152         NaN         NaN
18      S        150         NaN         NaN
19      T        160         NaN         NaN
20      U        163         NaN         NaN
21      V 