# Imports

In [1]:
import os
import tempfile
import shutil
from IPython.display import clear_output
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# MedMNIST
import medmnist

In [2]:
# Seed NumPy's global random generator once, up front, so that later random
# sampling which draws from it is repeatable when the notebook is run top to bottom.
np.random.seed(1)

# Anomaly Detection Data

**Dataset Source:**

The data used is provided by Loghub, which maintains a collection of system logs that are freely accessible for AI-driven log analytics research. The logs are a combination of production data released from previous studies and real systems in their lab environment. The logs are not sanitized, anonymized, or modified in any way, wherever possible. These log datasets are freely available for research or academic work.

https://github.com/logpai/loghub/tree/master

**Dataset  1:**

_Android_

Loghub Description:

Android (https://www.android.com) is a popular open-source mobile operating system and has been used by many smart devices. However, Android logs are rarely available in public for research purposes. We provide some Android log files, which were collected by Android smartphones with heavily instrumented modules installed. The Android architecture comprises of five levels, including the Linux Kernel, Libraries, Application Framework, Android Runtime, and System Applications. We provide a sample log file printed by the Application Framework.

Training on logs: [Info (I), Debug (D), Verbose (V)]

Test detecting: [Warn (W), Error (E)]

https://github.com/logpai/loghub/blob/master/Android/Android_2k.log_structured.csv

**Dataset 2:**

_BGL_

Loghub Description:

BGL is an open dataset of logs collected from a BlueGene/L supercomputer system at Lawrence Livermore National Labs (LLNL) in Livermore, California, with 131,072 processors and 32,768GB memory. The log contains alert and non-alert messages identified by alert category tags. In the first column of the log, "-" indicates non-alert messages while others are alert messages. The label information is amenable to alert detection and prediction research. It has been used in several studies on log parsing, anomaly detection, and failure prediction.

Training on logs: [Info, Warning]

Test detecting: [Error, Fatal, Severe]

https://github.com/logpai/loghub/blob/master/BGL/BGL_2k.log_structured.csv

**Dataset 3:**

_Hadoop_

Loghub Description:

Hadoop is a big data processing framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. The logs are generated from a Hadoop cluster with 46 cores across five machines simulating both normal and abnormal cases with injected specific failures for two applications (WordCount & PageRank).

Training on logs: [Info, Warn]

Test detecting: [Error, Fatal]

https://github.com/logpai/loghub/blob/master/Hadoop/Hadoop_2k.log_structured.csv

In [3]:
def _download_data_from_github(
        url: str
    ):
    """
    Clone a git repository and load every csv file whose name ends with
    'structured.csv' into a dataframe

    Input:
        url: url of the git repository to clone

    Output:
        data_dictionary: dictionary of dataframes, keyed by the lower-cased
            part of the csv file name before its first underscore
            (e.g. 'Android_2k.log_structured.csv' -> 'android')

    Raises:
        subprocess.CalledProcessError: if `git clone` exits with an error
        FileNotFoundError: if the `git` executable cannot be found
    """
    # imported locally so the function does not rely on IPython's `!` shell syntax
    import subprocess

    # store dataframes in a dictionary
    data_dictionary = {}
    # the context manager deletes the temporary folder even when cloning or
    # parsing raises, so a failed run does not leave a full clone behind
    with tempfile.TemporaryDirectory() as tmp_dir:
        # clone github repo to temporary folder; the argument list (plus `--`)
        # keeps the url from being interpreted by a shell or as a git option,
        # and check=True surfaces a failed clone immediately
        subprocess.run(['git', 'clone', '--', url, tmp_dir], check=True)
        # walk all folders of the clone and read every csv file ending with structured.csv
        for root, _dirs, files in os.walk(tmp_dir):
            for file in files:
                if file.endswith('structured.csv'):
                    log_name = file.split('_')[0].lower()
                    data_dictionary[log_name] = pd.read_csv(os.path.join(root, file))

    clear_output()

    # return dictionary of dataframes
    return data_dictionary

########################################################################################################################

def _split_data(
        data_dictionary: dict,
        n_test_per_class: int = 50,
        normal_levels: tuple = ('INFO', 'WARN', 'WARNING', 'I', 'D', 'V')
    ):
    """
    Split the log messages of each dataset into a normal-only train set and a
    balanced test set

    Rows whose 'Level' is in `normal_levels` are treated as normal; every
    other row is treated as an anomaly. Only the 'Content' column is kept.

    Input:
        data_dictionary: dictionary of dataframes with the keys 'android',
            'bgl' and 'hadoop'; each dataframe needs 'Level' and 'Content' columns
        n_test_per_class: number of normal and of anomaly messages sampled
            into the test set (default 50 each)
        normal_levels: log levels regarded as normal

    Output:
        datasets: dictionary of train and test sets; per dataset
            'xtrain' (normal messages not used for testing),
            'xtest' (sampled normal messages followed by sampled anomalies) and
            'ytest' (0.0 for normal, 1.0 for anomaly, in the order of 'xtest')

    Raises:
        ValueError: if a dataset has fewer than `n_test_per_class` normal or
            anomaly messages
    """

    levels = list(normal_levels)
    # dictionary of train and test sets
    splits = {}
    # loop through datasets
    for d in ['android', 'bgl', 'hadoop']:
        df = data_dictionary[d]
        # compute the normal/anomaly mask once and reuse it for both halves
        is_normal = df['Level'].isin(levels)
        normal_data = df.loc[is_normal, 'Content']
        anomaly_data = df.loc[~is_normal, 'Content']
        # fail with a message that names the dataset instead of a bare sampling error
        if len(normal_data) < n_test_per_class or len(anomaly_data) < n_test_per_class:
            raise ValueError(
                f"dataset '{d}' has {len(normal_data)} normal and {len(anomaly_data)} "
                f"anomaly messages; need at least {n_test_per_class} of each"
            )
        # normal messages are sampled before anomalies so that results under a
        # fixed global NumPy seed stay the same as before
        normal_sample = normal_data.sample(n_test_per_class)
        anomaly_sample = anomaly_data.sample(n_test_per_class)
        xtest = pd.concat([normal_sample, anomaly_sample])
        ytest = np.concatenate([np.zeros(n_test_per_class), np.ones(n_test_per_class)])
        # train dataset is the remaining normal data
        xtrain = normal_data.drop(normal_sample.index)
        # add train and test sets to dictionary
        splits[d] = {'xtrain': xtrain, 'xtest': xtest, 'ytest': ytest}
    # return dictionary of train and test sets
    return splits

########################################################################################################################

def _data_preprocessing(
        data_dictionary: dict,
    ):
    '''
    Clean the raw log messages of the 'android', 'bgl' and 'hadoop' datasets

    For the 'xtrain' and 'xtest' entry of each dataset the text is, in order:
    split at CamelCase boundaries, stripped of every character that is not an
    ASCII letter (digits and punctuation become spaces), collapsed to single
    spaces, lower-cased and trimmed.

    Input:
        data_dictionary: dictionary holding, per dataset, pandas Series of
            strings under 'xtrain' and 'xtest'

    Output:
        data_dictionary: the same dictionary object, with 'xtrain' and 'xtest'
            replaced by their cleaned versions
    '''

    # ordered (pattern, replacement) steps - each step works on the output of the previous one
    cleaning_steps = (
        # insert a space at CamelCase boundaries
        (r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' '),
        # turn everything that is not a letter into a space
        (r'[^a-zA-Z]', ' '),
        # squeeze runs of whitespace
        (r'\s+', ' '),
    )

    for name in ('android', 'bgl', 'hadoop'):
        splits = data_dictionary[name]
        for key in ('xtrain', 'xtest'):
            text = splits[key]
            for pattern, substitute in cleaning_steps:
                text = text.str.replace(pattern, substitute, regex=True)
            # lowercase, then drop leading and trailing spaces, and store the result back
            splits[key] = text.str.lower().str.strip()

    return data_dictionary

########################################################################################################################

def _tfidf_vectorizer(
        data_dictionary: dict,
        max_features: int = 1000
    ):
    '''
    Turn the text of each log dataset into dense TF-IDF arrays

    A separate TfidfVectorizer is fitted per dataset on 'xtrain' only and then
    applied to both 'xtrain' and 'xtest'.

    Input:
        data_dictionary: dictionary with, per dataset ('android', 'bgl',
            'hadoop'), text under 'xtrain' and 'xtest'
        max_features: maximum number of features passed to TfidfVectorizer

    Output:
        data_dictionary: the same dictionary object, with 'xtrain' and 'xtest'
            replaced by the arrays returned from `.toarray()`
    '''
    for name in ('android', 'bgl', 'hadoop'):
        splits = data_dictionary[name]
        train_text = splits['xtrain']
        test_text = splits['xtest']
        # the vocabulary and idf weights come from the training text only
        vectorizer = TfidfVectorizer(max_features=max_features)
        vectorizer.fit(train_text)
        train_matrix = vectorizer.transform(train_text).toarray()
        test_matrix = vectorizer.transform(test_text).toarray()
        # both entries are swapped only after both transforms have succeeded
        splits['xtrain'], splits['xtest'] = train_matrix, test_matrix
    return data_dictionary

########################################################################################################################

def _save_data(
        data_dictionary: dict
    ):
    '''
    Write the train/test arrays of each log dataset to an .npz file

    One file per dataset is written to the current working directory, named
    after the title-cased dataset key ('android' -> 'Android.npz'), with the
    arrays stored under 'xtrain', 'xtest' and 'ytest'.

    Input:
        data_dictionary: dictionary with, per dataset ('android', 'bgl',
            'hadoop'), the entries 'xtrain', 'xtest' and 'ytest'

    Output:
        None
    '''

    stored_keys = ('xtrain', 'xtest', 'ytest')
    for name in ('android', 'bgl', 'hadoop'):
        splits = data_dictionary[name]
        arrays = {key: splits[key] for key in stored_keys}
        np.savez(f'{name.title()}.npz', **arrays)

########################################################################################################################
    
def load_data_wrapper(
        github_url: str
    ):
    '''
    Run the complete log-data pipeline: download, split, clean, vectorize, save

    The helper functions are looked up by name when this function is called.
    This notebook later defines other functions with some of the same names
    (for the image data), so this wrapper has to be called before those
    definitions are executed.

    Input:
        github_url: github url of the repository holding the log csv files

    Output:
        None
    '''

    # download data from github
    data_dictionary = _download_data_from_github(github_url)
    # split -> preprocess -> vectorize, each stage consuming the previous stage's output
    for stage in (_split_data, _data_preprocessing, _tfidf_vectorizer):
        data_dictionary = stage(data_dictionary)
    # save data
    _save_data(data_dictionary)
    # print success message
    print('Data successfully loaded!')

In [4]:
# Run the log pipeline on the Loghub repository: clone it, split, clean and
# vectorize the three log datasets, then save each one as an .npz file.
load_data_wrapper('https://github.com/logpai/loghub.git')

Data successfully loaded!


# Image Classification Data

**Dataset Source:**

The data used is provided by MedMNIST v2, a comprehensive collection of standardized biomedical images. It encompasses 12 datasets for 2D and 6 for 3D images, pre-processed into 28 x 28 (2D) or 28 x 28 x 28 (3D) with corresponding classification labels. With 708,069 2D images and 9,998 3D images, it supports various classification tasks, from binary/multi-class to ordinal regression and multi-label, catering to biomedical image analysis, computer vision, and machine learning research and education.

https://medmnist.com/

**Dataset  1:**

_PneumoniaMNIST_

MedMNIST Description:

The PneumoniaMNIST is based on a prior dataset of 5,856 pediatric chest X-Ray images. The task is binary-class classification of pneumonia against normal. We split the source training set with a ratio of 9:1 into training and validation set and use its source validation set as the test set. The source images are gray-scale, and their sizes are (384−2,916)×(127−2,713). We center-crop the images and resize them into 1×28×28.

https://zenodo.org/records/6496656/files/pneumoniamnist.npz?download=1

**Dataset 2:**

_BreastMNIST_

MedMNIST Description:

The BreastMNIST is based on a dataset of 780 breast ultrasound images. It is categorized into 3 classes: normal, benign, and malignant. As we use low-resolution images, we simplify the task into binary classification by combining normal and benign as positive and classifying them against malignant as negative. We split the source dataset with a ratio of 7:1:2 into training, validation and test set. The source images of 1×500×500 are resized into 1×28×28.

https://zenodo.org/records/6496656/files/breastmnist.npz?download=1

**Dataset 3:**

_OrganCMNIST_

MedMNIST Description:

The OrganCMNIST is based on 3D computed tomography (CT) images from Liver Tumor Segmentation Benchmark (LiTS). It is renamed from OrganMNIST_Coronal (in MedMNIST v1) for simplicity. We use bounding-box annotations of 11 body organs from another study to obtain the organ labels. Hounsfield-Unit (HU) of the 3D images are transformed into gray-scale with an abdominal window. We crop 2D images from the center slices of the 3D bounding boxes in coronal views (planes). The images are resized into 1×28×28 to perform multi-class classification of 11 body organs. 115 and 16 CT scans from the source training set are used as training and validation set, respectively. The 70 CT scans from the source test set are treated as the test set.

https://zenodo.org/records/6496656/files/organcmnist.npz?download=1

In [5]:
def _download_data_from_medmnist(
        dataset_list: list
    ):
    '''
    Fetch the train, val and test splits of the requested MedMNIST datasets

    Input:
        dataset_list: list of dataset class names to look up on the
            `medmnist` module (e.g. 'PneumoniaMNIST')

    Output:
        data_dictionary: dictionary with, per dataset name, the images under
            'xtrain', 'xval', 'xtest' and the labels under 'ytrain', 'yval',
            'ytest'
    '''

    def _as_grayscale(images):
        # a trailing axis of length 3 is taken to be colour channels and averaged away
        if images.shape[-1] == 3:
            return np.mean(images, axis=-1)
        return images

    data_dictionary = {}
    for name in dataset_list:
        # resolve the dataset class from its name
        dataset_cls = getattr(medmnist, name)
        # instantiate the three splits with download=True, in train/val/test order
        parts = {
            split: dataset_cls(split=split, download=True)
            for split in ('train', 'val', 'test')
        }
        data_dictionary[name] = {
            'xtrain': _as_grayscale(parts['train'].imgs),
            'xval': _as_grayscale(parts['val'].imgs),
            'xtest': _as_grayscale(parts['test'].imgs),
            'ytrain': parts['train'].labels,
            'yval': parts['val'].labels,
            'ytest': parts['test'].labels,
        }

    # wipe the download progress output from the notebook cell
    clear_output()

    return data_dictionary

########################################################################################################################

def _split_data(
        data_dictionary: dict
    ):
    '''
    Pool the original splits of each dataset and re-split them into train,
    val and test sets

    Per dataset, all images and labels are concatenated, 100 samples are held
    out as the test set, and 20% of the remainder becomes the validation set.
    Both splits are stratified by label. A channel axis of length 1 is then
    inserted at position 1 of every image array.

    Input:
        data_dictionary: dictionary with, per dataset, arrays under 'xtrain',
            'xval', 'xtest', 'ytrain', 'yval' and 'ytest'

    Output:
        data_dictionary: new dictionary with the same keys holding the
            re-split arrays
    '''

    resplit = {}
    for name, parts in data_dictionary.items():
        # pool the original splits
        images = np.concatenate([parts[key] for key in ('xtrain', 'xval', 'xtest')], axis=0)
        labels = np.concatenate([parts[key] for key in ('ytrain', 'yval', 'ytest')], axis=0)
        # hold out 100 images for testing, then 20% of the rest for validation
        x_rest, x_test, y_rest, y_test = train_test_split(
            images, labels, test_size=100, stratify=labels
        )
        x_train, x_val, y_train, y_val = train_test_split(
            x_rest, y_rest, test_size=0.2, stratify=y_rest
        )
        # insert the channel axis: (channels=1, height=28, width=28)
        resplit[name] = {
            'xtrain': np.expand_dims(x_train, axis=1),
            'xval': np.expand_dims(x_val, axis=1),
            'xtest': np.expand_dims(x_test, axis=1),
            'ytrain': y_train,
            'yval': y_val,
            'ytest': y_test,
        }
    return resplit

########################################################################################################################

def _data_preprocessing(
        data_dictionary: dict,
    ):
    '''
    Scale the image arrays of every dataset by 1/255 as float32

    Input:
        data_dictionary: dictionary with, per dataset, image arrays under
            'xtrain', 'xval' and 'xtest'

    Output:
        data_dictionary: the same dictionary object, with the three image
            entries replaced by float32 arrays divided by 255; all other
            entries are left untouched
    '''

    image_keys = ('xtrain', 'xval', 'xtest')
    for splits in data_dictionary.values():
        for key in image_keys:
            # cast first so the division is carried out on float32 values
            as_float = splits[key].astype('float32')
            splits[key] = as_float / 255.
    return data_dictionary

########################################################################################################################

def _save_data(
        data_dictionary: dict
    ):
    '''
    Write the train, val and test arrays of each dataset to an .npz file

    One file named '<dataset key>.npz' is written to the current working
    directory per dataset, with the arrays stored under 'xtrain', 'xval',
    'xtest', 'ytrain', 'yval' and 'ytest'.

    Input:
        data_dictionary: dictionary with, per dataset, the six entries
            listed above

    Output:
        None
    '''

    stored_keys = ('xtrain', 'xval', 'xtest', 'ytrain', 'yval', 'ytest')
    for name, splits in data_dictionary.items():
        arrays = {key: splits[key] for key in stored_keys}
        np.savez(f'{name}.npz', **arrays)

########################################################################################################################
            
def load_data_wrapper(
        dataset_list: list
    ):
    '''
    Run the complete image-data pipeline: download, re-split, normalize, save

    Input:
        dataset_list: list of MedMNIST dataset names to process

    Output:
        None
    '''

    # download data from MedMNIST
    data_dictionary = _download_data_from_medmnist(dataset_list)
    # split -> preprocess, each stage consuming the previous stage's output
    for stage in (_split_data, _data_preprocessing):
        data_dictionary = stage(data_dictionary)
    # save data
    _save_data(data_dictionary)
    # print success message
    print('Data successfully loaded!')

In [6]:
# Download, re-split, normalize and save each listed MedMNIST dataset as '<name>.npz'.
load_data_wrapper(['PneumoniaMNIST', 'BreastMNIST', 'OrganCMNIST', 'BloodMNIST', 'DermaMNIST'])

Data successfully loaded!
