In [1]:
import sys; sys.path.append('../')

import time

from src.data_loader import load_data

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from multiprocessing.pool import Pool
from functools import partial

import warnings
warnings.filterwarnings('ignore')

import ipywidgets as widgets
from ipywidgets import interact, interact_manual



In [2]:
# Default size (width, height) applied to every figure created after this cell.
plt.rcParams.update({"figure.figsize": (20, 10)})

# Minority class identification

In [3]:
# load_data returns two values; only the first is kept (as `clusters`), the second is discarded.
clusters, _ = load_data('../data/huge_sample_input_classified.txt')

In [4]:
# Remove every column that holds a single distinct value: it carries no
# information for distance computations.
constant_columns = [
    name for name in clusters.columns
    if len(clusters[name].unique()) == 1
]
clusters.drop(columns=constant_columns, inplace=True)

In [5]:
# Binarize the target: True for every row whose class is neither
# 'Vegetation' nor 'Limit_effect', False otherwise.
clusters['classification'] = ~clusters['classification'].isin(['Vegetation', 'Limit_effect'])

In [6]:
def get_label(p):
    """Map a proportion ``p`` to a category name.

    Returns 'Outlier' when ``p < 1/5``, 'Rare' when ``p < 2/5``,
    'Border Line' when ``p < 4/5`` and 'Safe' for anything else.
    """
    # Exclusive upper bounds, checked in ascending order; first match wins.
    upper_bounds = (
        (1/5, 'Outlier'),
        (2/5, 'Rare'),
        (4/5, 'Border Line'),
    )
    for bound, name in upper_bounds:
        if p < bound:
            return name
    return 'Safe'

def calculate_ordered_distances(i, df):
    """Return every other row of ``df`` as ``(distance, classification)`` pairs, nearest first.

    Parameters
    ----------
    i : index label
        Label of the reference row in ``df``.
    df : pandas.DataFrame
        Frame with a 'classification' column; every remaining column is
        used as a numeric coordinate for the Euclidean distance.

    Returns
    -------
    list of tuple
        One ``(distance, classification)`` pair for each row whose label
        is not ``i``, sorted by ascending distance to row ``i``.
    """
    # Build the mask from df's own index rather than a notebook-level global,
    # so the function works on whichever frame is passed in (including inside
    # worker processes).
    others = ~df.index.isin([i])

    features = df.drop(columns='classification')
    reference = features.loc[i].to_numpy(dtype=float)
    deltas = features.loc[others].to_numpy(dtype=float) - reference

    # One vectorised norm over all rows instead of a Python-level row loop.
    distances_to_others = np.linalg.norm(deltas, axis=1)
    classes = df.loc[others, 'classification'].tolist()

    # Stable sort keeps equidistant rows in their original frame order.
    order = np.argsort(distances_to_others, kind='stable')
    return [(distances_to_others[j], classes[j]) for j in order]


@interact
def show_types_of_minorities(k=(2, 50, 1)):
    """Draw a pie chart of the label distribution among the True-class rows.

    For each row of ``clusters`` whose 'classification' is True, the
    classifications of its ``k`` nearest rows are averaged and passed to
    ``get_label``; the chart shows how often each label occurs.

    The sorted neighbour lists are computed once with a 3-process pool and
    kept in the module-level ``distances``; later calls (e.g. moving the
    slider) reuse them.
    """
    global distances
    # `distances` is declared global, so it can only ever live in globals().
    if 'distances' not in globals():
        minority_index = clusters[clusters['classification'] == True].index
        worker = partial(calculate_ordered_distances, df=clusters)
        with Pool(3) as pool:
            distances = pool.map(worker, minority_index)

    labels = []
    for neighbours in distances:
        nearest_classes = [cls for _, cls in neighbours[:k]]
        labels.append(get_label(sum(nearest_classes) / len(nearest_classes)))

    element_types, counts = np.unique(labels, return_counts=True)
    plt.pie(counts, labels=element_types, autopct='%1.1f%%')
    plt.legend()
    

interactive(children=(IntSlider(value=26, description='k', max=50, min=2), Output()), _dom_classes=('widget-in…

Because the dataset consists almost entirely of rare and outlier examples, we must apply the methods in the following table:

| PreProcess | Average Rank |
|------------|--------------|
| SMOTE      | 3.9          |
| SPIDER     | 3.8          |
| NCR        | 3.4          |

Extracted from _"Napierala 2015"_