Created by

Asif Newaz

www.asifnewaz.com

# Function for minority-class instance categorization

In [19]:
# -*- coding: utf-8 -*-
"""
Created on Wed May  1 01:52:45 2024

@author: asifn
"""
# this function divides minority class instances into four categories: Safe, Border, Rare, and Outlier.

from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd


def categorize_minority_class(data, n_neighbors=6, minority_label=1):
    """Categorize each minority-class instance as Safe, Border, Rare, or Outlier.

    For every minority instance the ``n_neighbors`` nearest rows of ``data``
    are retrieved and the neighbours that do NOT carry ``minority_label`` are
    counted. The count is mapped to a one-letter code:

        fewer than 2 -> 's' (Safe)
        2 or 3       -> 'b' (Border)
        4            -> 'r' (Rare)
        more than 4  -> 'o' (Outlier)

    The cut-offs are fixed counts, so they are calibrated for the default
    ``n_neighbors=6``; other values keep the same cut-offs and are not rescaled.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by the class label in the LAST column.
    n_neighbors : int, default 6
        Number of neighbours retrieved per minority instance. The query points
        are themselves rows of the fitted data, so an instance is normally
        returned as its own nearest neighbour; being a minority row it never
        adds to the count (the default 6 therefore inspects 5 other rows).
    minority_label : scalar, default 1
        Value in the label column that marks the minority class. Every other
        label value is treated as majority.

    Returns
    -------
    out : list of str
        One code ('s', 'b', 'r' or 'o') per minority instance, in row order.
    num_count : dict
        Number of instances per category, keyed by 'Safe', 'Border', 'Rare'
        and 'Outlier'.
    """
    category_names = ('Safe', 'Border', 'Rare', 'Outlier')
    category_codes = np.array(['s', 'b', 'r', 'o'])

    features = data.iloc[:, :-1]
    labels = data.iloc[:, -1].to_numpy()

    # Positional (not index-label) row numbers of the minority instances.
    minority_pos = np.flatnonzero(labels == minority_label)
    if minority_pos.size == 0:
        # Nothing to categorize; querying neighbours for zero rows would raise.
        return [], {name: 0 for name in category_names}

    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(features)
    neighbor_pos = knn.kneighbors(features.iloc[minority_pos],
                                  return_distance=False)

    # Per minority instance: how many retrieved neighbours are not minority.
    majority_counts = np.sum(labels[neighbor_pos] != minority_label, axis=1)

    # Bin edges [2, 4, 5]: <2 -> 0 (s), 2-3 -> 1 (b), 4 -> 2 (r), >=5 -> 3 (o).
    bins = np.digitize(majority_counts, [2, 4, 5])
    out = category_codes[bins].tolist()

    tallies = np.bincount(bins, minlength=len(category_names))
    num_count = {name: int(total)
                 for name, total in zip(category_names, tallies)}
    return out, num_count

# Use case on a real-world dataset

Test on the poker-86 dataset

In [8]:
# Load the poker-86 dataset; the last column ('Class') holds the label.
data = pd.read_csv('poker_86.csv')
data

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,Class
0,4,5,1,9,3,5,4,9,2,9,0
1,4,11,4,5,2,11,3,11,1,5,0
2,3,1,4,1,3,4,1,4,2,1,0
3,3,5,2,5,3,1,2,1,4,1,0
4,4,13,2,13,4,7,3,7,1,13,0
...,...,...,...,...,...,...,...,...,...,...,...
1472,4,10,2,11,3,11,3,10,1,10,0
1473,1,4,3,3,4,4,1,3,2,4,0
1474,1,10,3,3,3,10,2,10,4,3,0
1475,1,6,2,6,4,12,3,6,1,12,0


In [9]:
# Count the instances per class to see how imbalanced the data is.
data['Class'].value_counts()

0    1460
1      17
Name: Class, dtype: int64

So, there are 10 features. The data is quite imbalanced with only 17 instances in the minority class.

In [20]:
# Categorize the minority instances using the default n_neighbors=6.
categorize_minority_class(data)

(['r',
  'o',
  'o',
  'o',
  'o',
  'o',
  'r',
  'o',
  'r',
  'o',
  'o',
  'o',
  'r',
  'o',
  'o',
  'r',
  'o'],
 {'Safe': 0, 'Border': 0, 'Rare': 5, 'Outlier': 12})

So, of the 17 minority-class instances in this dataset, 5 fall into the Rare category and 12 into the Outlier category; none are Safe or Border.

# Another example

In [21]:
# Load the ecoli3 dataset; this rebinds `data`, replacing the poker-86 frame.
data = pd.read_csv('ecoli3.csv')
data

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,Class
0,0.68,0.49,1.00,0.5,0.62,0.55,0.28,0
1,0.75,0.84,0.48,0.5,0.35,0.52,0.33,0
2,0.52,0.44,0.48,0.5,0.37,0.36,0.42,0
3,0.87,0.49,0.48,0.5,0.61,0.76,0.79,1
4,0.41,0.51,0.48,0.5,0.58,0.20,0.31,0
...,...,...,...,...,...,...,...,...
331,0.64,0.81,0.48,0.5,0.37,0.39,0.44,0
332,0.71,0.71,0.48,0.5,0.68,0.43,0.36,0
333,0.53,0.38,0.48,0.5,0.44,0.26,0.36,0
334,0.86,0.39,0.48,0.5,0.59,0.89,0.90,1


In [22]:
# Count the instances per class to see how imbalanced the data is.
data['Class'].value_counts()

0    301
1     35
Name: Class, dtype: int64

In [23]:
# Categorize the minority instances using the default n_neighbors=6.
categorize_minority_class(data)

(['s',
  'b',
  's',
  's',
  'b',
  'b',
  'b',
  'o',
  'b',
  'b',
  's',
  'b',
  'o',
  'b',
  'r',
  's',
  'r',
  'b',
  's',
  'b',
  'b',
  'o',
  'b',
  's',
  'b',
  'b',
  'r',
  'b',
  's',
  'b',
  'b',
  's',
  'b',
  's',
  'b'],
 {'Safe': 10, 'Border': 19, 'Rare': 3, 'Outlier': 3})

In [24]:
# Keep only the per-category counts; the per-instance codes are discarded.
_, x= categorize_minority_class(data)

In [25]:
# Display the per-category counts captured in the previous cell.
x

{'Safe': 10, 'Border': 19, 'Rare': 3, 'Outlier': 3}