# Imports

In [1]:
# importing additional libraries

import sklearn
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from imblearn.pipeline import Pipeline
from imblearn.metrics import *

In [25]:
from icost import iCost   # importing the iCost method
from mst_linked_ind import mst_instance_complexity, compute_mst_linked_indices   # importing helper function for MST implementation
from categorize_minority_v2 import categorize_minority_class    # importing helper function to check minority categorization

# Processing 

In [3]:
# Feature scaler: min-max scaling to the default [0, 1] range.
sc = MinMaxScaler(feature_range=(0, 1))

In [4]:
# Performance measures passed to cross_validate via `scoring=`.
# - 'specificity' is recall computed with class 0 as the positive label.
# NOTE(review): 'roc' wraps roc_auc_score with make_scorer defaults, so it is
# evaluated on predicted labels rather than decision scores - confirm intended.
scores = dict(
    accuracy=make_scorer(accuracy_score),
    recall=make_scorer(recall_score),
    specificity=make_scorer(recall_score, pos_label=0),
    gmean=make_scorer(geometric_mean_score),
    roc=make_scorer(roc_auc_score),
    precision=make_scorer(precision_score),
    mcc=make_scorer(matthews_corrcoef),
    f1=make_scorer(f1_score),
)

# Defining iCost algorithm

In [17]:
# Candidate iCost configurations; every variant wraps its own default SVC.

# clf0: iCost defaults (method = original CS implementation of sklearn)
clf0 = iCost(SVC())

# clf1: no cost applied
clf1 = iCost(
    SVC(),
    method="ncs",
)

# clf2: standard sklearn-like cost-sensitive weighting
clf2 = iCost(
    SVC(),
    method="org",
)

# clf3: neighbor method, mode 1 => safe / pure / border categories
clf3 = iCost(
    SVC(),
    method="neighbor",
    neighbor_mode=1,
)

# clf4: neighbor method, mode 2 => safe / border / outlier categories
clf4 = iCost(
    SVC(),
    method="neighbor",
    neighbor_mode=2,
)

# clf5: neighbor method, mode 3 => cost applied individually.
# The cost values are selected based on the IR of the Ecoli dataset.
clf5 = iCost(
    SVC(),
    method="neighbor",
    neighbor_mode=3,
    neighbor_costs=[2, 2, 5, 7, 7, 2],
)

In [26]:
# MST variant: precompute the linked minority indices and hand them to iCost.
# NOTE(review): this cell reads `x` and `y`, which are only assigned in the
# "Data Preparation" section further down (this cell is In [26], that one In [9]).
# On a fresh kernel with Run All it raises NameError - run it after the data cell.
# NOTE(review): `linked` is computed on the full (x, y) while clf6 is later fitted
# on cross-validation folds; confirm iCost handles index_vec per fold as intended.
linked = compute_mst_linked_indices(x, y, positive_label=1)

# supply the precomputed linked minority indices via index_vec
clf6 = iCost(base_classifier=SVC(), method="mst", index_vec=linked)

# Testing on Ecoli dataset

## Data Preparation

In [6]:
# Load the Ecoli dataset from the working directory and display it.
data = pd.read_csv("ecoli2.csv")
data

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,Class
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,0
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44,0
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46,0
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,0
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,0
...,...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30,1
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32,1
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38,1
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37,1


In [7]:
# Class distribution of the target column.
data["Class"].value_counts()

Class
0    284
1     52
Name: count, dtype: int64

In [8]:
# Imbalance ratio = majority-class count / minority-class count.
# Derived from the loaded data rather than hard-coded, so it stays correct
# if the dataset file changes.
class_counts = data['Class'].value_counts()
Ir = float(class_counts.max() / class_counts.min())
Ir

5.461538461538462

In [9]:
# Preprocessing: the last column is the target, every other column is a feature.
# Features are scaled with the shared scaler `sc`, so `x` ends up as the scaler's
# transformed output while `y` stays a pandas Series.
# NOTE(review): the scaler is fitted on the full dataset before cross-validation,
# so test folds influence the scaling - confirm this is acceptable.
y = data.iloc[:, -1]
x = sc.fit_transform(data.iloc[:, :-1])

## Categorization of minority samples

In [10]:
# Categorize the minority samples, mode 1: (safe=0, pure=1–2, border=3–5).
# Other options noted by the author: mode=2 (safe=0–1, border=2–4, outlier=5),
# mode=3 (returns g1..g6 buckets), feature_cols=[...] to pick features explicitly,
# label_col="..." when the label is not the last column.
min_idx, groups, opp_counts = categorize_minority_class(
    data,
    minority_label=1,     # minority class = 1
    scale=True,           # z-score features before KNN (usually a good idea)
    mode=1,
    show_summary=True,
)

Category summary (minority samples):
  p: 19
  s: 26
  b: 7


In [13]:
# Report how many minority samples were categorized and the category of each one.
for _label, _value in (
    ("Minority sample count:", len(min_idx)),
    ("groups:", groups),
):
    print(_label, _value)

Minority sample count: 52
groups: ['p', 's', 's', 'p', 'p', 's', 'p', 'p', 's', 's', 'p', 's', 's', 's', 'p', 's', 'b', 's', 'p', 's', 's', 's', 'b', 's', 'p', 's', 'b', 's', 's', 'p', 'b', 's', 's', 'p', 's', 's', 'p', 'p', 'p', 'p', 's', 'b', 'p', 'p', 's', 'b', 'b', 'p', 's', 's', 'p', 's']


In [14]:
# Third value returned by categorize_minority_class (mode-1 run); presumably the
# number of opposite-class neighbours per minority sample - confirm in the helper.
opp_counts

array([1, 0, 0, 2, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 5, 0, 1, 0, 0, 0,
       3, 0, 1, 0, 3, 0, 0, 1, 5, 0, 0, 2, 0, 0, 1, 2, 1, 1, 0, 5, 2, 2,
       0, 3, 5, 1, 0, 0, 1, 0])

In [15]:
# Categorize the minority samples again, mode 2: (safe=0–1, border=2–4, outlier=5).
# This rebinds min_idx / groups / opp_counts, replacing the mode-1 results.
# minority_label and show_summary are passed explicitly so this call mirrors the
# mode-1 call instead of relying on the helper's defaults.
min_idx, groups, opp_counts = categorize_minority_class(
    data,
    minority_label=1,     # minority class = 1
    scale=True,           # z-score features before KNN (usually a good idea)
    mode=2,
    show_summary=True,    # print the per-category counts
)

Category summary (minority samples):
  s: 40
  b: 8
  o: 4


## Results

In [18]:
# clf0 (iCost defaults): repeated stratified k-fold CV (seed 50), mean of every column x100.
# NOTE: the x100 scaling also applies to fit_time / score_time, so those rows are not seconds.
score = cross_validate(
    clf0,
    x,
    y,
    scoring=scores,
    cv=RepeatedStratifiedKFold(random_state=50),
    n_jobs=-1,
)
df = pd.DataFrame(score)
df.mean().mul(100)

fit_time             0.450384
score_time           2.153012
test_accuracy       94.075066
test_recall         92.345455
test_specificity    94.397243
test_gmean          93.278748
test_roc            93.371349
test_precision      76.547309
test_mcc            80.557388
test_f1             83.177029
dtype: float64

In [19]:
# clf1 (method="ncs"): repeated stratified k-fold CV (seed 50), mean of every column x100.
# NOTE: the x100 scaling also applies to fit_time / score_time, so those rows are not seconds.
score = cross_validate(
    clf1,
    x,
    y,
    scoring=scores,
    cv=RepeatedStratifiedKFold(random_state=50),
    n_jobs=-1,
)
df = pd.DataFrame(score)
df.mean().mul(100)

fit_time             0.311175
score_time           2.092422
test_accuracy       95.269974
test_recall         79.400000
test_specificity    98.171053
test_gmean          88.130358
test_roc            88.785526
test_precision      89.625974
test_mcc            81.491898
test_f1             83.718643
dtype: float64

In [20]:
# clf2 (method="org"): repeated stratified k-fold CV (seed 50), mean of every column x100.
# NOTE: the x100 scaling also applies to fit_time / score_time, so those rows are not seconds.
score = cross_validate(
    clf2,
    x,
    y,
    scoring=scores,
    cv=RepeatedStratifiedKFold(random_state=50),
    n_jobs=-1,
)
df = pd.DataFrame(score)
df.mean().mul(100)

fit_time             0.351019
score_time           1.987654
test_accuracy       94.075066
test_recall         92.345455
test_specificity    94.397243
test_gmean          93.278748
test_roc            93.371349
test_precision      76.547309
test_mcc            80.557388
test_f1             83.177029
dtype: float64

In [21]:
# clf3 (neighbor, mode 1): repeated stratified k-fold CV (seed 50), mean of every column x100.
# NOTE: the x100 scaling also applies to fit_time / score_time, so those rows are not seconds.
score = cross_validate(
    clf3,
    x,
    y,
    scoring=scores,
    cv=RepeatedStratifiedKFold(random_state=50),
    n_jobs=-1,
)
df = pd.DataFrame(score)
df.mean().mul(100)

fit_time             0.727618
score_time           1.972428
test_accuracy       94.136084
test_recall         92.345455
test_specificity    94.469298
test_gmean          93.316690
test_roc            93.407376
test_precision      76.614852
test_mcc            80.638286
test_f1             83.260466
dtype: float64

In [22]:
# clf4 (neighbor, mode 2): repeated stratified k-fold CV (seed 50), mean of every column x100.
# NOTE: the x100 scaling also applies to fit_time / score_time, so those rows are not seconds.
score = cross_validate(
    clf4,
    x,
    y,
    scoring=scores,
    cv=RepeatedStratifiedKFold(random_state=50),
    n_jobs=-1,
)
df = pd.DataFrame(score)
df.mean().mul(100)

fit_time             0.681502
score_time           1.954055
test_accuracy       95.029412
test_recall         92.345455
test_specificity    95.526316
test_gmean          93.845368
test_roc            93.935885
test_precision      80.074822
test_mcc            83.009689
test_f1             85.380859
dtype: float64

In [23]:
# clf5 (neighbor, mode 3): repeated stratified k-fold CV (seed 50), mean of every column x100.
# NOTE: the x100 scaling also applies to fit_time / score_time, so those rows are not seconds.
score = cross_validate(
    clf5,
    x,
    y,
    scoring=scores,
    cv=RepeatedStratifiedKFold(random_state=50),
    n_jobs=-1,
)
df = pd.DataFrame(score)
df.mean().mul(100)

fit_time             0.719196
score_time           2.061048
test_accuracy       95.387621
test_recall         92.345455
test_specificity    95.948622
test_gmean          94.049102
test_roc            94.147038
test_precision      81.848782
test_mcc            84.127716
test_f1             86.329800
dtype: float64

In [27]:
# clf6 (method="mst"): repeated stratified k-fold CV (seed 50), mean of every column x100.
# NOTE: the x100 scaling also applies to fit_time / score_time, so those rows are not seconds.
score = cross_validate(
    clf6,
    x,
    y,
    scoring=scores,
    cv=RepeatedStratifiedKFold(random_state=50),
    n_jobs=-1,
)
df = pd.DataFrame(score)
df.mean().mul(100)

fit_time             0.367602
score_time           2.113750
test_accuracy       94.373573
test_recall         92.345455
test_specificity    94.749373
test_gmean          93.454848
test_roc            93.547414
test_precision      77.628586
test_mcc            81.313980
test_f1             83.858874
dtype: float64