In [1]:
# Reload imported modules automatically before each cell is executed, so
# edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# -*- coding: utf-8 -*-
# Importing required Library
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors

In [15]:
def create_dataset(n_samples=1000):
    '''
    Generate an imbalanced synthetic classification data set.

    The samples come from sklearn's make_classification, configured for
    3 classes and 4 features with a fixed random_state of 10. The class
    ids are then one-hot encoded with pandas.get_dummies.

    args
    n_samples: int, number of samples requested from make_classification

    return
    X: pandas.DataFrame, feature vector dataframe with 4 features
    y: pandas.DataFrame, one-hot target dataframe, columns prefixed 'class'
    '''
    generator_options = dict(
        n_samples=n_samples,
        n_features=4,
        n_informative=2,
        n_redundant=1,
        n_classes=3,
        n_clusters_per_class=1,
        weights=[0.1, 0.008, 0.9],
        class_sep=2,
        flip_y=0,
        random_state=10,
    )
    features, class_ids = make_classification(**generator_options)
    one_hot = pd.get_dummies(class_ids, prefix='class')
    return pd.DataFrame(features), one_hot

In [43]:
# Build a 200-sample demo data set (features X, one-hot targets y).
X,y = create_dataset(200)

In [44]:
# Column-wise sum of the one-hot targets, i.e. the number of samples per class.
y.sum()

class_0     20
class_1      1
class_2    179
dtype: int64

In [45]:
# Run the project's MultiSmote implementation on the demo data.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from chest_xray_diagnosis.multi_smote import MultiSmote as mlsmote

smote = mlsmote()
# presumably returns the oversampled features and targets — verify against MultiSmote.multi_smote
new_x, new_y = smote.multi_smote(X, y)

The number of the unique samples from the minority class is small, cannot find neighbors for this minority class.
Aborting for class 1
The number of the unique samples from the minority class is small, cannot find neighbors for this minority class.
Aborting for class 1


In [46]:
# Inspect the feature frame returned by multi_smote.
new_x

Unnamed: 0,0,1,2,3
0,1.391309,-1.011854,0.467130,1.655687
1,2.123991,-0.143775,0.815549,2.635315
2,3.034725,-0.075184,0.455584,3.018921
3,1.808511,-0.385582,0.384815,1.918270
4,1.008723,1.734194,0.561137,1.434371
...,...,...,...,...
195,1.530626,0.762254,0.149734,1.438465
196,1.220917,-1.627273,0.209859,1.242505
197,2.079223,0.686660,-0.033533,1.704838
198,2.266577,0.839176,-2.845023,-1.095311


In [47]:
# Same frame with exact duplicate rows removed (result is displayed only, new_x is not modified).
new_x.drop_duplicates()

Unnamed: 0,0,1,2,3
0,1.391309,-1.011854,0.467130,1.655687
1,2.123991,-0.143775,0.815549,2.635315
2,3.034725,-0.075184,0.455584,3.018921
3,1.808511,-0.385582,0.384815,1.918270
4,1.008723,1.734194,0.561137,1.434371
...,...,...,...,...
195,1.530626,0.762254,0.149734,1.438465
196,1.220917,-1.627273,0.209859,1.242505
197,2.079223,0.686660,-0.033533,1.704838
198,2.266577,0.839176,-2.845023,-1.095311


In [48]:
# Count how often each index label occurs in new_x, to spot repeated labels.
np.unique(new_x.index, return_counts=True)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177

In [20]:
# Rows of new_x that exactly repeat an earlier row.
# NOTE(review): the saved output lists ten feature columns (0-9), unlike the
# four-column frame displayed earlier in this notebook, so it appears to come
# from an older run — re-run before relying on it.
new_x[new_x.duplicated()]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9


In [18]:
# Distinct index labels of the oversampled targets.
# NOTE(review): this cell's execution count is out of order with the cells
# above, so the saved output may be stale — re-run to confirm.
new_y.index.unique()

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            490, 491, 492, 493, 494, 495, 496, 497, 498, 499],
           dtype='int64', length=500)

In [13]:
# Per-class totals of the oversampled targets.
# NOTE(review): the saved output shows five classes while the data set built
# earlier in this notebook has three, so it appears to come from an older run.
new_y.sum()

class_0    332
class_1    332
class_2    332
class_3    332
class_4    332
dtype: int64

In [10]:
# Per-class totals of the original targets.
# NOTE(review): the saved output shows five classes while the data set built
# earlier in this notebook has three, so it appears to come from an older run.
y.sum(axis=0)

class_0    100
class_1     25
class_2    205
class_3      8
class_4    662
dtype: int64

In [11]:
# Inspect the feature frame.
# NOTE(review): the saved output has ten feature columns, unlike the
# four-feature data set built earlier in this notebook — likely a stale output.
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.700263,0.602477,-2.478509,0.433484,1.109780,-0.246891,0.525389,-1.902413,1.991422,-1.848972
1,2.267874,-0.350847,2.187252,-0.769722,0.601022,-0.234049,-1.844444,2.092082,2.299990,-2.585919
2,-1.207028,0.230167,-2.035403,-0.401339,0.595955,2.066251,-2.311038,-1.804673,2.197500,-2.106087
3,-1.326205,-1.417870,2.085587,-0.453313,0.457689,0.868153,0.784653,2.348514,2.546447,-2.829462
4,-0.781795,1.495530,-2.194913,-0.264658,0.906237,-0.633937,-0.907535,-2.706511,1.255595,-1.120307
...,...,...,...,...,...,...,...,...,...,...
995,-2.287050,-0.298746,1.586469,-0.464742,1.103363,1.050799,0.960210,-2.157864,2.600376,-2.887021
996,-0.027245,1.842409,1.793464,0.013021,0.980886,-0.037242,-0.309943,-2.104098,-2.168505,2.055803
997,0.833553,-0.130508,-1.954786,-0.074340,-0.410230,0.837448,-1.627014,-1.333642,2.935449,-2.876682
998,-1.088050,-0.213819,-2.207270,-1.106803,-1.096291,0.722919,0.833781,-2.433188,1.347741,-1.211900


In [None]:
def get_tail_label(df):
    """
    Give tail label columns of the given target dataframe

    A label is a tail label when its imbalance ratio (count of the most
    frequent label divided by the label's own count) is strictly above the
    mean imbalance ratio of the labels.

    Positives are counted with ``df.eq(1).sum()`` instead of
    ``value_counts()[1]``. This works for 0/1 integer columns as well as
    boolean columns (True == 1), and a label without any positive row no
    longer raises KeyError. Labels without any positive row are left out of
    the mean and are never reported as tail labels, because there is nothing
    to oversample from.

    args
    df: pandas.DataFrame, target label df whose tail label has to identified

    return
    tail_label: list, a list containing column name of all the tail label
                (empty when df has no columns or no positive entries)
    """
    # Number of positive rows per label, in column order.
    positives = df.eq(1).sum(axis=0).to_numpy(dtype=float)
    observed = positives > 0
    if not observed.any():
        return []
    # Imbalance ratio per label; stays NaN for labels that never occur.
    irpl = np.full(positives.shape, np.nan)
    irpl[observed] = positives[observed].max() / positives[observed]
    mir = irpl[observed].mean()
    return [
        column
        for column, ratio, seen in zip(df.columns, irpl, observed)
        if seen and ratio > mir
    ]

In [18]:


def create_dataset(n_sample=1000):
    ''' 
    Create a unevenly distributed sample data set multilabel  
    classification using make_classification function
    
    args
    n_sample: int, Number of sample to be created
    
    return
    X: pandas.DataFrame, feature vector dataframe with 10 features 
    y: pandas.DataFrame, target vector dataframe with 5 labels
    '''
    # n_samples now follows the n_sample argument; it used to be hard-coded
    # to 1000, which silently ignored the caller's value.
    X, y = make_classification(n_classes=5, class_sep=2, 
                           weights=[0.1,0.025, 0.205, 0.008, 0.9], n_informative=3, n_redundant=1, flip_y=0,
                           n_features=10, n_clusters_per_class=1, n_samples=n_sample, random_state=10)
    # One-hot encode the class id into indicator columns prefixed 'class'.
    y = pd.get_dummies(y, prefix='class')
    return pd.DataFrame(X), y

def get_tail_label(df):
    """
    Give tail label columns of the given target dataframe

    A label is a tail label when its imbalance ratio (count of the most
    frequent label divided by the label's own count) is strictly above the
    mean imbalance ratio of the labels.

    Positives are counted with ``df.eq(1).sum()`` instead of
    ``value_counts()[1]``. This works for 0/1 integer columns as well as
    boolean columns (True == 1), and a label without any positive row no
    longer raises KeyError. Labels without any positive row are left out of
    the mean and are never reported as tail labels, because there is nothing
    to oversample from.

    args
    df: pandas.DataFrame, target label df whose tail label has to identified

    return
    tail_label: list, a list containing column name of all the tail label
                (empty when df has no columns or no positive entries)
    """
    # Number of positive rows per label, in column order.
    positives = df.eq(1).sum(axis=0).to_numpy(dtype=float)
    observed = positives > 0
    if not observed.any():
        return []
    # Imbalance ratio per label; stays NaN for labels that never occur.
    irpl = np.full(positives.shape, np.nan)
    irpl[observed] = positives[observed].max() / positives[observed]
    mir = irpl[observed].mean()
    return [
        column
        for column, ratio, seen in zip(df.columns, irpl, observed)
        if seen and ratio > mir
    ]

def get_index(df):
    """
    Collect the index labels of every row that carries a tail label

    args
    df: pandas.DataFrame, target label df from which index for tail label has to identified

    return
    index: list, index labels of the rows where at least one tail label equals 1
    """
    tail_rows = set()
    for label in get_tail_label(df):
        # Rows in which this tail label is switched on.
        is_positive = df[label] == 1
        tail_rows = tail_rows | set(df[is_positive].index)
    return list(tail_rows)

def get_minority_instace(X, y):
    """
    Extract the rows of X and y that carry at least one tail label

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    (both are re-indexed from 0 after filtering)
    """
    tail_rows = get_index(y)
    keep_features = X.index.isin(tail_rows)
    keep_targets = y.index.isin(tail_rows)
    X_sub = X[keep_features].reset_index(drop=True)
    y_sub = y[keep_targets].reset_index(drop=True)
    return X_sub, y_sub

def nearest_neighbour(X, n_neighbors=5):
    """
    Give index of the nearest neighbors of all the instance
    
    args
    X: np.array, array whose nearest neighbor has to find
    n_neighbors: int, number of neighbors returned per instance (default 5,
                 the value that used to be hard-coded)
    
    return
    indices: np.ndarray of shape (len(X), n_neighbors), positional index of
             the nearest neighbors of each element in X. Because X is also
             the fitted data, each row normally starts with the instance
             itself.
    """
    nbs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean', algorithm='kd_tree').fit(X)
    # kneighbors returns (distances, indices); only the indices are needed.
    _, indices = nbs.kneighbors(X)
    return indices

def MLSMOTE(X,y, n_sample):
    """
    Give the augmented data using MLSMOTE algorithm

    For every synthetic sample a random reference row and one of its nearest
    neighbours are picked. The new feature vector is interpolated between
    the two, and a label is set when more than half of the neighbourhood
    (reference included) carries it.

    args
    X: pandas.DataFrame, input vector DataFrame
    y: pandas.DataFrame, target vector dataframe, row-aligned with X
    n_sample: int, number of newly generated sample
    
    return
    new_X: pandas.DataFrame, augmented feature vector data (X followed by the synthetic rows)
    target: pandas.DataFrame, augmented target vector data (y followed by the synthetic rows)

    The index is not reset: the synthetic rows are labelled 0..n_sample-1 and
    therefore overlap with the labels of the original rows. Call
    reset_index(drop=True) on the results if unique labels are required.
    """
    indices2 = nearest_neighbour(X)
    n = len(indices2)
    # The neighbour indices are positions, not index labels, so work on plain
    # arrays instead of .loc / index.isin (those only worked on a RangeIndex).
    features = X.to_numpy()
    labels = y.to_numpy()
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0,n-1)
        all_point = indices2[reference]
        neighbour = random.choice(all_point[1:])
        # Majority vote over the neighbourhood; for 5 neighbours this equals
        # the former "more than 2" rule.
        votes = labels[all_point].sum(axis=0)
        target[i] = votes > len(all_point) / 2
        ratio = random.random()
        # Interpolate from the reference towards the neighbour. The previous
        # reference - neighbour gap moved the point away from the neighbour.
        gap = features[neighbour] - features[reference]
        new_X[i] = features[reference] + ratio * gap
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    new_X = pd.concat([X, new_X], axis=0)
    target = pd.concat([y, target], axis=0)
    return new_X, target

In [21]:
X, y = create_dataset()                     # Create the synthetic feature and target dataframes
X_sub, y_sub = get_minority_instace(X, y)   # Keep only the rows that carry a tail (minority) label
X_res,y_res =MLSMOTE(X_sub, y_sub, 1000)     # Augment the minority subset with 1000 synthetic samples

In [22]:
# Per-label totals after augmentation (original minority rows plus the synthetic rows).
y_res.sum(axis=0)

class_0      0.0
class_1    792.0
class_2      0.0
class_3    241.0
class_4      0.0
dtype: float64