In [7]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
def IRLbl(dataset, label, labelset):
    ''' Imbalance ratio of a single label.

    Divides the largest column total found among the `labelset` columns by
    the column total of `label`. `dataset` has to contain one dummy
    (indicator) column per label already.
    '''
    label_totals = dataset.loc[:, labelset].sum()
    majority_total = max(label_totals)
    own_total = dataset.loc[:, label].sum()
    return majority_total / own_total

In [3]:
def MeanIR(dataset, labelset):
    ''' Mean imbalance ratio over all labels in `labelset`.

    The imbalance ratio of a label is the largest label column total divided
    by that label's own column total (the same quantity IRLbl returns).
    The column totals are computed once and divided in a single vectorised
    step, rather than re-summing every label column once per label.

    dataset  -- DataFrame with one dummy (indicator) column per label
    labelset -- names of the label columns

    Returns the arithmetic mean of the per-label ratios (nan for an empty
    labelset).
    '''
    import numpy as np
    label_totals = dataset.loc[:, labelset].sum()
    ratios = label_totals.max() / label_totals
    # Average the raw array: np.mean on a Series would silently skip NaN
    # ratios (0/0), whereas a plain array propagates them.
    return np.mean(ratios.to_numpy())

In [19]:
def MLROS(dataset, labels, percentage, batch_size = 100):
    ''' Randomly oversamples the rows of under-represented labels.

    A label counts as a minority label when its IRLbl exceeds the MeanIR of
    the incoming dataset (computed once, up front). In every round,
    batch_size rows are drawn with replacement from the rows of each
    remaining minority label and added to the end of the dataset. A label
    stops being cloned once its IRLbl has dropped to the initial MeanIR or
    below. Progress is printed after every round as 'cloned/target'.

    dataset    -- DataFrame with one dummy (0/1) column per label
    labels     -- names of the label columns
    percentage -- clone budget, as a percentage of the original row count
    batch_size -- rows drawn per minority label and round

    Returns the dataset followed by the clones, re-indexed from 0. The input
    object itself is returned when nothing has to be cloned. Only whole
    rounds are added, so the budget can be exceeded by less than one round.

    Raises ValueError if cloning is required but batch_size is below 1
    (a batch size of 0 would otherwise never make progress).
    '''
    from numpy import random

    samplesToClone = int(len(dataset) * percentage / 100)
    mir = MeanIR(dataset, labels)

    # (label, pool) pairs: pool holds every row carrying the minority label,
    # re-indexed so that random positions can be drawn from it directly.
    cloners = []
    for label in labels:
        if IRLbl(dataset, label, labels) > mir:
            pool = dataset.loc[dataset[label] == 1].reset_index(drop = True)
            if len(pool) > 0:  # nothing to draw from an empty pool
                cloners.append((label, pool))

    if cloners and samplesToClone > 0 and batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    cloneCount = 0

    while cloneCount < samplesToClone and len(cloners) > 0:
        # DataFrame.append no longer exists in pandas 2.x; pd.concat is the
        # supported way to stack the sampled batches.
        clones = pd.concat(
            [pool.iloc[random.choice(len(pool), batch_size, replace = True)]
             for _, pool in cloners]
        )
        cloneCount += batch_size * len(cloners)
        print(str(cloneCount) + '/' + str(samplesToClone))

        dataset = pd.concat([dataset, clones], ignore_index = True)

        # Retire every label whose ratio has reached the initial mean.
        cloners = [
            (label, pool) for label, pool in cloners
            if not IRLbl(dataset, label, labels) <= mir
        ]

    return dataset

In [9]:
# Load the article table from a CSV file addressed relative to the notebook's directory.
articles = pd.read_csv('../data/arxiv_math.csv')

In [8]:
def get_math_categories(categories):
    ''' Extracts the math sub-category codes from a stringified list.

    `categories` is parsed with ast.literal_eval; every entry that begins
    with 'math.' is kept with that prefix removed (e.g. 'math.AG' -> 'AG'),
    all other entries are dropped. Order is preserved.
    '''
    import ast
    prefix = 'math.'
    cut = len(prefix)
    math_codes = []
    for entry in ast.literal_eval(categories):
        if entry[:cut] == prefix:
            math_codes.append(entry[cut:])
    return math_codes

In [10]:
# Parse each row's `categories` string and keep only its math sub-category codes in a new list-valued column.
articles['math_categories'] = articles.categories.apply(get_math_categories)

In [11]:
# Binarizer that is fitted on the per-article category lists in a later cell.
mlb = MultiLabelBinarizer()

In [12]:
# Append one indicator column per category (named after mlb.classes_) to `articles`, aligned on its index.
# NOTE: `articles` is reassigned here, so re-running this cell adds the indicator columns a second time.
articles = pd.concat([articles,pd.DataFrame(mlb.fit_transform(articles.math_categories),columns=mlb.classes_, index=articles.index)], axis = 1)

In [13]:
# Imbalance ratio of the 'KT' label relative to the largest label column.
IRLbl(articles, 'KT', mlb.classes_)

13.635794743429287

In [14]:
# Imbalance ratio of the 'AG' label relative to the largest label column.
IRLbl(articles, 'AG', mlb.classes_)

1.5697037805440295

In [15]:
# Mean imbalance ratio across all label columns; MLROS clones rows of labels whose ratio lies above this value.
MeanIR(articles, mlb.classes_)

6.282509649885996

In [20]:
# Oversample the minority labels with a clone budget of 25% of the row count (default batch size).
# The result holds the original rows followed by the clones; progress is printed as 'cloned/target'.
clones = MLROS(articles, mlb.classes_, 25)

1000/96111
2000/96111
3000/96111
4000/96111
4900/96111
5800/96111
6700/96111
7600/96111
8500/96111
9300/96111
10100/96111
10900/96111
11700/96111
12500/96111
13300/96111
14100/96111
14900/96111
15700/96111
16500/96111
17300/96111
18100/96111
18900/96111
19600/96111
20300/96111
21000/96111
21700/96111
22400/96111
23000/96111
23500/96111
24000/96111
24500/96111
25000/96111
25500/96111
26000/96111
26500/96111
27000/96111
27500/96111
28000/96111
28400/96111
28800/96111
29200/96111
29600/96111
30000/96111
30400/96111
30800/96111
31200/96111
31600/96111
31900/96111
32200/96111
32500/96111
32800/96111
33100/96111
33400/96111
33700/96111
34000/96111
34300/96111
34600/96111
34800/96111
35000/96111
35200/96111
35400/96111
35600/96111
35800/96111
36000/96111
36200/96111
36400/96111
36500/96111
36600/96111
