In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [2]:
# Load the training labels. The frame displayed further down has an
# `image_name` column and a `tags` column of space-separated label names.
df = pd.read_csv('../data/train.csv')

In [None]:
df

In [26]:
# Pass the class list explicitly so the one-hot columns follow this order
# (the four sky-condition tags first, then the remaining tags) rather than
# whatever order the binarizer would derive on its own.
# NOTE(review): 'artisinal_mine' (sic) shows up with the same spelling in the
# `mlb.classes_` output further down, so it presumably matches the data --
# confirm against train.csv before "correcting" it.
mlb = MultiLabelBinarizer(classes = [
    'clear', 'cloudy', 'haze','partly_cloudy',
    'agriculture','artisinal_mine','bare_ground','blooming',
    'blow_down','conventional_mine','cultivation','habitation',
    'primary','road','selective_logging','slash_burn','water'
    ])

In [27]:
# Encode the space-separated tag strings as a 0/1 indicator matrix:
# one row per image, one column per class handed to `mlb`.
df_bin = mlb.fit_transform(df['tags'].str.split(" "))

In [None]:
df_bin

In [None]:
# Without `axis`, np.unique flattens the matrix, so this only reports how many
# 0s and 1s it contains overall. Counting distinct rows (label combinations)
# needs np.unique(..., axis=0), which is available from NumPy 1.13 onwards.
np.unique(df_bin, return_counts=True)

In [None]:
# Count how often each distinct label combination (row of df_bin) occurs.
# np.unique(axis=0) (NumPy >= 1.13) replaces the former structured-dtype view
# workaround: it returns the unique rows in the same lexicographic order and
# with df_bin's dtype, and does not require df_bin to be C-contiguous.
uniq_labels, uniq_counts = np.unique(df_bin, axis=0, return_counts=True)

In [None]:
uniq_labels

In [None]:
uniq_counts

In [None]:
# One row per distinct label combination: the combination decoded back to its
# tag names by the binarizer, alongside the number of images that carry it.
df_stats = pd.DataFrame({'tags':mlb.inverse_transform(uniq_labels),'freq':uniq_counts})

In [None]:
df_stats

In [None]:
from torch.utils.data.sampler import Sampler
class SubsetSequentialSampler(Sampler):
    """Yield a fixed collection of indices in exactly the order supplied.

    Every pass over the sampler walks ``indices`` from first to last; nothing
    is shuffled, so repeated iterations produce the same sequence.

    Arguments:
        indices (list): the indices to hand out
    """

    def __init__(self, indices):
        self.indices = indices
        # Length is captured once, at construction time.
        self.num_samples = len(indices)

    def __len__(self):
        return self.num_samples

    def __iter__(self):
        return iter(self.indices)

In [None]:
class WeightedRandomSampler(Sampler):
    """Samples elements from [0,..,len(weights)-1] with given probabilities (weights).

    Arguments:
        weights (sequence): a sequence of non-negative weights, not necessarily
            summing up to one
        num_samples (int): number of samples to draw per iteration
        replacement (bool): if ``True`` (default) an index may be drawn more
            than once; if ``False`` each index is drawn at most once, so
            ``num_samples`` must not exceed the number of non-zero weights
    """

    def __init__(self, weights, num_samples, replacement=True):
        # as_tensor accepts lists, NumPy arrays and existing tensors alike and
        # converts to float64, which the legacy DoubleTensor constructor did
        # not do for every input type.
        self.weights = torch.as_tensor(weights, dtype=torch.double)
        self.num_samples = num_samples
        self.replacement = replacement

    def __iter__(self):
        drawn = torch.multinomial(self.weights, self.num_samples, self.replacement)
        # Yield plain Python ints rather than 0-dim tensors so the indices can
        # be used directly with lists, pandas `.iloc`, and similar containers.
        return iter(drawn.tolist())

    def __len__(self):
        return self.num_samples

In [None]:
# Preview: render every unique one-hot row as a single string so that it can
# later serve as a join key between dataframes.
np.apply_along_axis(np.array_str, 1, uniq_labels)

In [None]:
# Add a string rendering of each image's one-hot label vector; it is the key
# used to join against the per-combination statistics.
df['tags_encode'] = np.apply_along_axis(
    np.array_str, 1, mlb.transform(df['tags'].str.split(" "))
)

In [None]:
df

In [None]:
# Same string rendering for the unique label combinations, so both frames
# share a `tags_encode` key column.
df_stats['tags_encode'] = np.apply_along_axis(np.array_str, 1, uniq_labels)
                        

In [None]:
df_stats

In [None]:
df

In [None]:
# Attach to every image row the frequency of its exact label combination by
# joining on the string-encoded one-hot vector; how='left' keeps all rows of df.
# NOTE(review): both frames also carry a `tags` column, so the result should
# contain suffixed `tags_x` / `tags_y` columns -- confirm this is intended.
df_freq = df.merge(df_stats,how='left',on='tags_encode')

In [None]:
import torch

In [None]:
# Inverse-frequency weight per image: rows with a rare label combination get a
# large weight. `np.float` was only an alias of the builtin `float` and has
# been removed from NumPy (1.24), so the builtin is used directly.
weights = 1 / df_freq['freq'].astype(float)

In [None]:
weights

In [None]:
# NOTE(review): among the cells visible here, `split` is only assigned in a
# later cell (`split = floor(0.2 * length)`), so this cell raises NameError on
# a fresh Restart & Run All -- it should be moved below that assignment.
df_freq.iloc[:split]

In [None]:
length = len(df_freq.index)
# Candidate row positions 0..length-1. Starting the range at 1 silently kept
# row 0 out of the shuffle, so it could never land in the held-out split.
indices = list(range(length))

In [None]:
# Size of the 20% split. `floor` is not imported in any cell visible here
# (NameError on a fresh kernel); int() truncation equals floor for this
# non-negative value and always yields an int usable as a slice bound.
split = int(0.2 * length)

In [None]:
split

In [None]:
import random

In [None]:
# Shuffle with a dedicated fixed-seed generator so the split is identical on
# every Restart & Run All; the global `random` state is left untouched.
random.Random(42).shuffle(indices)

In [None]:
# Zero the weight of the first `split` shuffled rows (presumably so they are
# held out and never drawn by a weighted sampler -- confirm). `indices` holds
# row positions, so the positional indexer is used explicitly; plain `[]` with
# an integer list is label-based and only coincides with positions on a
# default RangeIndex.
weights.iloc[indices[:split]] = 0

In [None]:
weights[weights==0]

In [7]:
def balance_weights(df_source, col_target, mlb):
    """ Compute balanced (inverse-frequency) weights from a Multilabel dataframe

    Each row receives the weight ``1 / n``, where ``n`` is the number of rows
    of ``df_source`` that carry exactly the same combination of labels, so
    rare combinations get large weights and common ones small weights.

    Arguments:
        df_source: Dataframe holding the labels; it is not modified
        col_target: The name of the column with the target labels, stored as
            space-separated strings
        mlb: A fitted MultiLabelBinarizer (or any object with a compatible
            ``transform``) to one-hot-encode the label column

    Returns:
        A Pandas Series of float weights named 'freq', one per row of
        ``df_source`` in row order, with a fresh 0..n-1 index
    """
    # One-hot encode the labels: one row per sample, one column per class
    labels = np.asarray(mlb.transform(df_source[col_target].str.split(" ")))

    # Group identical rows directly (np.unique supports `axis` since NumPy 1.13).
    # `inverse[i]` is the group of row i and `counts[g]` the size of group g,
    # which replaces the former structured-dtype view and string-key merge.
    _, inverse, counts = np.unique(
        labels, axis=0, return_inverse=True, return_counts=True
    )
    # Some NumPy 2.0 builds return `inverse` with an extra dimension when
    # `axis` is given; flatten so the lookup below is always one value per row.
    inverse = np.ravel(inverse)

    # True division by the integer counts yields float64 weights
    # (`np.float`, used previously, was removed in NumPy 1.24).
    weights = 1.0 / counts[inverse]

    return pd.Series(weights, name='freq')

In [12]:
weights = balance_weights(df,'tags',mlb)

In [9]:
df

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road
5,train_5,haze primary water
6,train_6,agriculture clear cultivation primary water
7,train_7,haze primary
8,train_8,agriculture clear cultivation primary
9,train_9,agriculture clear cultivation primary road


In [14]:
np.unique(weights, return_counts=True)

(array([  7.33352889e-05,   2.75558005e-04,   5.08130081e-04,
          5.40540541e-04,   5.95238095e-04,   6.15006150e-04,
          6.80735194e-04,   8.42459983e-04,   8.88888889e-04,
          1.40449438e-03,   1.44508671e-03,   1.68067227e-03,
          1.89753321e-03,   1.98412698e-03,   2.11864407e-03,
          2.22717149e-03,   2.57069409e-03,   3.18471338e-03,
          3.20512821e-03,   3.54609929e-03,   3.89105058e-03,
          4.03225806e-03,   4.16666667e-03,   4.25531915e-03,
          4.40528634e-03,   5.12820513e-03,   5.15463918e-03,
          5.81395349e-03,   5.91715976e-03,   6.21118012e-03,
          6.89655172e-03,   7.09219858e-03,   7.35294118e-03,
          7.69230769e-03,   7.93650794e-03,   8.13008130e-03,
          8.69565217e-03,   8.92857143e-03,   9.43396226e-03,
          9.52380952e-03,   9.70873786e-03,   9.80392157e-03,
          1.08695652e-02,   1.12359551e-02,   1.16279070e-02,
          1.36986301e-02,   1.38888889e-02,   1.44927536e-02,
        

In [15]:
13636+3629+1968+1850+1680+1626+1469+1187+1125

28170

In [16]:
weights * 10000

0            6.807352
1           14.044944
2            0.733353
3            0.733353
4            8.888889
5           31.847134
6           44.052863
7            6.807352
8            8.424600
9           25.706941
10       10000.000000
11           5.405405
12           5.081301
13           0.733353
14           5.081301
15           0.733353
16           0.733353
17           2.755580
18           0.733353
19           5.952381
20          14.044944
21          19.841270
22           2.755580
23           5.952381
24         833.333333
25           0.733353
26          38.910506
27          51.282051
28           0.733353
29           2.755580
             ...     
40449        5.405405
40450       81.300813
40451        0.733353
40452        0.733353
40453      666.666667
40454        0.733353
40455    10000.000000
40456        0.733353
40457      666.666667
40458        5.952381
40459        5.405405
40460        0.733353
40461        5.081301
40462        0.733353
40463     

In [19]:
np.unique(np.clip(weights,0.02,0.2), return_counts=True)

(array([ 0.02      ,  0.02173913,  0.02272727,  0.02439024,  0.02564103,
         0.02631579,  0.02777778,  0.02941176,  0.03030303,  0.03125   ,
         0.03225806,  0.03333333,  0.03448276,  0.03703704,  0.03846154,
         0.04      ,  0.04166667,  0.04347826,  0.04761905,  0.05      ,
         0.05263158,  0.05882353,  0.0625    ,  0.06666667,  0.07142857,
         0.07692308,  0.08333333,  0.09090909,  0.1       ,  0.11111111,
         0.125     ,  0.14285714,  0.16666667,  0.2       ]),
 array([38252,    46,    44,   164,    39,    38,    36,    68,    33,
           64,    31,    30,    29,    81,    26,    25,    48,    23,
           42,    20,    38,    51,    64,   120,    28,    78,    48,
           66,    70,    81,    72,    70,    48,   506]))

In [18]:
mlb.classes_

array(['agriculture', 'artisinal_mine', 'bare_ground', 'blooming',
       'blow_down', 'clear', 'cloudy', 'conventional_mine', 'cultivation',
       'habitation', 'haze', 'partly_cloudy', 'primary', 'road',
       'selective_logging', 'slash_burn', 'water'], dtype=object)

In [28]:
df_bin

array([[0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [29]:
df_bin.sum(axis=0)

array([28203,  2330,  2695,  7251, 12338,   339,   859,   332,    98,
         100,  4477,  3662, 37840,  8076,   340,   209,  7262])