In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import tqdm

'''

Marginal distribution weight shifting

'''

# Approximate frequency of each (bin, class) pair, keyed '<bin>_<class>'.
# build_prior_table() sends the 'g' prefix to column 0 and any other prefix
# ('eg' here) to column 1, and uses the integer after the underscore as the class id.
# NOTE(review): presumably 'g' = galactic and 'eg' = extragalactic, and the values
# are joint probabilities p(class AND bin); their provenance is not shown here - confirm.
approx_freqs = {
    'g_99' : 0.002350,
    'g_90' : 0.001841,
    'eg_99' : 0.091153,
    'eg_90' : 0.086091,
    'g_95' : 0.001949,
    'g_92' : 0.002357,
    'eg_95' : 0.090235,
    'eg_92' : 0.000005,
    'g_88' : 0.002583,
    'g_67' : 0.002014,
    'eg_88' : 0.082226,
    'eg_67' : 0.083775,
    'g_65' : 0.030996,
    'g_64' : 0.001612,
    'eg_65' : 0.003594,
    'eg_64' : 0.094024,
    'g_62' : 0.001978,
    'g_53' : 0.002321,
    'eg_62' : 0.090185,
    'eg_53' : 0.000003,
    'g_52' : 0.002201,
    'g_42' : 0.002337,
    'eg_52' : 0.083214,
    'eg_42' : 0.082840,
    'g_16' : 0.030782,
    'g_15' : 0.001621,
    'eg_16' : 0.003579,
    'eg_15' : 0.093990,
    'g_6' : 0.024859,
    'eg_6' : 0.003284,
}

# Submission to re-weight, as a plain 2-D array
# (the functions below skip column 0 and treat the remaining columns as class probabilities)
sub = pd.read_csv(filepath_or_buffer='../subs/sub_v3_isband_0.7691.csv').to_numpy()

# Per-row rs_bin labels (the functions below separate value 0 from every other value)
rs_bins = np.load(file='../data/rs_bins.npy')

In [2]:
def build_prior_table(approx_freqs_dict):
    """Turn a '<bin>_<class>' frequency dict into a per-bin class prior table.

    Parameters
    ----------
    approx_freqs_dict : dict
        Maps keys such as 'g_90' / 'eg_90' to a frequency. The text before the
        underscore picks the column ('g' -> 0, anything else -> 1) and the
        integer after it is the class id.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_classes, 2) with rows in ascending class-id order.
        Each column is divided by its own sum, i.e. it holds p(class | bin).
    """
    # Parse every key once into (bin prefix, integer class id, frequency).
    entries = []
    for key, freq in approx_freqs_dict.items():
        pieces = key.split('_')
        entries.append((pieces[0], int(pieces[1]), freq))

    # np.unique returns the distinct class ids already sorted; position == table row.
    class_ids = np.unique(np.array([class_id for _, class_id, _ in entries]))
    row_of = dict(zip(class_ids.tolist(), range(class_ids.shape[0])))

    joint = np.zeros((class_ids.shape[0], 2))
    for prefix, class_id, freq in entries:
        column = 1 if prefix != 'g' else 0
        joint[row_of[class_id], column] = freq

    # joint holds p(class AND bin); dividing by the column totals gives p(class | bin).
    return joint / joint.sum(axis=0)

# Fixed-point printing with 6 decimals for the tables displayed below
np.set_printoptions(suppress=True, precision=6)
prior_table = build_prior_table(approx_freqs_dict=approx_freqs)
# Both columns are normalised to 1, so the grand total should be 2
prior_table.sum()

2.0

In [3]:
# Build submission table
def build_sub_table(sub_, rs_bins_):
    """Average the submitted class probabilities separately for each bin group.

    Parameters
    ----------
    sub_ : numpy.ndarray
        2-D submission array; column 0 is skipped, every other column is averaged.
    rs_bins_ : numpy.ndarray
        One label per row of ``sub_``; rows labelled 0 form the first group and
        all remaining rows form the second.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_columns - 1, 2): column 0 holds the means over the
        rows with label 0, column 1 the means over the other rows.
    """
    zero_bin_rows = rs_bins_ == 0
    other_bin_rows = rs_bins_ != 0
    group_means = [
        np.mean(sub_[row_mask, 1:], axis=0)
        for row_mask in (zero_bin_rows, other_bin_rows)
    ]
    # Stack as (2, n_classes) and transpose so that classes run down the rows.
    return np.vstack(group_means).T

sub_table = build_sub_table(sub_=sub, rs_bins_=rs_bins)
# Two columns of mean class probabilities, so the total is expected to be close to 2
sub_table.sum()

1.9999999992224073

In [4]:
# Class ids as a column vector so they can be placed next to the two tables
classes = np.array([6,15,16,42,52,53,62,64,65,67,88,90,92,95,99], dtype=int)[:, None]
# Columns: class id | sub_table (bin 0, other bins) | prior_table (bin 0, other bins)
np.hstack((classes, sub_table, prior_table))

array([[ 6.      ,  0.021896,  0.000935,  0.22235 ,  0.003697],
       [15.      ,  0.00268 ,  0.088455,  0.014499,  0.105821],
       [16.      ,  0.26784 ,  0.000941,  0.275328,  0.00403 ],
       [42.      ,  0.001669,  0.174097,  0.020903,  0.093267],
       [52.      ,  0.001194,  0.024371,  0.019687,  0.093689],
       [53.      ,  0.002861,  0.000914,  0.02076 ,  0.000003],
       [62.      ,  0.001664,  0.051544,  0.017692,  0.101537],
       [64.      ,  0.001371,  0.009386,  0.014418,  0.105859],
       [65.      ,  0.228594,  0.000911,  0.277243,  0.004046],
       [67.      ,  0.001369,  0.02333 ,  0.018014,  0.09432 ],
       [88.      ,  0.001184,  0.032452,  0.023104,  0.092576],
       [90.      ,  0.002353,  0.472583,  0.016467,  0.096928],
       [92.      ,  0.443573,  0.00097 ,  0.021082,  0.000006],
       [95.      ,  0.000733,  0.016484,  0.017433,  0.101593],
       [99.      ,  0.021019,  0.102627,  0.021019,  0.102627]])

Official weight shift code below

In [4]:
def migrate_sub(prior_table_, sub_table_, sub__, rs_bins_, interp=1.0, progress=True):
    """Move probability mass inside each submission row towards the class priors.

    For every class column except the last one, a row's probability ``p`` is
    compared with the rescaled value ``p * prior / mean_submitted`` (taken from
    column 0 of the tables for rows whose bin is 0, and from column 1 otherwise).
    Classes above that value give their excess away; the excess is handed to the
    classes below it, most confident class first, and never more than a receiver
    is missing. Mass only moves between columns of the same row, so every row
    keeps its total.

    Parameters
    ----------
    prior_table_ : numpy.ndarray
        Target class priors, indexed as ``[class, bin]`` with two bin columns.
    sub_table_ : numpy.ndarray
        Mean submitted probability per class and bin, same layout as
        ``prior_table_`` (see ``build_sub_table``).
    sub__ : numpy.ndarray
        Submission array. Column 0 and the last column are copied through
        untouched; the columns in between are shifted. The input is not modified.
    rs_bins_ : numpy.ndarray
        One bin label per submission row (0 = galactic, anything else =
        extragalactic).
    interp : float, optional
        Fraction of every excess/deficit that is migrated. The default 1.0
        reproduces the full shift; 0.0 returns an unchanged copy.
    progress : bool, optional
        Show a tqdm progress bar over the rows (default True).

    Returns
    -------
    numpy.ndarray
        Copy of ``sub__`` with the shifted probabilities.

    Raises
    ------
    AssertionError
        If a shifted row ends up with a negative probability.
    """
    sub_ = np.copy(sub__)
    probs = sub_[:, 1:-1]  # view on the shifted columns; writes land in sub_

    # Step I - build migration matrix

    # ratio[r, c] = prior / mean submitted probability of class c in the bin of row r.
    # It stays at 1 (nothing to migrate) where the mean submitted probability is 0,
    # which would otherwise be a division by zero.
    ratio = np.ones(probs.shape)
    bin_rows = (rs_bins_ == 0, rs_bins_ != 0)  # bin 0 - galactic, bins 1-9 - extragalactic
    for col_num in range(probs.shape[1]):  # For each class except 99
        for bin_index, rows in enumerate(bin_rows):
            mean_prob = sub_table_[col_num, bin_index]
            if mean_prob != 0:
                ratio[rows, col_num] = prior_table_[col_num, bin_index] / mean_prob

    # > 0: excess the class must give away, < 0: deficit the class may receive.
    # interp scales the amounts themselves, so it really is the fraction migrated.
    migration_matrix = interp * probs * (1 - ratio)

    # Step II - migrate probabilities per row

    row_pairs = zip(probs, migration_matrix)
    if progress:
        row_pairs = tqdm.tqdm(row_pairs, total=sub_.shape[0])

    # For each submission row . . . (sub_line and mm_line are views, edited in place)
    for sub_line, mm_line in row_pairs:

        # Get positive col indexes - classes where probs need to go down
        pos_cols = np.where(mm_line > 0)[0]

        # Get negative col indexes sorted by descending sub confidence
        # This makes sense since we transfer surplus probability to where we are more confident
        neg_cols = np.where(mm_line < 0)[0]
        sorted_ixs = np.argsort(sub_line[neg_cols])[::-1]
        neg_cols = neg_cols[sorted_ixs]

        # For each positive col try to empty it across neg cols
        for pos_col in pos_cols:
            budget = mm_line[pos_col]
            for neg_col in neg_cols:
                if mm_line[neg_col] == 0:  # Neg col already satisfied
                    continue

                budget += mm_line[neg_col]  # Update budget

                if budget <= 0:
                    # The receiver can absorb everything that is left of this surplus
                    sub_line[pos_col] -= mm_line[pos_col]
                    sub_line[neg_col] += mm_line[pos_col]
                    mm_line[neg_col] += mm_line[pos_col]
                    break  # Budget just ran out, next pos col - must break out to pos cols loop
                else:
                    # The receiver is filled completely; carry the rest to the next one
                    sub_line[pos_col] += mm_line[neg_col]
                    sub_line[neg_col] += -mm_line[neg_col]  # - to get abs val
                    mm_line[pos_col] += mm_line[neg_col]  # Update remaining surplus
                    mm_line[neg_col] = 0  # Neg col satisfied

        assert not np.any(sub_line<0)

    return sub_

In [6]:
# Toy check of migrate_sub: 4 rows, 5 class columns, np.inf stands in for the id column
test_sub = np.array(
    [
        [np.inf, .2, .3, .1, .3, .1],
        [np.inf, .3, .3, .2, .1, .1],
        [np.inf, .5, .1, .1, .2, .1],
        [np.inf, .1, .7, .1, .0, .1],
    ]
)

# First two rows are in bin 0, last two in a non-zero bin
test_rs_bins = np.array([0,0,1,1])
test_sub_table = build_sub_table(test_sub, test_rs_bins)
test_prior_table = np.array(
    [
        [0.25-.1, 0.3-.2 ],
        [0.3-.1,  0.4-.1 ],
        [0.15-.1, 0.1 ],
        [0.2+.4,  0.1+.2 ],
        [0.1,  0.1 ],
    ]
)

new_sub = migrate_sub(test_prior_table, test_sub_table, test_sub, test_rs_bins)
print(test_sub)
print(new_sub)

# Fail loudly if migrate_sub regresses instead of relying on eyeballing the printout.
# Expected values are the ones recorded for this toy input (6 printed decimals).
expected_probs = np.array(
    [
        [0.12,     0.2,   0.033333, 0.546667, 0.1],
        [0.18,     0.22,  0.2,      0.3,      0.1],
        [0.166667, 0.075, 0.1,      0.558333, 0.1],
        [0.1,      0.7,   0.1,      0.0,      0.1],
    ]
)
np.testing.assert_allclose(new_sub[:, 1:], expected_probs, atol=1e-6)
# Mass only moves between classes of the same row, so row totals must not change
np.testing.assert_allclose(new_sub[:, 1:].sum(axis=1), test_sub[:, 1:].sum(axis=1))

100%|██████████| 4/4 [00:00<00:00, 6620.84it/s]

[[inf 0.2 0.3 0.1 0.3 0.1]
 [inf 0.3 0.3 0.2 0.1 0.1]
 [inf 0.5 0.1 0.1 0.2 0.1]
 [inf 0.1 0.7 0.1 0.  0.1]]
[[     inf 0.12     0.2      0.033333 0.546667 0.1     ]
 [     inf 0.18     0.22     0.2      0.3      0.1     ]
 [     inf 0.166667 0.075    0.1      0.558333 0.1     ]
 [     inf 0.1      0.7      0.1      0.       0.1     ]]





In [5]:
# First pass: measure the original submission against the priors and shift it
prior_table = build_prior_table(approx_freqs_dict=approx_freqs)
sub_table = build_sub_table(sub_=sub, rs_bins_=rs_bins)
shifted_sub = migrate_sub(
    prior_table_=prior_table,
    sub_table_=sub_table,
    sub__=sub,
    rs_bins_=rs_bins,
)

100%|██████████| 3492890/3492890 [02:44<00:00, 21197.73it/s]


In [10]:
# Second pass: re-measure the already shifted submission and shift it once more
prior_table = build_prior_table(approx_freqs_dict=approx_freqs)
sub_table = build_sub_table(sub_=shifted_sub, rs_bins_=rs_bins)
shifted_sub_ii = migrate_sub(
    prior_table_=prior_table,
    sub_table_=sub_table,
    sub__=shifted_sub,
    rs_bins_=rs_bins,
)

  0%|          | 4785/3492890 [00:00<02:26, 23855.35it/s]

1


100%|██████████| 3492890/3492890 [02:09<00:00, 27067.30it/s]


[[0.22235  0.003697]
 [0.014499 0.105821]
 [0.275328 0.00403 ]
 [0.020903 0.093267]
 [0.019687 0.093689]
 [0.02076  0.000003]
 [0.017692 0.101537]
 [0.014418 0.105859]
 [0.277243 0.004046]
 [0.018014 0.09432 ]
 [0.023104 0.092576]
 [0.016467 0.096928]
 [0.021082 0.000006]
 [0.017433 0.101593]
 [0.021019 0.102627]]
[[0.111844 0.00335 ]
 [0.008486 0.105431]
 [0.269345 0.003644]
 [0.012511 0.093268]
 [0.0119   0.091715]
 [0.008176 0.000003]
 [0.010746 0.100327]
 [0.00824  0.093854]
 [0.229031 0.003665]
 [0.010975 0.090661]
 [0.016403 0.055125]
 [0.010238 0.179446]
 [0.258448 0.000436]
 [0.012636 0.076448]
 [0.021019 0.102627]]
[[0.066017 0.002772]
 [0.00561  0.104154]
 [0.268952 0.002974]
 [0.005592 0.094986]
 [0.004442 0.08322 ]
 [0.004484 0.000015]
 [0.005139 0.096502]
 [0.003666 0.066694]
 [0.228962 0.002972]
 [0.004542 0.079497]
 [0.007136 0.047657]
 [0.006115 0.261909]
 [0.36265  0.000615]
 [0.005671 0.053405]
 [0.021019 0.102627]]
[[0.066017 0.002772]
 [0.00561  0.104154]
 [0.268952

In [7]:
# Save shifted sub

# Get submission header (only the column names of the sample submission are used)
col_names = list(pd.read_csv(filepath_or_buffer='../data/sample_submission.csv', nrows=1).columns)
num_classes = len(col_names) - 1

# Comma-separated header line; str.join replaces the manual append-then-trim loop
h = ','.join(col_names)

np.savetxt(
    fname='../subs/sub_v3_isband_0.7691_testdrive_1.csv',
    X=shifted_sub,
    fmt=['%d'] + ['%.4f'] * num_classes,  # integer first column, 4-decimal values after it
    delimiter=',',
    header=h,
    comments='',  # write the header verbatim, without the default '# ' prefix
)

In [13]:
# Mean class probabilities per bin group before shifting, after one pass and after two passes
first_table = build_sub_table(sub, rs_bins)
new_table = build_sub_table(shifted_sub, rs_bins)
newest_table = build_sub_table(shifted_sub_ii, rs_bins)
# Printed from the target priors down to the unshifted submission
print(prior_table)
print(newest_table)
print(new_table)
print(first_table)

# Save twice-shifted sub

# Get submission header (only the column names of the sample submission are used)
col_names = list(pd.read_csv(filepath_or_buffer='../data/sample_submission.csv', nrows=1).columns)
num_classes = len(col_names) - 1

# Comma-separated header line; str.join replaces the manual append-then-trim loop
h = ','.join(col_names)

np.savetxt(
    fname='exp_mnm_sub_1.0656.csv',
    X=shifted_sub_ii,
    fmt=['%d'] + ['%.4f'] * num_classes,  # integer first column, 4-decimal values after it
    delimiter=',',
    header=h,
    comments='',  # write the header verbatim, without the default '# ' prefix
)

[[0.22235  0.003697]
 [0.014499 0.105821]
 [0.275328 0.00403 ]
 [0.020903 0.093267]
 [0.019687 0.093689]
 [0.02076  0.000003]
 [0.017692 0.101537]
 [0.014418 0.105859]
 [0.277243 0.004046]
 [0.018014 0.09432 ]
 [0.023104 0.092576]
 [0.016467 0.096928]
 [0.021082 0.000006]
 [0.017433 0.101593]
 [0.021019 0.102627]]
[[0.111844 0.00335 ]
 [0.008486 0.105431]
 [0.269345 0.003644]
 [0.012511 0.093268]
 [0.0119   0.091715]
 [0.008176 0.000003]
 [0.010746 0.100327]
 [0.00824  0.093854]
 [0.229031 0.003665]
 [0.010975 0.090661]
 [0.016403 0.055125]
 [0.010238 0.179446]
 [0.258448 0.000436]
 [0.012636 0.076448]
 [0.021019 0.102627]]
[[0.066017 0.002772]
 [0.00561  0.104154]
 [0.268952 0.002974]
 [0.005592 0.094986]
 [0.004442 0.08322 ]
 [0.004484 0.000015]
 [0.005139 0.096502]
 [0.003666 0.066694]
 [0.228962 0.002972]
 [0.004542 0.079497]
 [0.007136 0.047657]
 [0.006115 0.261909]
 [0.36265  0.000615]
 [0.005671 0.053405]
 [0.021019 0.102627]]
[[0.021896 0.000935]
 [0.00268  0.088455]
 [0.26784 