In [1]:
import os
import numpy as np 
import pandas as pd 
from subprocess import check_output

In [2]:
# Directory holding one CSV of predictions per model.
sub_path = "Submissions"
# os.listdir() returns entries in arbitrary order and lists every entry in the
# directory, so keep only the CSV files and sort them: the is_iceberg_N column
# numbering below is then reproducible from run to run, and stray entries
# (checkpoint folders, notes, ...) are never handed to read_csv.
all_files = sorted(f for f in os.listdir(sub_path) if f.lower().endswith(".csv"))

# Read every submission (its first CSV column becomes the index) and place
# them side by side; pd.concat(axis=1) aligns the rows on that index.
outs = [pd.read_csv(os.path.join(sub_path, f), index_col=0) for f in all_files]
concat_sub = pd.concat(outs, axis=1)
# Give each model's prediction column a unique, positional name.
cols = ["is_iceberg_{}".format(i) for i in range(len(concat_sub.columns))]
concat_sub.columns = cols
# Turn the index back into a regular column; reassign instead of mutating in
# place so the cell reads as a plain sequence of transformations.
concat_sub = concat_sub.reset_index()
concat_sub.head()

Unnamed: 0,id,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3,is_iceberg_4,is_iceberg_5
0,5941774d,0.02138313,0.010011,0.003657,0.016455,0.22013,0.023473
1,4023181e,0.6183937,0.404991,0.020802,0.986631,0.014379,0.375685
2,b20200e4,9.098931e-07,0.007189,0.001704,0.001559,4.1e-05,0.006849
3,e7f018bb,0.9738217,0.997395,0.999148,0.999509,0.992729,0.998656
4,4371c8c3,0.4257437,0.067517,0.001759,0.997644,0.074289,0.035506


In [3]:
# Check how strongly the models' predictions correlate with each other.
# Only numeric columns are passed to corr(): the frame also carries the string
# `id` column, and DataFrame.corr raises on non-numeric data in pandas >= 2.0
# (older versions silently ignored such columns).
concat_sub.select_dtypes(include=[np.number]).corr()

Unnamed: 0,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3,is_iceberg_4,is_iceberg_5
is_iceberg_0,1.0,0.703407,0.630337,0.474636,0.601516,0.70471
is_iceberg_1,0.703407,1.0,0.834851,0.532091,0.843965,0.842419
is_iceberg_2,0.630337,0.834851,1.0,0.435538,0.842089,0.811806
is_iceberg_3,0.474636,0.532091,0.435538,1.0,0.409355,0.642066
is_iceberg_4,0.601516,0.843965,0.842089,0.409355,1.0,0.786657
is_iceberg_5,0.70471,0.842419,0.811806,0.642066,0.786657,1.0


In [4]:
# Derive the per-row summary columns used for stacking.
# Columns 1..6 are the individual model predictions (column 0 is `id`); the
# slice is taken once, before any summary column is appended.
model_preds = concat_sub.iloc[:, 1:7]
for stat in ('max', 'min', 'mean', 'median'):
    # e.g. stat == 'max' -> concat_sub['is_iceberg_max'] = model_preds.max(axis=1)
    concat_sub['is_iceberg_' + stat] = getattr(model_preds, stat)(axis=1)

In [5]:
# Consensus cutoffs, kept under two names so they are easy to tweak:
#   - a row where every model predicts above cutoff_lo takes the row maximum,
#   - a row where every model predicts below cutoff_hi takes the row minimum.
cutoff_lo, cutoff_hi = 0.5, 0.5

In [6]:
# Load the submission file of the model with the best individual ("base")
# performance. Note this reads that model's predictions, not the model itself;
# the file is read without index_col, so `id` stays a regular column here.
sub_base = pd.read_csv('Submissions/subvgg16mobile.csv')

In [7]:
# Attach the base model's prediction to each row by matching on `id`.
# Assigning sub_base['is_iceberg'] directly would pair rows purely by position,
# which silently misaligns predictions if the base CSV's row order ever differs
# from concat_sub's.
base_by_id = sub_base.set_index('id')['is_iceberg']
concat_sub['is_iceberg_base'] = concat_sub['id'].map(base_by_id)

# Columns 1..6 are the individual model predictions (column 0 is `id`).
model_preds = concat_sub.iloc[:, 1:7]
all_above = np.all(model_preds > cutoff_lo, axis=1)
all_below = np.all(model_preds < cutoff_hi, axis=1)

# Stacking rule, first match wins:
#   every model above cutoff_lo -> row maximum
#   every model below cutoff_hi -> row minimum
#   otherwise                   -> the base model's prediction
concat_sub['is_iceberg'] = np.where(all_above,
                                    concat_sub['is_iceberg_max'],
                                    np.where(all_below,
                                             concat_sub['is_iceberg_min'],
                                             concat_sub['is_iceberg_base']))
concat_sub[['id', 'is_iceberg']].to_csv('stack_minmax_bestbase.csv',
                                        index=False, float_format='%.6f')

In [46]:
# Min/max stack with the row median as fallback.
# Columns 1..6 are the individual model predictions (column 0 is `id`).
model_preds = concat_sub.iloc[:, 1:7]
every_model_high = (model_preds > cutoff_lo).all(axis=1)
every_model_low = (model_preds < cutoff_hi).all(axis=1)

# Resolve the inner choice first (minimum vs. median), then let unanimous
# "high" rows take the maximum; unanimous-high therefore has priority.
min_or_median = np.where(every_model_low,
                         concat_sub['is_iceberg_min'],
                         concat_sub['is_iceberg_median'])
concat_sub['is_iceberg'] = np.where(every_model_high,
                                    concat_sub['is_iceberg_max'],
                                    min_or_median)

submission = concat_sub[['id', 'is_iceberg']]
submission.to_csv('stack_minmax_median.csv', index=False, float_format='%.6f')

In [51]:
# "Alt" stack: like the min/max/median stack, but rows where all models except
# one agree are pulled halfway from the median towards the max (or min).
#
# Vectorised replacement for the former iterrows() loop, which relied on
# DataFrame.set_value (removed in pandas 1.0).

# Columns 1..6 are the individual model predictions (column 0 is `id`).
model_preds = concat_sub.iloc[:, 1:7]
n_models = model_preds.shape[1]

# Per-row vote counts. A prediction is counted as "low" only if it was not
# already counted as "high", mirroring the original if/elif.
is_high = model_preds > cutoff_lo
is_low = ~is_high & (model_preds < cutoff_hi)
high_count = is_high.sum(axis=1)
low_count = is_low.sum(axis=1)

median = concat_sub['is_iceberg_median']
high_val = (concat_sub['is_iceberg_max'] + median) / 2
low_val = (concat_sub['is_iceberg_min'] + median) / 2

# Default is the median. The rules are applied from lowest to highest priority
# so that a later mask() overrides an earlier one, giving the same precedence
# as the original if/elif chain:
#   all high > all low > all-but-one high > all-but-one low > median
stacked = median.copy()
stacked = stacked.mask(low_count == n_models - 1, low_val)
stacked = stacked.mask(high_count == n_models - 1, high_val)
stacked = stacked.mask(low_count == n_models, concat_sub['is_iceberg_min'])
stacked = stacked.mask(high_count == n_models, concat_sub['is_iceberg_max'])
concat_sub['is_iceberg'] = stacked

concat_sub[['id', 'is_iceberg']].to_csv('stack_minmax_median_alt.csv',
                                        index=False, float_format='%.6f')

In [50]:
# Show the rows where the "alt" stack differs from the plain min/max/median
# stack. At this point `is_iceberg` holds the "alt" result written by the
# preceding cell, and no cell creates an `is_iceberg_alt` column, so the
# comparison frame is built here instead of relying on leftover kernel state.
model_preds = concat_sub.iloc[:, 1:7]
minmax_median = np.where((model_preds > cutoff_lo).all(axis=1),
                         concat_sub['is_iceberg_max'],
                         np.where((model_preds < cutoff_hi).all(axis=1),
                                  concat_sub['is_iceberg_min'],
                                  concat_sub['is_iceberg_median']))

# Work on a copy so concat_sub itself is left untouched by this inspection.
stack_comparison = concat_sub.assign(is_iceberg_alt=concat_sub['is_iceberg'])
stack_comparison['is_iceberg'] = minmax_median
stack_comparison[stack_comparison["is_iceberg"] != stack_comparison["is_iceberg_alt"]]


Unnamed: 0,id,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3,is_iceberg_4,is_iceberg_5,is_iceberg_max,is_iceberg_min,is_iceberg_mean,is_iceberg_median,is_iceberg_base,is_iceberg,is_iceberg_alt
4,4371c8c3,4.257437e-01,0.067517,0.001759,0.997644,0.074289,0.035506,0.997644,1.758835e-03,0.267076,0.070903,0.035506,0.070903,0.036331
11,e04e9775,4.060032e-01,0.124353,0.024866,0.995020,0.001158,0.059938,0.995020,1.157542e-03,0.268556,0.092145,0.059938,0.092145,0.046651
14,139e5324,7.169531e-09,0.103032,0.008062,0.932661,0.010557,0.087383,0.932661,7.169531e-09,0.190283,0.048970,0.087383,0.048970,0.024485
17,d9aa7a56,9.999011e-01,0.175558,0.171682,0.392820,0.002305,0.250945,0.999901,2.305158e-03,0.332202,0.213252,0.250945,0.213252,0.107778
18,9005b143,1.605655e-03,0.284177,0.032099,0.647836,0.035335,0.244129,0.647836,1.605655e-03,0.207530,0.139732,0.244129,0.139732,0.070669
20,9ad70954,3.063548e-01,0.046012,0.001965,0.997684,0.040554,0.048084,0.997684,1.964735e-03,0.240109,0.047048,0.048084,0.047048,0.024506
21,b9087b9e,4.561952e-01,0.348431,0.391608,0.140065,0.359620,0.511278,0.511278,1.400653e-01,0.367866,0.375614,0.511278,0.375614,0.257840
28,be8fa29c,4.710481e-01,0.016575,0.053783,0.970119,0.116634,0.079393,0.970119,1.657543e-02,0.284592,0.098013,0.079393,0.098013,0.057294
31,4deeeb8b,7.553215e-01,0.305090,0.691408,0.931118,0.505373,0.644251,0.931118,3.050901e-01,0.638760,0.667830,0.644251,0.667830,0.799474
33,472dadb5,1.462375e-08,0.068933,0.018504,0.897149,0.006749,0.105518,0.897149,1.462375e-08,0.182809,0.043719,0.105518,0.043719,0.021859


In [65]:
# Scratch copy of the min/max/median stack, kept as a plain array in `blah2`.
# Columns 1..6 are the individual model predictions (column 0 is `id`).
model_preds = concat_sub.iloc[:, 1:7]
every_model_high = (model_preds > cutoff_lo).all(axis=1)
every_model_low = (model_preds < cutoff_hi).all(axis=1)
min_or_median = np.where(every_model_low,
                         concat_sub['is_iceberg_min'],
                         concat_sub['is_iceberg_median'])
blah2 = np.where(every_model_high, concat_sub['is_iceberg_max'], min_or_median)


In [94]:
# Count how many rows reach full consensus at various thresholds.
# Columns 1..6 are the individual model predictions (column 0 is `id`).
model_preds = concat_sub.iloc[:, 1:7]

# Rows where every model is above the threshold.
for threshold in (0.8, 0.7, 0.6):
    blah2 = (model_preds > threshold).all(axis=1)
    print(blah2[blah2].shape)
blah_iceberg = (model_preds > 0.5).all(axis=1)
print(blah_iceberg[blah_iceberg].shape)

# Rows where every model is below the threshold.
for threshold in (0.2, 0.3, 0.4):
    blah2 = (model_preds < threshold).all(axis=1)
    print(blah2[blah2].shape)
blah_ship = (model_preds < 0.5).all(axis=1)
print(blah_ship[blah_ship].shape)

# Split of the data set at the 0.5 consensus threshold: rows decided as
# iceberg, rows decided as ship, and rows without unanimous agreement.
len_all = len(concat_sub)
len_iceberg_processed = int(blah_iceberg.sum())
len_ship_processed = int(blah_ship.sum())
len_unprocessed = len_all - len_iceberg_processed - len_ship_processed
print(len_all, len_iceberg_processed, len_ship_processed, len_unprocessed, sep="\n")


# Things to try:
# - Straight (0.6, 0.4) and (0.5, 0.5) consensus thresholds
# - Calculate the difference between the median value and the min / max value
# - If thresholds / consensus is "good enough", use diff b/w median and min / max
# - Maybe see if there's consensus between the top 2 / 3 models... if so, use diff b/w median and min / max

(937,)
(1098,)
(1220,)
(1326,)
(1828,)
(2068,)
(2278,)
(2445,)
8424
1326
2445
4653


In [67]:
# Find values that are on the fence...

def test(val):
    if val < 0.5:
        return -1
    elif val > 0.5:
        return -1
    
    return val 

# Score every row's stacked prediction with test(); rows that are not on the
# fence come back as -1 and are dropped by the >= 0 filter.
fence_scores = concat_sub.apply(lambda row: test(row["is_iceberg"]), axis=1)
blah = fence_scores[fence_scores >= 0]

In [68]:
# Print how many entries remain in `blah` after the >= 0 filter (its shape),
# then display `blah` itself as the cell output.
print(blah.shape)
blah

(0,)


Series([], dtype: int64)

In [69]:
# Preview the stacked frame. Show only the first rows rather than dumping the
# whole DataFrame into the notebook output.
concat_sub.head(10)

Unnamed: 0,id,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3,is_iceberg_4,is_iceberg_5,is_iceberg_max,is_iceberg_min,is_iceberg_mean,is_iceberg_median,is_iceberg_base,is_iceberg
0,5941774d,2.138313e-02,1.001100e-02,0.003657,1.645529e-02,0.220130,2.347304e-02,0.220130,3.657331e-03,0.049185,0.018919,2.347304e-02,3.657331e-03
1,4023181e,6.183937e-01,4.049912e-01,0.020802,9.866307e-01,0.014379,3.756849e-01,0.986631,1.437879e-02,0.403480,0.390338,3.756849e-01,3.903381e-01
2,b20200e4,9.098931e-07,7.188838e-03,0.001704,1.559171e-03,0.000041,6.848715e-03,0.007189,9.098931e-07,0.002890,0.001632,6.848715e-03,9.098931e-07
3,e7f018bb,9.738217e-01,9.973954e-01,0.999148,9.995090e-01,0.992729,9.986560e-01,0.999509,9.738217e-01,0.993543,0.998026,9.986560e-01,9.995090e-01
4,4371c8c3,4.257437e-01,6.751659e-02,0.001759,9.976440e-01,0.074289,3.550633e-02,0.997644,1.758835e-03,0.267076,0.070903,3.550633e-02,7.090292e-02
5,a8d9b1fd,4.091924e-05,1.395310e-01,0.036811,8.380730e-01,0.082405,5.532939e-01,0.838073,4.091924e-05,0.275026,0.110968,5.532939e-01,1.109680e-01
6,29e7727e,1.594431e-01,1.177437e-03,0.061324,1.044588e-01,0.034815,3.812961e-02,0.159443,1.177437e-03,0.066558,0.049727,3.812961e-02,1.177437e-03
7,92a51ffb,9.893829e-01,9.997948e-01,0.999978,9.997451e-01,0.994780,9.985010e-01,0.999978,9.893829e-01,0.997030,0.999123,9.985010e-01,9.999777e-01
8,c769ac97,4.145949e-13,2.410319e-04,0.001725,9.538788e-07,0.087789,4.076076e-04,0.087789,4.145949e-13,0.015027,0.000324,4.076076e-04,4.145949e-13
9,aee0547d,1.269119e-16,1.919658e-04,0.004978,1.497171e-05,0.074415,2.713778e-04,0.074415,1.269119e-16,0.013312,0.000232,2.713778e-04,1.269119e-16
