In [93]:
import h5py
import numpy as np
import pandas as pd 
import bcolz 
from tqdm import tqdm_notebook

from sklearn.metrics import fbeta_score


In [94]:

def save_array(fname, arr):
    c=bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()


def load_array(fname):
    return bcolz.open(fname)[:]

In [187]:
y_train = []

df_train = pd.read_csv('../data/train_v2.csv')
df_test = pd.read_csv('../data/sample_submission_v2.csv')



# labels = ['blow_down',
#  'bare_ground',
#  'conventional_mine',
#  'blooming',
#  'cultivation',
#  'artisinal_mine',
#  'haze',
#  'primary',
#  'slash_burn',
#  'habitation',
#  'clear',
#  'road',
#  'selective_logging',
#  'partly_cloudy',
#  'agriculture',
#  'water',
#  'cloudy']

label_map_qin = {'slash_burn': 15, 'selective_logging': 14, 'cultivation': 8, 'clear': 5, 'habitation': 9, 
                 'conventional_mine': 7, 'cloudy': 6, 'primary': 12, 'water': 16, 'haze': 10, 'artisinal_mine': 1,
                 'partly_cloudy': 11, 'blooming': 3, 'bare_ground': 2, 'blow_down': 4, 'agriculture': 0, 'road': 13}

In [188]:
labels_qin = {v: k for k, v in label_map_qin.iteritems()}
labels_qin = labels_qin.values()

In [189]:
for f, tags in df_train.values:
    targets = np.zeros(17)
    
    for t in tags.split(' '):
        targets[label_map_qin[t]] = 1 
        

    y_train.append(targets)



In [190]:
y_train = np.array(y_train).astype(np.uint8)

In [99]:
def optimise_f2_thresholds2(y, p, verbose=True, resolution=100,num_classes=17):
    def mf(x):
        p2 = np.zeros_like(p)
        for i in range(num_classes):
            p2[:, i] = (p[:, i] > x[i]).astype(np.int)
        score = fbeta_score(y, p2, beta=2, average='samples')
        return score

    x = [0.1]*num_classes
    for i in range(num_classes):
        best_i2 = 0
        best_score = 0
        for i2 in range(resolution):
            threshold = float(i2) / resolution
            x[i] = threshold
            score = mf(x)
            if score > best_score:
                best_i2 = threshold
                best_score = score

        x[i] = best_i2
        if verbose:
            print(i, best_i2, best_score)

    return x




def f2_score(y_true, y_pred):
    # fbeta_score throws a confusing error if inputs are not numpy arrays
    y_true, y_pred, = np.array(y_true), np.array(y_pred)
    # We need to use average='samples' here, any other average method will generate bogus results
    return fbeta_score(y_true, y_pred, beta=2, average='samples')


In [125]:
# f = h5py.File("../data/qin_best_197_small.hdf", 'r')
# f.close()
# print("Keys: %s" % f.keys())


In [131]:
# filename = '../data/level9.one/eem-0.087937-134-2201-0-EFG/preds.hdf'
# filename = '../data/qin_best_197_small.hdf'

# f = h5py.File(filename, 'r')
# f.close()



In [132]:
filename = '../data/level9/raw.hdf'


val = pd.read_hdf(filename,key="ltest")
train = pd.read_hdf(filename,key="train")
test = pd.read_hdf(filename,key="test")

In [133]:
len(train.columns)

173

In [150]:
test_reindexed = test.set_index("ID")
test_reindexed = test_reindexed.reindex(df_test.image_name.values)

In [141]:
# val.iloc[:,-68:-51]

##### Xception

In [143]:
xce_val = val.iloc[:,-17:]
xce_val.index.values

array([ 5603, 33842, 20450, ...,  8004,  5722,  9884], dtype=int64)

In [18]:
# xce_val.columns

In [163]:
xce_test = test_reindexed.iloc[:,-17:]

In [20]:
xce_thres = optimise_f2_thresholds2(y_train[xce_val.index.values], xce_val.values,num_classes=17)


(0, 0.19, 0.91518970148397993)
(1, 0.31, 0.91521276411677599)
(2, 0.16, 0.91543667851716604)
(3, 0.35, 0.91583906172189156)
(4, 0.29, 0.91586244934454752)
(5, 0.18, 0.91633398416197043)


  'precision', 'predicted', average, warn_for)


(6, 0.02, 0.91794949447164298)
(7, 0.35, 0.91801249868858681)
(8, 0.25, 0.91878400429779261)
(9, 0.16, 0.91906297266641934)
(10, 0.23, 0.92008191082610147)
(11, 0.22, 0.92043449174919534)
(12, 0.42, 0.92147487123640437)
(13, 0.2, 0.92208318393244082)
(14, 0.1, 0.92208318393244082)
(15, 0.17, 0.92217719012046917)
(16, 0.2, 0.92326655380800082)


In [21]:
print('F2 Score:', f2_score(y_train[xce_val.index.values],  xce_val.values>xce_thres)) #combined_val_preds

('F2 Score:', 0.92326655380800082)


In [164]:
xce_final_preds = (xce_test.values > xce_thres).astype(np.uint8)

In [165]:
xce_final_preds.sum()

200312

##### VGG

In [24]:
vgg_val = val.iloc[:,-51:-34]


In [166]:
vgg_test = test_reindexed.iloc[:,-51:-34]

In [26]:
vgg_thres = optimise_f2_thresholds2(y_train[vgg_val.index.values], vgg_val.values,num_classes=17)


(0, 0.28, 0.91977851281452394)
(1, 0.09, 0.91983282168304958)
(2, 0.08, 0.91988035858152661)
(3, 0.6, 0.92000980932902698)
(4, 0.45, 0.92015170901825272)
(5, 0.14, 0.92034023256965636)
(6, 0.18, 0.9205037573793553)
(7, 0.45, 0.92058012084715546)
(8, 0.21, 0.9228929308127356)
(9, 0.23, 0.92439026733005425)
(10, 0.1, 0.92439026733005425)
(11, 0.22, 0.92482326865655251)
(12, 0.21, 0.92507610805063445)
(13, 0.16, 0.92633054979273699)
(14, 0.56, 0.92648596083741164)
(15, 0.49, 0.9266468785159796)
(16, 0.22, 0.92944812057889925)


In [27]:
print('F2 Score:', f2_score(y_train[vgg_val.index.values], vgg_val.values>vgg_thres)) #combined_val_preds

('F2 Score:', 0.92944812057889925)


In [167]:
vgg_final_preds = (vgg_test.values > vgg_thres).astype(np.uint8)

In [168]:
vgg_final_preds.sum()

202003

##### Resnet

In [30]:
# val.iloc[:,-85:-68].columns
res_val = val.iloc[:,-85:-68]

In [180]:
res_test = test_reindexed.iloc[:,-85:-68]

In [32]:
res_thres = optimise_f2_thresholds2(y_train[res_val.index.values],res_val.values,num_classes=17)


(0, 0.17, 0.92415894456522607)
(1, 0.34, 0.92418385567144556)
(2, 0.14, 0.92438120093270604)
(3, 0.41, 0.92495504019025665)
(4, 0.36, 0.92513381940680151)
(5, 0.09, 0.92525563742552985)
(6, 0.08, 0.92536324200355424)
(7, 0.39, 0.92544188498540247)
(8, 0.24, 0.9265741786985694)
(9, 0.14, 0.92667202435043006)
(10, 0.35, 0.92974958875983993)
(11, 0.17, 0.92990243645822612)
(12, 0.17, 0.93088477088376498)
(13, 0.21, 0.93152926880326747)
(14, 0.11, 0.93155607198759416)
(15, 0.14, 0.93162234880148553)
(16, 0.18, 0.93282146530750876)


In [33]:
print('F2 Score:', f2_score(y_train[res_val.index.values], res_val.values>res_thres)) #combined_val_preds

('F2 Score:', 0.93282146530750876)


In [170]:
res_final_preds = (res_test.values > res_thres).astype(np.uint8)

In [35]:
res_final_preds.sum()

201786

#### DenseNet

In [195]:
filename = '../data/qin_best_197.hdf'


val = pd.read_hdf(filename,key="ltest")
train = pd.read_hdf(filename,key="train")
test = pd.read_hdf(filename,key="test")

In [196]:
test = test.set_index("ID")
test = test.reindex(df_test.image_name.values)

In [197]:
dense_val = val.iloc[:,-17:]
dense_test = test.iloc[:,-17:]

In [192]:
dense_thres = optimise_f2_thresholds2(y_train[dense_val.index.values],dense_val.values,num_classes=17)


(0, 0.22, 0.9302514461567517)
(1, 0.13, 0.93026440753990436)
(2, 0.11, 0.93028355589934375)
(3, 0.2, 0.93064710211964219)
(4, 0.13, 0.93068305185225642)
(5, 0.17, 0.93093301406481344)
(6, 0.16, 0.93110486154448946)
(7, 0.15, 0.93111316829621726)
(8, 0.18, 0.93259060862638143)
(9, 0.21, 0.93361214357830502)
(10, 0.2, 0.93434046316182195)
(11, 0.21, 0.93471099842798189)
(12, 0.23, 0.93581739174032152)
(13, 0.28, 0.9367863760766123)
(14, 0.16, 0.93689018174377237)
(15, 0.07, 0.93692738981875689)
(16, 0.19, 0.93814425065446505)


In [198]:
print('F2 Score:', f2_score(y_train[dense_val.index.values], dense_val.values>dense_thres)) #combined_val_preds

('F2 Score:', 0.93814425065446505)


In [199]:
dense_final_preds = (dense_test.values > dense_thres).astype(np.uint8)

In [200]:
dense_final_preds.sum()

198132

### Armin's

In [39]:

armin_labels = ['blow_down',
 'bare_ground',
 'conventional_mine',
 'blooming',
 'cultivation',
 'artisinal_mine',
 'haze',
 'primary',
 'slash_burn',
 'habitation',
 'clear',
 'road',
 'selective_logging',
 'partly_cloudy',
 'agriculture',
 'water',
 'cloudy']

In [43]:
new_order = {'slash_burn': 15, 'selective_logging': 14, 
             'cultivation': 8, 'clear': 5, 'habitation': 9,
             'conventional_mine': 7, 'cloudy': 6, 'primary': 12, 
             'water': 16, 'haze': 10, 'artisinal_mine': 1, 
             'partly_cloudy': 11, 'blooming': 3, 'bare_ground': 2, 
             'blow_down': 4, 'agriculture': 0, 'road': 13}

inv_map = {v: k for k, v in new_order.iteritems()}
inv_map = inv_map.values()

In [45]:
armin_resnet = load_array("../data/oli_res_blend_01_qin_ordering.dat/")
armin_resnet.sum()

199574

In [65]:
armin_etr = load_array("../data/etr_tifonly_test_preds_01.dat/")
armin_etr = pd.DataFrame(armin_etr, columns = armin_labels)

armin_etr = armin_etr[inv_map].values

In [72]:
armin_xgb = load_array("../data/xgb_tifonly_test_preds_01.dat/")
armin_xgb = pd.DataFrame(armin_xgb, columns = armin_labels)

armin_xgb = armin_xgb[inv_map].values

In [73]:
armin_inception = pd.read_csv("../data/inception_full_150x150_10tta_augv2_01_qin_ordering.csv").values

In [74]:
armin_xception =  pd.read_csv("../data/xception_full_112x112_fullnetft_01_qin_ordering.csv").values

In [69]:
# armin_etr = load_array("../data/etr_test_preds_01_qin_order.dat/")
# armin_etr.sum()

In [140]:
df_test.shape[0] - (df_test.image_name == test.ID).sum()

20521

### Voting

In [221]:
voted = res_final_preds + vgg_final_preds + xce_final_preds + dense_final_preds \
 + armin_resnet + armin_inception+ armin_xception

In [222]:
result = pd.DataFrame(voted, columns = labels_qin)
result.head()

Unnamed: 0,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,0,0,0,0,0,5,0,2,0,0,2,1,5,0,1,1,0
1,0,0,0,0,0,5,0,2,0,0,2,1,5,0,1,1,0
2,0,0,0,0,0,0,0,2,0,0,1,6,5,1,1,1,0
3,5,0,0,0,0,5,0,2,5,0,2,1,5,1,2,1,0
4,0,0,0,0,0,0,3,2,0,0,1,6,4,1,1,1,1


In [223]:
label_preds = []
for i in tqdm_notebook(range(result.shape[0]), miniters=1000):
    a = result.ix[[i]]
    a = a.apply(lambda x: x >= 4, axis=1)
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    label_preds.append(' '.join(list(a.index)))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  app.launch_new_instance()





In [226]:
df_test["tags"] = label_preds
df_test.head()

Unnamed: 0,image_name,tags
0,test_0,clear primary
1,test_1,clear primary
2,test_2,partly_cloudy primary
3,test_3,agriculture clear cultivation primary
4,test_4,partly_cloudy primary


In [227]:
df_test.tail(100)

Unnamed: 0,image_name,tags
61091,file_9909,cloudy
61092,file_991,clear primary
61093,file_9910,agriculture clear primary road
61094,file_9911,clear primary water
61095,file_9912,partly_cloudy primary
61096,file_9913,agriculture habitation partly_cloudy primary road
61097,file_9914,agriculture clear habitation partly_cloudy pri...
61098,file_9915,clear primary
61099,file_9916,cloudy
61100,file_9917,clear primary


In [225]:
df_test.tail(100)

Unnamed: 0,image_name,tags
61091,file_9909,cloudy
61092,file_991,clear primary
61093,file_9910,agriculture clear primary road
61094,file_9911,clear primary water
61095,file_9912,partly_cloudy primary
61096,file_9913,agriculture habitation partly_cloudy primary road
61097,file_9914,agriculture clear habitation partly_cloudy pri...
61098,file_9915,clear primary
61099,file_9916,cloudy
61100,file_9917,clear primary


In [228]:
submission_file = '../subm/voting_qinx4best_arminx3_min4.csv'
df_test.to_csv(submission_file, index=False)