In [1]:
import pandas
import numpy as np

# Let's have a look at what my predictions actually look like

In [2]:
# Load the submission file that holds the model's predictions.
# Per the preview shown below, it has an Image column (file name) and an
# Id column (space-separated guesses for that image).
df = pandas.read_csv('./my_submission_gpu.csv')

In [3]:
# Preview the first 10 predictions
df.head(10)

Unnamed: 0,Image,Id
0,a779821d.jpg,new_whale w_2725793 w_303518a w_f3d98bc w_c663...
1,c064ec31.jpg,new_whale w_b6ed5d2 w_1a70685 w_3411b9f w_f864...
2,6a5e374d.jpg,w_1eafe46 new_whale w_95b6cc3 w_045d9fc w_e24a...
3,5df80c24.jpg,new_whale w_86b3f04 w_3a9ee71 w_83714b7 w_74ad...
4,68596681.jpg,new_whale w_8044362 w_b96d4e0 w_93bf889 w_97f5...
5,1ea8b7db.jpg,new_whale w_c27d036 w_12f2352 w_67de30b w_d89b...
6,554833b7.jpg,new_whale w_cf0c062 w_ccbc782 w_136337c w_676d...
7,1c05fdeb.jpg,new_whale w_2b939eb w_b6886e5 w_7427ace w_8d83...
8,ed7ddb31.jpg,new_whale w_434ad6a w_53064a6 w_1029f4e w_e02f...
9,b93181d3.jpg,new_whale w_2f283f3 w_6af9dd7 w_309a2b3 w_d663...


## How many times does new_whale appear?

In [4]:
# Break every Id string into its individual guesses and keep the first five,
# one column per guess.
# NOTE: per the original author's remark the Id strings end with a trailing
# space, so split(' ') yields an empty extra token at the end; keeping only
# the first five columns is what discards it.
guess_columns = ['First', 'Second', 'Third', 'Fourth', 'Fifth']
data = np.array([ids.split(' ') for ids in df.Id])[:, :5]
df_new = pandas.DataFrame(data, columns=guess_columns)

In [5]:
# For every guess position, compute the fraction of rows whose guess at that
# position is 'new_whale'. The fractions are collected in `n`, in column order.
n = []
total = float(len(df_new))  # explicit float keeps true division on Python 2 as well
for column in df_new.columns:
    share = float((df_new[column] == 'new_whale').sum()) / total
    n.append(share)
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statement is a SyntaxError on Python 3.
    print('{:.4f} of the {} predictions are new_whale'.format(share, column))


0.8966 of the First predictions are new_whale
0.0888 of the Second predictions are new_whale
0.0107 of the Third predictions are new_whale
0.0024 of the Fourth predictions are new_whale
0.0011 of the Fifth predictions are new_whale


This means that my model tends to pick new_whale as the top prediction: it is the first guess for about 90% of the images. The reason is probably that new_whale is by far the most common class. However, there are only 810 new_whale images in the training set out of a total of about 10000 training images, so only about every 12th training image (roughly 8%) is a new_whale. My model therefore heavily overestimates how often new_whale occurs.

In [19]:
# Rotate the guesses one position to the left: the former first guess becomes
# the last one and every other guess moves up by one place.
rotated_order = ['Second', 'Third', 'Fourth', 'Fifth', 'First']
# Glue the five guesses of each row back into one space-separated string;
# the result is a Series with one string per image.
df_reordered = df_new[rotated_order].apply(' '.join, axis=1)
df_reordered.head(10)

0    w_2725793 w_303518a w_f3d98bc w_c663985 new_whale
1    w_b6ed5d2 w_1a70685 w_3411b9f w_f86488a new_whale
2    new_whale w_95b6cc3 w_045d9fc w_e24a84d w_1eafe46
3    w_86b3f04 w_3a9ee71 w_83714b7 w_74adf0b new_whale
4    w_8044362 w_b96d4e0 w_93bf889 w_97f5054 new_whale
5    w_c27d036 w_12f2352 w_67de30b w_d89b29e new_whale
6    w_cf0c062 w_ccbc782 w_136337c w_676ddb0 new_whale
7    w_2b939eb w_b6886e5 w_7427ace w_8d83172 new_whale
8    w_434ad6a w_53064a6 w_1029f4e w_e02fe7b new_whale
9    w_2f283f3 w_6af9dd7 w_309a2b3 w_d663f4f new_whale
dtype: object

In [20]:
# Put the image names next to the rejoined guesses. The keys become the
# column names of the resulting two-column frame.
df_reordered = pandas.concat([df.Image, df_reordered], axis=1, keys=['Image', 'Id'])

In [24]:
# Preview the first 10 rows of the reordered submission
df_reordered.head(10)

Unnamed: 0,Image,Id
0,a779821d.jpg,w_2725793 w_303518a w_f3d98bc w_c663985 new_whale
1,c064ec31.jpg,w_b6ed5d2 w_1a70685 w_3411b9f w_f86488a new_whale
2,6a5e374d.jpg,new_whale w_95b6cc3 w_045d9fc w_e24a84d w_1eafe46
3,5df80c24.jpg,w_86b3f04 w_3a9ee71 w_83714b7 w_74adf0b new_whale
4,68596681.jpg,w_8044362 w_b96d4e0 w_93bf889 w_97f5054 new_whale
5,1ea8b7db.jpg,w_c27d036 w_12f2352 w_67de30b w_d89b29e new_whale
6,554833b7.jpg,w_cf0c062 w_ccbc782 w_136337c w_676ddb0 new_whale
7,1c05fdeb.jpg,w_2b939eb w_b6886e5 w_7427ace w_8d83172 new_whale
8,ed7ddb31.jpg,w_434ad6a w_53064a6 w_1029f4e w_e02fe7b new_whale
9,b93181d3.jpg,w_2f283f3 w_6af9dd7 w_309a2b3 w_d663f4f new_whale


In [22]:
# Write the reordered predictions to a new CSV file; index=False keeps the
# row index out of the file so only the frame's columns are written.
df_reordered.to_csv('./my_submission_reordered.csv',index=False)