In [1]:
import numpy as np
from scipy import io
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from joblib import Parallel, delayed
import multiprocessing

%matplotlib inline
import mpld3
#mpld3.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

- On convertit le score en distance avec $d=\frac{1}{score}10^4$.
- On considère alors que chaque fonction de similarité définit une distance selon une dimension par rapport à la sonde.
- On va donc apprendre une métrique dans un espace à 8 dimensions, de façon à rapprocher de la sonde les labels identiques à celui de la sonde, et à éloigner les labels différents.



## Preprocessing

La première étape est de mettre les données dans un format plus pratique: 
- pour chaque sonde, identifier les Ids présents dans les 8 similarités
- ranger les données dans un tableau $$| Ids\ |\ distance\ 1\ |\ 0\ |\ 0\ |\ ...\ |\ 0\ | \\
                                       | Ids\ |\ 0\ |\ 0\ |\ 0\ |\ ...\ |\ distance\ 8\ |$$
- éventuellement, pour chaque sonde, ne garder que les n plus proches labels en distance euclidienne (on a vu dans le notebook « premiers tests » que les 7 premières lignes des scores contiennent majoritairement le bon label)

In [2]:
# Location of the training set (.mat file) read by this notebook.
TRAIN_MAT_PATH = '/home/max/projects/challengeMDI343/data/data_train.mat'

# `train` is indexed by variable name below ('probeId', 'resultsId', ...).
train = io.loadmat(TRAIN_MAT_PATH)

In [15]:
# Lookup table: probe id -> probe label (first column of each array).
probes = {}
for row in range(len(train['probeLabel'])):
    probes[train['probeId'][row][0]] = train['probeLabel'][row][0]

print(probes[7490])

# Lookup table: gallery id -> gallery label.
gallery = {}
for row in range(len(train['galleryLabel'])):
    gallery[train['galleryId'][row][0]] = train['galleryLabel'][row][0]

print(gallery[3])

13721
45407


In [91]:
resultsId = train['resultsId']
resultsScore = train['resultsScore']

probe = 508

# First three columns of every row for this probe: ids and their scores.
top_ids = resultsId[probe, :, :3]
top_scores = resultsScore[probe, :, :3]

# Distinct ids among those results and how many times each one appears.
uniques, counts = np.unique(top_ids, return_counts=True)

print(uniques)
print(counts)
print('')
print(top_ids)
print(top_scores)

# Id that appears most often (first one in case of a tie, per np.argmax).
valids = uniques[np.argmax(counts)]

print(valids)

# Label of that id next to the label of the probe itself.
print('label: %s %s' % (gallery[valids], train['probeLabel'][probe][0]))
print(valids.shape)


[  7749  13905  18702  20298  22186  29083  38254  40170  51727  69207
  75735  78090  83237  96183 102893 117856 131688 132570 141479]
[2 1 1 1 2 1 1 3 1 1 1 2 1 1 1 1 1 1 1]

[[ 40170 132570  69207]
 [  7749  75735  78090]
 [ 78090  40170  22186]
 [  7749  96183  83237]
 [102893  20298  18702]
 [ 51727  13905  29083]
 [131688  22186  40170]
 [141479 117856  38254]]
[[ 2465.19580078  2455.03491211  2447.36914062]
 [ 3184.73852539  3156.0300293   3145.10839844]
 [ 2712.62866211  2693.40576172  2620.91699219]
 [ 3011.54418945  2921.64428711  2832.53515625]
 [ 3519.50512695  3434.94897461  3397.61303711]
 [ 3220.49511719  3154.54638672  3151.65209961]
 [ 3339.83837891  3323.41186523  3256.28051758]
 [ 3063.90380859  3041.28613281  2980.87182617]]
40170
label: 49686 49686
()


In [87]:
probe = 508

# Label carried by the probe; every result is compared against it.
expected_label = train['probeLabel'][probe][0]

# test[i, j] is True when result j of row i has the same label as the probe
# (8 rows, first 7 results of each).
hits = []
for i in range(8):
    hits.append([gallery[resultsId[probe, i, j]] == expected_label
                 for j in range(7)])
test = np.asarray(hits)

print(test)

[[ True False False False False False False]
 [False False False False False False False]
 [False  True False False False False False]
 [False False False False False False False]
 [False False False False False False False]
 [False False False False False False False]
 [False False  True False False False False]
 [False False False False False False False]]


In [95]:
resultsId = train['resultsId']
resultsScore = train['resultsScore']

probe = 0

# Work on the first three results of every row of this probe.
top_ids = resultsId[probe, :, :3]
top_scores = resultsScore[probe, :, :3]

# Row layout: columns 0-7 hold the score (only the column matching the row
# index of the result is non-zero), column 8 the result id, column 9 its label.
zeros = [0] * 10
coords = []
for sim in range(top_ids.shape[0]):
    for rank in range(top_ids.shape[1]):
        row = list(zeros)
        row[8] = top_ids[sim, rank]
        row[9] = gallery[top_ids[sim, rank]]
        row[sim] = top_scores[sim, rank]
        coords.append(row)

coords = np.asarray(coords)

print(coords)
print(coords.shape)

[[   2732.52172852       0.               0.               0.               0.
        0.               0.               0.           41004.           13721.        ]
 [   2544.44213867       0.               0.               0.               0.
        0.               0.               0.           29667.           26694.        ]
 [   2538.05810547       0.               0.               0.               0.
        0.               0.               0.          130431.            7610.        ]
 [      0.            4321.46191406       0.               0.               0.
        0.               0.               0.           78881.            5487.        ]
 [      0.            4164.02246094       0.               0.               0.
        0.               0.               0.           41004.           13721.        ]
 [      0.            3976.41894531       0.               0.               0.
        0.               0.               0.           78720.           37921.        

In [16]:
def to_coordinates(num, data=None, gallery_labels=None):
    """Flatten the ranked results into one row per (probe, similarity, rank).

    Each output row describes one gallery result: its score is stored in the
    ``x<k>`` column of the similarity it comes from (all other ``x`` columns
    are 0), followed by the id and label of the result and the id and label
    of the probe.

    Parameters
    ----------
    num : int
        Number of leading results to keep along the last axis of
        ``resultsId`` / ``resultsScore``.
    data : mapping, optional
        Mapping with the arrays ``'resultsId'`` and ``'resultsScore'``
        (indexed as ``[probe, similarity, rank]``) and ``'probeId'`` /
        ``'probeLabel'`` (one row per probe, value in column 0).
        Defaults to the notebook-level ``train``.
    gallery_labels : mapping, optional
        Maps a result id to its label. Defaults to the notebook-level
        ``gallery``.

    Returns
    -------
    pandas.DataFrame
        Float-valued frame with columns ``x1`` ... ``x<n_similarities>``,
        ``point_id``, ``point_label``, ``probe_id``, ``probe_label``. Rows are
        ordered by probe, then similarity, then rank.

    Raises
    ------
    IndexError
        If ``num`` is negative or larger than the number of ranked results.
    KeyError
        If a result id is missing from ``gallery_labels``.
    """
    if data is None:
        data = train
    if gallery_labels is None:
        gallery_labels = gallery

    results_id = np.asarray(data['resultsId'])
    results_score = np.asarray(data['resultsScore'])
    probe_id = np.asarray(data['probeId'])[:, 0]
    probe_label = np.asarray(data['probeLabel'])[:, 0]

    # The number of similarities is read from the data instead of assuming 8.
    n_probes, n_sims, n_ranked = results_id.shape
    if not 0 <= num <= n_ranked:
        raise IndexError(
            'num=%d is outside the %d ranked results available' % (num, n_ranked))

    rows_per_probe = n_sims * num
    n_rows = n_probes * rows_per_probe

    # C-order flattening yields rows ordered by probe, similarity, rank,
    # which is the order the nested loops used to produce.
    flat_ids = results_id[:, :, :num].reshape(-1)
    flat_scores = results_score[:, :, :num].reshape(-1)

    coords = np.zeros((n_rows, n_sims + 4), dtype=np.float64)

    # Column receiving the score of each row: the index of its similarity.
    sim_of_row = np.tile(np.repeat(np.arange(n_sims), num), n_probes)
    coords[np.arange(n_rows), sim_of_row] = flat_scores

    # id / label of the data point
    coords[:, n_sims] = flat_ids
    coords[:, n_sims + 1] = [gallery_labels[point] for point in flat_ids]

    # id / label of the probe, repeated for each of its rows
    coords[:, n_sims + 2] = np.repeat(probe_id, rows_per_probe)
    coords[:, n_sims + 3] = np.repeat(probe_label, rows_per_probe)

    # name of columns for the dataframe
    cols = ['x' + str(i + 1) for i in range(n_sims)]
    cols += ['point_id', 'point_label', 'probe_id', 'probe_label']

    return pd.DataFrame(data=coords, columns=cols)

# Build the table from the 20 first results of every row of every probe.
df = to_coordinates(num=20)

print(df.shape)

df.head()

(7578080, 12)


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,point_id,point_label,probe_id,probe_label
0,2732.521729,0,0,0,0,0,0,0,41004,13721,7490,13721
1,2544.442139,0,0,0,0,0,0,0,29667,26694,7490,13721
2,2538.058105,0,0,0,0,0,0,0,130431,7610,7490,13721
3,2537.544189,0,0,0,0,0,0,0,116675,65463,7490,13721
4,2533.375488,0,0,0,0,0,0,0,127651,17181,7490,13721


In [111]:
#df.to_csv('formated_train_data_7l.csv')

In [17]:
#df.to_pickle('formated_train_data_20l.pkl')

In [18]:
# Load the table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('formated_train_data_20l.pkl')

print(df.shape)
df.head()

(7578080, 12)


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,point_id,point_label,probe_id,probe_label
0,2732.521729,0,0,0,0,0,0,0,41004,13721,7490,13721
1,2544.442139,0,0,0,0,0,0,0,29667,26694,7490,13721
2,2538.058105,0,0,0,0,0,0,0,130431,7610,7490,13721
3,2537.544189,0,0,0,0,0,0,0,116675,65463,7490,13721
4,2533.375488,0,0,0,0,0,0,0,127651,17181,7490,13721


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of every column.
df.describe()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,point_id,point_label,probe_id,probe_label
count,2652328.0,2652328.0,2652328.0,2652328.0,2652328.0,2652328.0,2652328.0,2652328.0,2652328.0,2652328.0,2652328.0,2652328.0
mean,317.193976,488.757975,417.756337,470.338197,731.06906,745.151575,748.055003,537.90737,79629.018861,34906.027444,79664.977282,30582.892258
std,840.076183,1337.875644,1157.832822,1329.325981,2602.840844,3067.216146,3350.579027,1473.036401,46009.042108,19707.173102,45965.779336,15058.628588
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,8.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39696.0,18063.0,39917.0,18284.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,79610.0,34752.0,79676.0,30166.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,119618.0,50431.0,119289.0,41977.0
max,2976.69458,9943.139648,9670.283203,15269.108398,50000.0,50000.0,50000.0,10201.608398,159297.0,72036.0,159298.0,72022.0


## Tests avec KNN

In [23]:

# One group of rows per probe; iterating yields (probe_id, rows) pairs.
groups = df.groupby('probe_id')

def find_label(name, group, n_neighbors=100):
    """Pick the most frequent label among the top-ranked rows of a group.

    Rows are ranked by the sum of their ``x1`` ... ``x8`` columns, largest
    first, and the ``n_neighbors`` best rows vote with their ``point_label``.
    The input frame is left untouched.

    Parameters
    ----------
    name : number
        Key of the group; converted with ``int`` to build the result key.
    group : pandas.DataFrame
        Rows with columns ``x1`` through ``x8`` and ``point_label``.
    n_neighbors : int, optional
        Number of top-ranked rows taking part in the vote (default 100).

    Returns
    -------
    dict
        Single-entry dict ``{int(name): label}``. When several labels are
        equally frequent, the smallest one is returned.
    """
    # `.loc` replaces the `.ix` indexer, which no longer exists in pandas.
    # NOTE(review): the ranking is descending, i.e. the summed values are
    # treated as similarity scores (higher = closer) - confirm this is intended.
    totals = group.loc[:, 'x1':'x8'].sum(axis=1).values

    # Stable sort on the negated totals: largest first, ties keep row order.
    best = np.argsort(-totals, kind='mergesort')[:n_neighbors]

    knn = group['point_label'].values[best].astype('int')

    uniques, count = np.unique(knn, return_counts=True)
    label = uniques[np.argmax(count)]

    return {int(name): label}


# Parallelization: use all cores but one, and never fewer than one worker
# (joblib rejects n_jobs=0, which `num_cores - 1` produced on one-core hosts).
num_cores = multiprocessing.cpu_count()
n_jobs = max(1, num_cores - 1)

# find_label returns one single-entry {probe_id: label} dict per group.
partial_results = Parallel(n_jobs=n_jobs)(
    delayed(find_label)(name, group) for name, group in groups
)

# Merge the single-entry dicts into one lookup table. dict.update works on
# both Python 2 and 3, unlike indexing the result of items().
predLabel = {}
for partial in partial_results:
    predLabel.update(partial)
print('')

# Predicted label of every probe, in the order of the training set.
trains = train['probeId'][:, 0]
preds = [predLabel[i] for i in trains]

# Fraction of probes whose predicted label equals the true label.
accuracy = np.mean(np.asarray(preds) == train['probeLabel'][:, 0])

print('')
print(accuracy)



0.938897451597


In [9]:
# 100-NN: acc=0.96085

# NOTE(review): `a` is not defined in any cell visible in this notebook; this
# looks like a leftover scratch expression that fails on a fresh kernel.
a
1
