# Evaluation with GAP 

We'll also evaluate our success with the GAP (Global Average Precision) metric, which is the [official metric for the competition](https://www.kaggle.com/c/landmark-recognition-2020/overview/evaluation). However, we'll measure it only on the landmark images and not on the whole dataset. 

In [55]:
# imports for code 
import pandas as pd
import numpy as np 

In [74]:
# Remote CSV sources: the train / test ground truth, three extra ground-truth
# files for the test set, and the saved nearest-neighbor results for K = 3, 5, 7.
url_train = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/train/train.csv' 
url_test ='https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/test.csv'
url_test_more_classes1 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/more_classes/test_more_classes1.csv'
url_test_more_classes2 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/more_classes/test_more_classes2.csv'
url_test_more_classes3 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/more_classes/test_more_classes3.csv'
# NOTE(review): the K=5 file name is the only one without the `_landmarks_only`
# suffix, and nn_5_df is the only neighbors frame that is filtered by the
# landmark rows later on -- confirm that the K=3 / K=7 files really hold one
# row per landmark image, in the same order.
url_nn_3 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/feature_extraction/results_csv/nearest_neighbor_embedded_test_K%3D3_landmarks_only.csv'
url_nn_5 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/feature_extraction/results_csv/nearest_neighbor_embedded_test_K%3D5.csv'
url_nn_7 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/feature_extraction/results_csv/nearest_neighbor_embedded_test_K%3D7_landmarks_only.csv'

# Read every CSV into its own dataframe, in the order the URLs are declared.
train_df, test_df = (pd.read_csv(url) for url in (url_train, url_test))
test_more_classes1_df, test_more_classes2_df, test_more_classes3_df = (
    pd.read_csv(url)
    for url in (url_test_more_classes1, url_test_more_classes2, url_test_more_classes3)
)
nn_3_df, nn_5_df, nn_7_df = (
    pd.read_csv(url) for url in (url_nn_3, url_nn_5, url_nn_7)
)

In [75]:
def change_df(df, ids=None): 
    """
    Changing the dataframe so it will be easier to work with. 

    The leftover "Unnamed: 0" column is dropped when it is present, and an
    "id" column is put first when the frame does not have one yet. The frame
    that was passed in is left untouched, and calling the function again on
    its own result is a no-op, so the cell can safely be re-run.

    Param: 
        df (pd.DataFrame): The dataframe to change 
        ids (pd.Series, optional): Values for the new "id" column. pandas
            aligns them to df by index label. Defaults to test_df["id"].
    Return: 
        df (pd.DataFrame): The changed dataframe 
    """
    if ids is None:
        # Backward-compatible default: the ids of the (global) test set.
        ids = test_df["id"]

    # errors="ignore" keeps a second run from failing on the missing column.
    df = df.drop(columns="Unnamed: 0", errors="ignore")

    # NOTE(review): the ids are matched by index label, so a frame that holds
    # only a subset of the test rows under a fresh 0..n-1 index receives the
    # ids of the first n test rows -- confirm this is what is wanted for the
    # landmarks-only frames.
    if "id" not in df.columns:
        df.insert(0, "id", ids)
    return df 

# Tidy the three nearest-neighbor frames (K = 3, 5, 7) in one pass.
nn_3_df, nn_5_df, nn_7_df = [
    change_df(frame) for frame in (nn_3_df, nn_5_df, nn_7_df)
]

In [77]:
def train_indx_to_class(nn_df, train_df, K):
    """
    Swap the train-set row labels stored in the neighbor columns for the class of those rows.

    Columns "0" ... "K-1" of nn_df are overwritten in place with the
    "landmark_id" of the train rows they point to; the same dataframe object
    is returned.

    Param: 
        nn_df (DataFrame): The nearest neighbors dataframe (modified in place)
        train_df (DataFrame): The train set dataframe, must have a "landmark_id" column
        K (int): The number of nearest neighbors 
    Return: 
        nn_df (DataFrame): The nearest neighbors dataframe, each column k [k is in (0,1,2...K-1)] contain the
                           class of the (k+1) nearest neighbor
    """
    class_of_train_row = train_df["landmark_id"]
    for column in map(str, range(K)):
        # .values drops the train-set index so the assignment is positional
        nn_df[column] = class_of_train_row.loc[nn_df[column]].values

    return nn_df 

# Replace the train-set indices by classes in each neighbors frame, using the
# K that belongs to that frame.
nn_3_df, nn_5_df, nn_7_df = [
    train_indx_to_class(frame, train_df, k)
    for frame, k in ((nn_3_df, 3), (nn_5_df, 5), (nn_7_df, 7))
]

In [80]:
# Index labels of the test rows whose 'landmarks' value is not 0
landmark_inidices = test_df.index[(test_df['landmarks'] != 0).to_numpy()]

# Keep only those rows of every ground-truth frame. reset_index() renumbers
# the rows from 0 and keeps the old row label in an "index" column.
landmarks_df = test_df.loc[landmark_inidices].reset_index()
landmarks_more_classes1_df = test_more_classes1_df.loc[landmark_inidices].reset_index()
landmarks_more_classes2_df = test_more_classes2_df.loc[landmark_inidices].reset_index()
landmarks_more_classes3_df = test_more_classes3_df.loc[landmark_inidices].reset_index()

# Same selection for the K = 5 neighbors; here the "index" column is removed
# again, so only the original columns are left.
nn_landmarks_5_df = (
    nn_5_df.loc[landmark_inidices]
    .reset_index()
    .drop(columns="index")
)

In [94]:
# Show the K = 7 neighbors frame (the "id" column followed by columns "0".."6").
nn_7_df

Unnamed: 0,id,0,1,2,3,4,5,6
0,e324e0f3e6d9e504,42422,79959,138982,93154,147263,67416,180450
1,d9e17c5f3e0c47b3,14968,41941,95885,117418,38746,90025,64278
2,1a748a755ed67512,5156,164193,164193,67109,84309,151722,619
3,537bf9bdfccdafea,48328,69301,136675,158991,136675,174697,197341
4,13f4c974274ee08b,136675,202793,25369,187755,188686,14612,4103
...,...,...,...,...,...,...,...,...
1617,0d5c4cab8cf9deb6,29367,30181,29367,162403,162403,30181,162403
1618,cf493f239ec7d6dd,187717,161163,61082,66563,201554,100410,52674
1619,eb68d4f6f237baa0,107289,58419,149438,125881,124455,16832,34999
1620,10e63b3397470f5c,29515,174253,174253,174253,174253,133563,174253


In [96]:
# Show the K = 5 neighbors frame (the "id" column followed by columns "0".."4").
nn_5_df

Unnamed: 0,id,0,1,2,3,4
0,e324e0f3e6d9e504,42422,79959,138982,93154,147263
1,d9e17c5f3e0c47b3,14968,41941,95885,117418,38746
2,1a748a755ed67512,5156,164193,164193,67109,84309
3,537bf9bdfccdafea,48328,69301,136675,158991,136675
4,13f4c974274ee08b,136675,202793,25369,187755,188686
...,...,...,...,...,...,...
117222,e351c3e672c25fbd,47663,190441,23777,23777,56062
117223,5426472625271a4d,54785,54785,54785,54785,113750
117224,7b6a585405978398,171111,112512,200128,21500,142109
117225,d885235ba249cf5d,162403,162403,162403,115930,136675


In [97]:
# Show the K = 3 neighbors frame (the "id" column followed by columns "0".."2").
nn_3_df

Unnamed: 0,id,0,1,2
0,e324e0f3e6d9e504,42422,79959,138982
1,d9e17c5f3e0c47b3,14968,41941,95885
2,1a748a755ed67512,5156,164193,164193
3,537bf9bdfccdafea,48328,69301,136675
4,13f4c974274ee08b,136675,202793,25369
...,...,...,...,...
1617,0d5c4cab8cf9deb6,29367,30181,29367
1618,cf493f239ec7d6dd,187717,161163,61082
1619,eb68d4f6f237baa0,107289,58419,149438
1620,10e63b3397470f5c,29515,174253,174253


In [106]:
# Element-wise comparison of the nearest neighbor's class (column "0") with
# the ground-truth "landmarks" column. The result is bound to x but is not
# used by any of the cells visible here -- NOTE(review): looks like a leftover
# debugging step.
x = nn_7_df["0"] == landmarks_df["landmarks"]

In [101]:
# Ad-hoc score for K = 7: per landmark image, the fraction of its K neighbors
# whose class equals the ground truth, averaged over all N images.
K = 7 
N = landmarks_df.shape[0]

# Address the neighbor columns by label ("0" .. "K-1"). Indexing a row with an
# integer (row[j+1]) relies on a positional fallback that pandas has
# deprecated, and on "id" being the first column.
rows = list(range(N))
neighbor_columns = [str(j) for j in range(K)]
hits = nn_7_df.loc[rows, neighbor_columns].eq(
    landmarks_df["landmarks"].loc[rows], axis=0
)

# Total number of matching neighbors, weighted by 1 / K
GAP_score = float(hits.to_numpy().sum()) / K

print(GAP_score / N)

0.0


In [82]:
def GAP(N, K, nn_landmarks_df, ground_truth_df):
    """
    Calculating the score that this notebook reports as GAP.

    For each of the rows labelled 0 .. N-1, the K neighbor columns
    ("0" .. "K-1") of nn_landmarks_df are compared with the "landmarks" value
    of the same row in ground_truth_df. The score is the number of matches
    divided by K * N, i.e. the mean fraction of matching neighbors per row.

    NOTE(review): this is a simplified score -- it does not rank predictions
    by confidence the way the competition's Global Average Precision does.

    Param: 
        N (int): Number of values in the test set 
        K (int): Number of neighbors 
        nn_landmarks_df (DataFrame): A dataframe of the nearest neighbors' class,
                                     with the neighbor columns labelled "0" .. "K-1"
        ground_truth_df (DataFrame): The true class of each item from the test set,
                                     in a "landmarks" column
    Return: 
        GAP_score (float): The calculated GAP_score, 0.0 when N or K is not positive
    """
    # Nothing to compare; also avoids dividing by zero.
    if N <= 0 or K <= 0:
        return 0.0

    rows = list(range(N))
    # Select the neighbor columns by label instead of by position: integer
    # keys on a string-labelled row are a deprecated positional fallback in
    # pandas, and positions silently depend on "id" being the first column.
    neighbor_columns = [str(j) for j in range(K)]
    neighbors = nn_landmarks_df.loc[rows, neighbor_columns]
    truth = ground_truth_df["landmarks"].loc[rows]

    # Row-wise comparison of every neighbor column with the ground truth
    matches = neighbors.eq(truth, axis=0).to_numpy().sum()
    return float(matches) / (K * N)

# Number of landmark images in the test set
N = len(landmarks_df)

# Each K below is scored against the same four ground-truth frames and the
# four partial scores are added up.
ground_truth_frames = (
    landmarks_df,
    landmarks_more_classes1_df,
    landmarks_more_classes2_df,
    landmarks_more_classes3_df,
)

### K = 3
GAP_3K_1, GAP_3K_2, GAP_3K_3, GAP_3K_4 = (
    GAP(N, 3, nn_3_df, truth) for truth in ground_truth_frames
)
GAP_3K = GAP_3K_1 + GAP_3K_2 + GAP_3K_3 + GAP_3K_4

print("The GAP score for K = 3 nearest neighbors is {:.3f}.".format(GAP_3K))

### K = 5 (uses the frame that was filtered down to the landmark rows)
GAP_5K_1, GAP_5K_2, GAP_5K_3, GAP_5K_4 = (
    GAP(N, 5, nn_landmarks_5_df, truth) for truth in ground_truth_frames
)
GAP_5K = GAP_5K_1 + GAP_5K_2 + GAP_5K_3 + GAP_5K_4

print("The GAP score for K = 5 nearest neighbors is {:.3f}.".format(GAP_5K))

### K = 7
GAP_7K_1, GAP_7K_2, GAP_7K_3, GAP_7K_4 = (
    GAP(N, 7, nn_7_df, truth) for truth in ground_truth_frames
)
GAP_7K = GAP_7K_1 + GAP_7K_2 + GAP_7K_3 + GAP_7K_4

print("The GAP score for K = 7 nearest neighbors is {:.3f}.".format(GAP_7K))

The GAP score for K=3 nearest neighbors is 0.0
The GAP score for K=5 nearest neighbors is 0.12355117139334036
The GAP score for K=7 nearest neighbors is 0.0
