# Evaluation with GAP 

We'll also evaluate our success with the GAP metric which is the [official metric for the comeptition](https://www.kaggle.com/c/landmark-recognition-2020/overview/evaluation).

The GAP metric:  $GAP=\frac{1}{N} \sum_{i=1}^{N} P(i)rel(i)  $

We'll check our success only on the landmarks and not on the whole dataset. 

In [2]:
# imports for code 
import pandas as pd
import numpy as np 

In [3]:
# load the following csv files as dataframe 
url_train = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/train/train.csv' 
url_test ='https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/test.csv'
url_test_more_classes1 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/more_classes/test_more_classes1.csv'
url_test_more_classes2 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/more_classes/test_more_classes2.csv'
url_test_more_classes3 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/more_classes/test_more_classes3.csv'
url_nn_3 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/feature_extraction/results_csv/nearest_neighbor_embedded_test_K%3D3_landmarks_only.csv'
url_nn_5 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/feature_extraction/results_csv/nearest_neighbor_embedded_test_K%3D5.csv'
url_nn_7 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/feature_extraction/results_csv/nearest_neighbor_embedded_test_K%3D7_landmarks_only.csv'

train_df = pd.read_csv(url_train)
test_df = pd.read_csv(url_test) 
test_more_classes1_df = pd.read_csv(url_test_more_classes1)
test_more_classes2_df = pd.read_csv(url_test_more_classes2)
test_more_classes3_df = pd.read_csv(url_test_more_classes3)
nn_3_df = pd.read_csv(url_nn_3)
nn_5_df = pd.read_csv(url_nn_5)
nn_7_df = pd.read_csv(url_nn_7)

In [4]:
def change_df(df): 
    """
    Changing the dataframe so it will be easier to work with. 
    Param: 
        df (pd.DataFrame): The dataframe to change 
    Return: 
        df (pd.DataFrame): The chnaged dataframe 
    """
    df = df.drop("Unnamed: 0", axis=1)
    df.insert(0, "id", test_df["id"], True) 
    return df 

nn_3_df = change_df(nn_3_df)
nn_5_df = change_df(nn_5_df)
nn_7_df = change_df(nn_7_df)

In [5]:
def train_indx_to_class(nn_df, train_df, K):
    """
    The nn_df hold the index of the matching neighbor in the train set, this function replace it with the matching class 
    Param: 
        nn_df (DataFrame): The nearest neighbors dataframe
        train_df (DataFrame): The train set dataframe
        K (int): The number of nearest neighbors 
    Return: 
        nn_df (DataFrame): Thenearest neighbors dataframe, each column k [k is in (0,1,2...K-1)] contain the
                           class of the (k+1) nearest neighbor
    """
    for k in range(K): 
        col_to_replace = train_df.loc[nn_df[str(k)]]["landmark_id"]
        nn_df[str(k)] = col_to_replace.values
    
    return nn_df 

nn_3_df = train_indx_to_class(nn_3_df, train_df, 3)
nn_5_df = train_indx_to_class(nn_5_df, train_df, 5)
nn_7_df = train_indx_to_class(nn_7_df, train_df, 7)

In [6]:
# create a series of all the indices where there is a landmark 
landmark_inidices = test_df['landmarks'] != 0 
landmark_inidices = landmark_inidices[landmark_inidices].index

# create a smaller dataframe of the test set that made out only of the landmarks 
landmarks_df = test_df.loc[landmark_inidices]
landmarks_df = landmarks_df.reset_index()
landmarks_more_classes1_df = test_more_classes1_df.loc[landmark_inidices]
landmarks_more_classes1_df = landmarks_more_classes1_df.reset_index()
landmarks_more_classes2_df = test_more_classes2_df.loc[landmark_inidices]
landmarks_more_classes2_df = landmarks_more_classes2_df.reset_index()
landmarks_more_classes3_df = test_more_classes3_df.loc[landmark_inidices]
landmarks_more_classes3_df = landmarks_more_classes3_df.reset_index()

nn_landmarks_5_df = nn_5_df.loc[landmark_inidices] 
nn_landmarks_5_df = nn_landmarks_5_df.reset_index()
nn_landmarks_5_df = nn_landmarks_5_df.drop("index", axis=1)

In [13]:
def naiveGAP(N, K, nn_landmarks_df, ground_truth_df):
    """
    Calculating the GAP metric, with equal precision to each neighbor.  
    Param: 
        N (int): Number of values in the test set 
        K (int): Number of neighbors 
        nn_ladnamrk_df (DataFrame): A dataframe of the nearest neighbors' class
        ground_truth_df (DataFrame): The true class of each item from the test set
    Return: 
        GAP_score (int): The calculated GAP_score 
    """
    GAP_score = 0
    for i in range(N): 
        for j in range(K):
             if nn_landmarks_df.loc[i][j+1] == ground_truth_df["landmarks"][i]: 
                    GAP_score += 1 / K
    return GAP_score / N

N = landmarks_df.shape[0]

### K = 3
naiveGAP_3K_1 = naiveGAP(N, 3, nn_3_df, landmarks_df)
naiveGAP_3K_2 = naiveGAP(N, 3, nn_3_df, landmarks_more_classes1_df)
naiveGAP_3K_3 = naiveGAP(N, 3, nn_3_df, landmarks_more_classes2_df)
naiveGAP_3K_4 = naiveGAP(N, 3, nn_3_df, landmarks_more_classes3_df)
naiveGAP_3K = naiveGAP_3K_1 + naiveGAP_3K_3 + naiveGAP_3K_3 + naiveGAP_3K_4

print("The GAP score for K = 3 nearest neighbors is {:.3f} where each neigbhor get equal precision.".format(naiveGAP_3K))

### K = 5
naiveGAP_5K_1 = naiveGAP(N, 5, nn_landmarks_5_df, landmarks_df)
naiveGAP_5K_2 = naiveGAP(N, 5, nn_landmarks_5_df, landmarks_more_classes1_df)
naiveGAP_5K_3 = naiveGAP(N, 5, nn_landmarks_5_df, landmarks_more_classes2_df)
naiveGAP_5K_4 = naiveGAP(N, 5, nn_landmarks_5_df, landmarks_more_classes3_df)
naiveGAP_5K = naiveGAP_5K_1 + naiveGAP_5K_2 + naiveGAP_5K_3 + naiveGAP_5K_4

print("The GAP score for K = 5 nearest neighbors is {:.3f} where each neigbhor get equal precision.".format(naiveGAP_5K))

### K = 7
naiveGAP_7K_1 = naiveGAP(N, 7, nn_7_df, landmarks_df)
naiveGAP_7K_2 = naiveGAP(N, 7, nn_7_df, landmarks_more_classes1_df)
naiveGAP_7K_3 = naiveGAP(N, 7, nn_7_df, landmarks_more_classes2_df)
naiveGAP_7K_4 = naiveGAP(N, 7, nn_7_df, landmarks_more_classes3_df)
naiveGAP_7K = naiveGAP_7K_1 + naiveGAP_7K_2 + naiveGAP_7K_3 + naiveGAP_7K_4

print("The GAP score for K = 7 nearest neighbors is {:.3f} where each neigbhor get equal precision.".format(naiveGAP_7K))

The GAP score for K = 3 nearest neighbors is 0.133 where each neigbhor get equal precision.
The GAP score for K = 5 nearest neighbors is 0.124 where each neigbhor get equal precision.
The GAP score for K = 7 nearest neighbors is 0.111 where each neigbhor get equal precision.


In [11]:
### K = 3
def GAP3(N, nn_landmarks_df, ground_truth_df):
    """
    Calculating the GAP metric where the closer the neighbor, the higher the precision, K=3. 
    Param: 
        N (int): Number of values in the test set 
        K (int): Number of neighbors 
        nn_ladnamrk_df (DataFrame): A dataframe of the nearest neighbors' class
        ground_truth_df (DataFrame): The true class of each item from the test set
    Return: 
        GAP_score (int): The calculated GAP_score 
    """
    GAP_score = 0
    for i in range(N): 
        for j in range(3):
             if nn_landmarks_df.loc[i][j+1] == ground_truth_df["landmarks"][i]:
                    if j == 0:
                        GAP_score += 0.5
                    if j == 1:
                        GAP_score += 0.3
                    if j == 2:
                        GAP_score += 0.2
    return GAP_score / N

N = landmarks_df.shape[0]

GAP_3K_1 = GAP3(N, nn_3_df, landmarks_df)
GAP_3K_2 = GAP3(N, nn_3_df, landmarks_more_classes1_df)
GAP_3K_3 = GAP3(N, nn_3_df, landmarks_more_classes2_df)
GAP_3K_4 = GAP3(N, nn_3_df, landmarks_more_classes3_df)
GAP_3K = GAP_3K_1 + GAP_3K_2 + GAP_3K_3 + GAP_3K_4

print("The GAP score for K = 3 nearest neighbors is {:.3f}.".format(GAP_3K))

The GAP score for K = 3 nearest neighbors is 0.156.


In [8]:
### K = 5
def GAP5(N, nn_landmarks_df, ground_truth_df):
    """
    Calculating the GAP metric where the closer the neighbor, the higher the precision, K=5. 
    Param: 
        N (int): Number of values in the test set 
        K (int): Number of neighbors 
        nn_ladnamrk_df (DataFrame): A dataframe of the nearest neighbors' class
        ground_truth_df (DataFrame): The true class of each item from the test set
    Return: 
        GAP_score (int): The calculated GAP_score 
    """
    GAP_score = 0
    for i in range(N): 
        for j in range(5):
             if nn_landmarks_df.loc[i][j+1] == ground_truth_df["landmarks"][i]:
                    if j == 0:
                        GAP_score += 0.4
                    if j == 1:
                        GAP_score += 0.3
                    if j == 2:
                        GAP_score += 0.2
                    if j == 3:
                        GAP_score += 0.075
                    if j == 4:
                        GAP_score += 0.025
    return GAP_score / N

N = landmarks_df.shape[0]

GAP_5K_1 = GAP5(N, nn_landmarks_5_df, landmarks_df)
GAP_5K_2 = GAP5(N, nn_landmarks_5_df, landmarks_more_classes1_df)
GAP_5K_3 = GAP5(N, nn_landmarks_5_df, landmarks_more_classes2_df)
GAP_5K_4 = GAP5(N, nn_landmarks_5_df, landmarks_more_classes3_df)
GAP_5K = GAP_5K_1 + GAP_5K_2 + GAP_5K_3 + GAP_5K_4

print("The GAP score for K = 5 nearest neighbors is {:.3f}.".format(GAP_5K))

The GAP score for K = 5 nearest neighbors is 0.147.


In [12]:
### K = 7
def GAP7(N, nn_landmarks_df, ground_truth_df):
    """
    Calculating the GAP metric where the closer the neighbor, the higher the precision, K=7. 
    Param: 
        N (int): Number of values in the test set 
        K (int): Number of neighbors 
        nn_ladnamrk_df (DataFrame): A dataframe of the nearest neighbors' class
        ground_truth_df (DataFrame): The true class of each item from the test set
    Return: 
        GAP_score (int): The calculated GAP_score 
    """
    GAP_score = 0
    for i in range(N): 
        for j in range(7):
             if nn_landmarks_df.loc[i][j+1] == ground_truth_df["landmarks"][i]:
                    if j == 0:
                        GAP_score += 0.4
                    if j == 1:
                        GAP_score += 0.3
                    if j == 2:
                        GAP_score += 0.15
                    if j == 3:
                        GAP_score += 0.05
                    if j == 4:
                        GAP_score += 0.045
                    if j == 5:
                        GAP_score += 0.035
                    if j == 6:
                        GAP_score += 0.02
                        
    return GAP_score / N

N = landmarks_df.shape[0]

GAP_7K_1 = GAP7(N, nn_7_df, landmarks_df)
GAP_7K_2 = GAP7(N, nn_7_df, landmarks_more_classes1_df)
GAP_7K_3 = GAP7(N, nn_7_df, landmarks_more_classes2_df)
GAP_7K_4 = GAP7(N, nn_7_df, landmarks_more_classes3_df)
GAP_7K = GAP_7K_1 + GAP_7K_2 + GAP_7K_3 + GAP_7K_4

print("The GAP score for K = 7 nearest neighbors is {:.3f}.".format(GAP_7K))

The GAP score for K = 7 nearest neighbors is 0.145.
