In [1]:
# imports for code 
import pandas as pd
import numpy as np 

# load the following csv files as dataframe 
url_train = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/train/train.csv' 
url_test ='https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/test.csv'
url_test_more_classes1 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/more_classes/test_more_classes1.csv'
url_test_more_classes2 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/more_classes/test_more_classes2.csv'
url_test_more_classes3 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/data/test/more_classes/test_more_classes3.csv'
url_nn_3 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/feature_extraction/results_csv/nearest_neighbor_embedded_test_K%3D3_landmarks_only.csv'
url_nn_5 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/feature_extraction/results_csv/nearest_neighbor_embedded_test_K%3D5.csv'
url_nn_7 = 'https://raw.githubusercontent.com/matankleiner/Identify-Known-Sites-in-Photo-Album/master/feature_extraction/results_csv/nearest_neighbor_embedded_test_K%3D7_landmarks_only.csv'

train_df = pd.read_csv(url_train)
test_df = pd.read_csv(url_test) 
test_more_classes1_df = pd.read_csv(url_test_more_classes1)
test_more_classes2_df = pd.read_csv(url_test_more_classes2)
test_more_classes3_df = pd.read_csv(url_test_more_classes3)
nn_3_df = pd.read_csv(url_nn_3)
nn_5_df = pd.read_csv(url_nn_5)
nn_7_df = pd.read_csv(url_nn_7)

In [2]:
def change_df(df): 
    """
    Changing the dataframe so it will be easier to work with. 
    Param: 
        df (pd.DataFrame): The dataframe to change 
    Return: 
        df (pd.DataFrame): The chnaged dataframe 
    """
    df = df.drop("Unnamed: 0", axis=1)
    df.insert(0, "id", test_df["id"], True) 
    return df 

nn_3_df = change_df(nn_3_df)
nn_5_df = change_df(nn_5_df)
nn_7_df = change_df(nn_7_df)

In [3]:
def train_indx_to_class(nn_df, train_df, K):
    """
    The nn_df hold the index of the matching neighbor in the train set, this function replace it with the matching class 
    Param: 
        nn_df (DataFrame): The nearest neighbors dataframe
        train_df (DataFrame): The train set dataframe
        K (int): The number of nearest neighbors 
    Return: 
        nn_df (DataFrame): Thenearest neighbors dataframe, each column k [k is in (0,1,2...K-1)] contain the
                           class of the (k+1) nearest neighbor
    """
    for k in range(K): 
        col_to_replace = train_df.loc[nn_df[str(k)]]["landmark_id"]
        nn_df[str(k)] = col_to_replace.values
    
    return nn_df 

nn_3_df = train_indx_to_class(nn_3_df, train_df, 3)
nn_5_df = train_indx_to_class(nn_5_df, train_df, 5)
nn_7_df = train_indx_to_class(nn_7_df, train_df, 7)

In [4]:
# create a series of all the indices where there is a landmark 
landmark_inidices = test_df['landmarks'] != 0 
landmark_inidices = landmark_inidices[landmark_inidices].index

# create a smaller dataframe of the test set that made out only of the landmarks 
landmarks_df = test_df.loc[landmark_inidices]
landmarks_df = landmarks_df.reset_index()
nn_5_df = nn_5_df.reset_index()
nn_5_df = nn_5_df.drop("index", axis=1)
landmarks_more_classes1_df = test_more_classes1_df.loc[landmark_inidices]
landmarks_more_classes1_df = landmarks_more_classes1_df.reset_index()
landmarks_more_classes2_df = test_more_classes2_df.loc[landmark_inidices]
landmarks_more_classes2_df = landmarks_more_classes2_df.reset_index()
landmarks_more_classes3_df = test_more_classes3_df.loc[landmark_inidices]
landmarks_more_classes3_df = landmarks_more_classes3_df.reset_index()

nn_5_df = nn_5_df.loc[landmark_inidices] 
nn_5_df = nn_5_df.reset_index()
nn_5_df = nn_5_df.drop("index", axis=1)

In [5]:
# create a dataframe of the landmark id, most frequent class and second most frequent class for K=3 neighbors 
nn_3_df_no_id = nn_3_df.drop("id", axis=1) 
most_freq3 = []
second_most_freq3 = []
for i in range (nn_3_df_no_id.shape[0]):
    row3 = nn_3_df_no_id.loc[i]
    freq_class3 = row3.value_counts()
    freq_class3 = pd.DataFrame(freq_class3)
    freq_class3 = freq_class3.reset_index()
    if freq_class3[i][0] == 1:
        most_freq3.append(nn_3_df_no_id["0"][i])
        second_most_freq3.append(nn_3_df_no_id["1"][i])
    elif freq_class3[i][0] == 3:
        most_freq3.append(nn_3_df_no_id["0"][i])
        second_most_freq3.append(0)
    elif freq_class3[i][0] == 2:
        most_freq3.append(freq_class3["index"][0])
        second_most_freq3.append(freq_class3["index"][1])
        
nn_3_freq_class_df = pd.DataFrame(nn_3_df["id"])
nn_3_freq_class_df['most freq'] = pd.DataFrame(most_freq3)
nn_3_freq_class_df['second most freq'] = pd.DataFrame(second_most_freq3)

most_freq_series_3n_1 = landmarks_df["landmarks"] == nn_3_freq_class_df['most freq'] 
most_freq_series_3n_2 = landmarks_more_classes1_df["landmarks"] == nn_3_freq_class_df['most freq']
most_freq_series_3n_3 = landmarks_more_classes2_df["landmarks"] == nn_3_freq_class_df['most freq']
most_freq_series_3n_4 = landmarks_more_classes3_df["landmarks"] == nn_3_freq_class_df['most freq']
 
second_most_freq_serires_3n_1 = landmarks_df["landmarks"] == nn_3_freq_class_df['second most freq'] 
second_most_freq_serires_3n_2 = landmarks_more_classes1_df["landmarks"] == nn_3_freq_class_df['second most freq'] 
second_most_freq_serires_3n_3 = landmarks_more_classes2_df["landmarks"] == nn_3_freq_class_df['second most freq'] 
second_most_freq_serires_3n_4 = landmarks_more_classes3_df["landmarks"] == nn_3_freq_class_df['second most freq'] 

most_freq_3_indices = most_freq_series_5n_1[most_freq_series_5n_1].index.append\
                      (most_freq_series_5n_2[most_freq_series_5n_2].index).append\
                      (most_freq_series_5n_3[most_freq_series_5n_3].index).append\
                      (most_freq_series_5n_4[most_freq_series_5n_4].index)
second_most_freq_5_indices = second_most_freq_serires_5n_1[second_most_freq_serires_5n_1].index.append\
                             (second_most_freq_serires_5n_2[second_most_freq_serires_5n_2].index).append\
                             (second_most_freq_serires_5n_3[second_most_freq_serires_5n_3].index).append\
                             (second_most_freq_serires_5n_4[second_most_freq_serires_5n_4].index)
predicted5 = len(most_freq_5_indices.append(second_most_freq_5_indices).unique())

predicted3 = len(most_freq_series3[most_freq_series3]) + len([second_most_freq_series3[second_most_freq_series3]]) 

print("For K=3 neighbors, the number of landmarks that are part of the most frequent class or the second most frequent"
      " class is    {}, which is {:.2f}%  accuracy.".format(predicted3, predicted3/landmarks_df.shape[0]*100))

NameError: name 'most_freq_series_5n_1' is not defined

In [None]:
# create a dataframe of the landmark id, most frequent class and second most frequent class for K=5 neighbors 
nn_5_df_no_id = nn_5_df.drop("id", axis=1) 
most_freq5 = []
second_most_freq5 = []
for i in range (nn_5_df_no_id.shape[0]):
    row5 = nn_5_df_no_id.loc[i]
    freq_class5 = row5.value_counts()
    freq_class5 = pd.DataFrame(freq_class5)
    freq_class5 = freq_class5.reset_index()
    if freq_class5[i][0] == 1:
        most_freq5.append(nn_5_df_no_id["0"][i])
        second_most_freq5.append(nn_5_df_no_id["1"][i])
    elif freq_class5[i][0] == 5:
        most_freq5.append(nn_5_df_no_id["0"][i])
        second_most_freq5.append(0)
    elif freq_class5[i][0] == 4:
        most_freq5.append(freq_class5["index"][0])
        second_most_freq5.append(freq_class5["index"][1])
    elif freq_class5[i][0] == 3:
        most_freq5.append(freq_class5["index"][0])
        second_most_freq5.append(freq_class5["index"][1])
    elif freq_class5[i][0] == 2 and freq_class5[i][1] == 1:
        most_freq5.append(freq_class5["index"][0])
        for k in range(5):
            if nn_5_df_no_id[str(k)][i] != freq_class5["index"][0]:
                second_most_freq5.append(nn_5_df_no_id[str(k)][i])
    elif freq_class5[i][0] == 2 and freq_class5[i][1] == 2: 
        most_freq5.append(freq_class5["index"][0])
        second_most_freq5.append(freq_class5["index"][1])
        
nn_5_freq_class_df = pd.DataFrame(nn_5_df["id"])
nn_5_freq_class_df['most freq'] = pd.DataFrame(most_freq5)
nn_5_freq_class_df['second most freq'] = pd.DataFrame(second_most_freq5)

most_freq_series_5n_1 = landmarks_df["landmarks"] == nn_5_freq_class_df['most freq'] 
most_freq_series_5n_2 = landmarks_more_classes1_df["landmarks"] == nn_5_freq_class_df['most freq']
most_freq_series_5n_3 = landmarks_more_classes2_df["landmarks"] == nn_5_freq_class_df['most freq']
most_freq_series_5n_4 = landmarks_more_classes3_df["landmarks"] == nn_5_freq_class_df['most freq']

second_most_freq_serires_5n_1 = landmarks_df["landmarks"] == nn_5_freq_class_df['second most freq'] 
second_most_freq_serires_5n_2 = landmarks_more_classes1_df["landmarks"] == nn_5_freq_class_df['second most freq'] 
second_most_freq_serires_5n_3 = landmarks_more_classes2_df["landmarks"] == nn_5_freq_class_df['second most freq'] 
second_most_freq_serires_5n_4 = landmarks_more_classes3_df["landmarks"] == nn_5_freq_class_df['second most freq'] 

most_freq_5_indices = most_freq_series_5n_1[most_freq_series_5n_1].index.append\
                      (most_freq_series_5n_2[most_freq_series_5n_2].index).append\
                      (most_freq_series_5n_3[most_freq_series_5n_3].index).append\
                      (most_freq_series_5n_4[most_freq_series_5n_4].index)
second_most_freq_5_indices = second_most_freq_serires_5n_1[second_most_freq_serires_5n_1].index.append\
                             (second_most_freq_serires_5n_2[second_most_freq_serires_5n_2].index).append\
                             (second_most_freq_serires_5n_3[second_most_freq_serires_5n_3].index).append\
                             (second_most_freq_serires_5n_4[second_most_freq_serires_5n_4].index)
predicted5 = len(most_freq_5_indices.append(second_most_freq_5_indices).unique())

print("For K=5 neighbors, the number of landmarks that are part of the most frequent class or the second most frequent"
      " class is    {}, which is {:.2f}%  accuracy.".format(predicted5, predicted5/landmarks_df.shape[0]*100))

In [None]:
cntr3 = 0
for i in range(landmarks_df.shape[0]): 
    if landmarks_df["landmarks"][i] == nn_3_df["0"][i] or landmarks_df["landmarks"][i] == nn_3_df["1"][i] \
       or landmarks_df["landmarks"][i] == nn_3_df["2"][i]:
        cntr3 += 1

print("For K=3 neighbors, the number of landmarks that their class is appeared in at least one of the nearest neighbors"
      " is {},    which is {:.2f}%  out of all the landamrks.".format(cntr3, cntr3/landmarks_df.shape[0]*100))

cntr5 = 0
for i in range(landmarks_df.shape[0]): 
    if landmarks_df["landmarks"][i] == nn_5_df["0"][i] or landmarks_df["landmarks"][i] == nn_5_df["1"][i] \
       or landmarks_df["landmarks"][i] == nn_5_df["2"][i] or landmarks_df["landmarks"][i] == nn_5_df["3"][i] \
       or landmarks_df["landmarks"][i] == nn_5_df["4"][i]: 
        cntr5 += 1

print("\nFor K=5 neighbors, the number of landmarks that their class is appeared in at least one of the nearest neighbors"
      " is {},    which is {:.2f}%  out of all the landamrks.".format(cntr5, cntr5/landmarks_df.shape[0]*100))

cntr7 = 0
for i in range(landmarks_df.shape[0]): 
    if landmarks_df["landmarks"][i] == nn_7_df["0"][i] or landmarks_df["landmarks"][i] == nn_7_df["1"][i] \
       or landmarks_df["landmarks"][i] == nn_7_df["2"][i] or landmarks_df["landmarks"][i] == nn_7_df["3"][i] \
       or landmarks_df["landmarks"][i] == nn_7_df["4"][i] or landmarks_df["landmarks"][i] == nn_7_df["5"][i] \
       or landmarks_df["landmarks"][i] == nn_7_df["6"][i]: 
        cntr7 += 1

print("\nFor K=7 neighbors, the number of landmarks that their class is appeared in at least one of the nearest neighbors"
      " is {},    which is {:.2f}%  out of all the landamrks.".format(cntr7, cntr7/landmarks_df.shape[0]*100))