# similar images using nearest neighbor on deep features

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics,model_selection
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the train and test image tables from CSV files in the working directory.
train,test=(pd.read_csv(csv_path) for csv_path in ('image_train_data.csv','image_test_data.csv'))

In [3]:
train.head()

Unnamed: 0,id,image,label,deep_features,image_array
0,24,Height: 32 Width: 32,bird,[0.242872 1.09545 0 0.39363 0 0 11.8949 0 0 0 ...,[73 77 58 71 68 50 77 69 44 120 116 83 125 120...
1,33,Height: 32 Width: 32,cat,[0.525088 0 0 0 0 0 9.94829 0 0 0 0 0 1.01264 ...,[7 5 8 7 5 8 5 4 6 7 4 7 11 5 9 11 5 9 17 11 1...
2,36,Height: 32 Width: 32,cat,[0.566016 0 0 0 0 0 9.9972 0 0 0 1.38345 0 0.7...,[169 122 65 131 108 75 193 196 192 218 221 222...
3,70,Height: 32 Width: 32,dog,[1.1298 0 0 0.778194 0 0.758051 9.83053 0 0 0....,[154 179 152 159 183 157 165 189 162 174 199 1...
4,90,Height: 32 Width: 32,bird,[1.71787 0 0 0 0 0 9.33936 0 0 0 0 0 0.412137 ...,[216 195 180 201 178 160 210 184 164 212 188 1...


In [4]:
train.shape

(2005, 5)

In [5]:
test.shape

(4000, 5)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2005 entries, 0 to 2004
Data columns (total 5 columns):
id               2005 non-null int64
image            2005 non-null object
label            2005 non-null object
deep_features    2005 non-null object
image_array      2005 non-null object
dtypes: int64(1), object(4)
memory usage: 78.4+ KB


In [7]:
train.describe()

Unnamed: 0,id
count,2005.0
mean,24828.161596
std,14686.132072
min,24.0
25%,12005.0
50%,23963.0
75%,37742.0
max,49970.0


In [8]:
train.groupby('label').count()

Unnamed: 0_level_0,id,image,deep_features,image_array
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
automobile,509,509,509,509
bird,478,478,478,478
cat,509,509,509,509
dog,509,509,509,509


## deep features string to list

In [10]:
def str_to_num(feat):
    """Split a bracketed, whitespace-separated vector string into its tokens.

    Parameters
    ----------
    feat : str
        Text such as ``"[0.24 1.09 0]"``: one opening and one closing
        delimiter character around whitespace-separated values.

    Returns
    -------
    list of str
        The individual tokens, still as strings. ``"[]"`` gives ``[]``.
    """
    # Drop surrounding whitespace first so the slice removes the brackets
    # themselves rather than a stray space or newline.
    feat=feat.strip()[1:-1]
    # split() with no argument collapses runs of whitespace and never yields
    # empty tokens, which would otherwise break a later float() conversion.
    return feat.split()

In [11]:
# Turn the bracketed vector strings of both frames into lists of string tokens.
for frame in (train,test):
    for column in ('deep_features','image_array'):
        frame[column]=frame[column].apply(str_to_num)

In [12]:
def to_array(lis):
    """Convert every element of an iterable to ``float``.

    Parameters
    ----------
    lis : iterable
        Values accepted by ``float()`` (e.g. numeric strings).

    Returns
    -------
    list of float
        The converted values, in the original order.
    """
    return [float(token) for token in lis]

In [13]:
# Convert the token lists of both frames into lists of floats.
for frame in (train,test):
    for column in ('deep_features','image_array'):
        frame[column]=frame[column].apply(to_array)

In [14]:
train.head()

Unnamed: 0,id,image,label,deep_features,image_array
0,24,Height: 32 Width: 32,bird,"[0.242872, 1.09545, 0.0, 0.39363, 0.0, 0.0, 11...","[73.0, 77.0, 58.0, 71.0, 68.0, 50.0, 77.0, 69...."
1,33,Height: 32 Width: 32,cat,"[0.525088, 0.0, 0.0, 0.0, 0.0, 0.0, 9.94829, 0...","[7.0, 5.0, 8.0, 7.0, 5.0, 8.0, 5.0, 4.0, 6.0, ..."
2,36,Height: 32 Width: 32,cat,"[0.566016, 0.0, 0.0, 0.0, 0.0, 0.0, 9.9972, 0....","[169.0, 122.0, 65.0, 131.0, 108.0, 75.0, 193.0..."
3,70,Height: 32 Width: 32,dog,"[1.1298, 0.0, 0.0, 0.778194, 0.0, 0.758051, 9....","[154.0, 179.0, 152.0, 159.0, 183.0, 157.0, 165..."
4,90,Height: 32 Width: 32,bird,"[1.71787, 0.0, 0.0, 0.0, 0.0, 0.0, 9.33936, 0....","[216.0, 195.0, 180.0, 201.0, 178.0, 160.0, 210..."


## separate 4 categories 

In [11]:
# One single-column frame of deep features per label.
ctrain=train.loc[train['label']=='cat',['deep_features']]
dtrain=train.loc[train['label']=='dog',['deep_features']]
btrain=train.loc[train['label']=='bird',['deep_features']]
atrain=train.loc[train['label']=='automobile',['deep_features']]
ctrain.head()

Unnamed: 0,deep_features
1,"[0.525088, 0, 0, 0, 0, 0, 9.94829, 0, 0, 0, 0,..."
2,"[0.566016, 0, 0, 0, 0, 0, 9.9972, 0, 0, 0, 1.3..."
10,"[0, 0, 0, 0.643275, 0, 0, 10.1772, 0, 0, 0, 0,..."
15,"[0, 0, 0.510964, 0, 0, 0, 11.2724, 0, 0, 0, 0,..."
17,"[1.38658, 0, 0, 0, 0, 0.182891, 10.3957, 0, 0,..."


## make a matrix for nearest neighbor model

In [12]:
def mat(train):
    """Collect the ``deep_features`` column of a frame into a plain list.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``deep_features`` column (one feature vector per row).

    Returns
    -------
    list
        The column's values in row order; an empty frame gives ``[]``.
    """
    # Read the column directly instead of calling .iloc[i] for every row:
    # the per-row lookup builds a whole row Series each time, which is slow
    # and can change value dtypes when the frame has several columns.
    return train['deep_features'].tolist()

In [13]:
# Stack each label's feature vectors into a 2-D array for the neighbour models.
Ctrain,Dtrain,Atrain,Btrain=(
    np.array(mat(part)) for part in (ctrain,dtrain,atrain,btrain)
)

In [14]:
Dtrain.shape

(509, 4096)

## nearest neighbor model

In [15]:
# Index the cat features. fit() hands back the fitted estimator, so `nbrs1`
# and `cat` are two names for the same model.
cat=nbrs1=NearestNeighbors().fit(Ctrain)



In [16]:
# Index the dog features. fit() hands back the fitted estimator, so `nbrs2`
# and `dog` are two names for the same model.
dog=nbrs2=NearestNeighbors().fit(Dtrain)



In [17]:
# Index the bird features. fit() hands back the fitted estimator, so `nbrs3`
# and `bird` are two names for the same model.
bird=nbrs3=NearestNeighbors().fit(Btrain)



In [18]:
# Index the automobile features. fit() hands back the fitted estimator, so
# `nbrs4` and `auto` are two names for the same model.
auto=nbrs4=NearestNeighbors().fit(Atrain)



In [19]:
test.head()

Unnamed: 0,id,image,label,deep_features,image_array
0,0,Height: 32 Width: 32,cat,"[1.13469, 0, 0, 0, 0.0366498, 0, 9.3536, 0, 0,...",[158 112 49 159 111 47 165 116 51 166 118 53 1...
1,6,Height: 32 Width: 32,automobile,"[0.231359, 0, 0, 0, 0, 0.226023, 8.85989, 0, 0...",[160 37 13 185 49 11 209 57 14 217 58 10 230 6...
2,8,Height: 32 Width: 32,cat,"[0, 0, 0.0344192, 0, 0, 0, 11.0375, 0, 0, 0, 0...",[23 19 23 19 21 28 21 16 19 65 47 40 164 131 1...
3,9,Height: 32 Width: 32,automobile,"[0, 0, 0, 0, 0, 0, 11.6065, 0, 0, 0, 1.54379, ...",[217 215 209 210 208 202 205 208 191 199 202 1...
4,12,Height: 32 Width: 32,dog,"[0.322317, 0, 1.24933, 0, 0, 0, 9.10822, 0, 0,...",[91 64 30 82 58 30 87 73 59 89 87 83 95 92 80 ...


## how closely a cat image relates to other cats versus dogs

In [20]:
# Feature vector of the first test image as a single-row 2-D array, the
# layout kneighbors() is called with below.
# reshape(1,-1) takes the width from the vector itself rather than
# hard-coding 4096, so a different feature length cannot silently be split
# into several rows.
query=np.array(test.iloc[0]['deep_features']).reshape(1,-1)

In [21]:
# Nearest cat training images for the query: distances, row positions
# within Ctrain, and the mean distance, one per output line.
dist1,ind1=cat.kneighbors(query)
print(dist1,ind1,dist1.mean(),sep='\n')



[[34.62372239 36.00687995 36.52008096 36.75484798 36.87311533]]
[[181 467 323 269   3]]
36.15572932231885


In [22]:
# Nearest dog training images for the query: distances, row positions
# within Dtrain, and the mean distance, one per output line.
dist2,ind2=dog.kneighbors(query)
print(dist2,ind2,dist2.mean(),sep='\n')

[[37.4642615  37.56668347 37.60472706 37.70655875 38.51132889]]
[[159 129 362 445  67]]
37.77071193352956




## separate categories in both test and train

In [23]:
# Split train and test by label.
# .copy() makes every subset an independent frame: columns are added to the
# *_test subsets further down, and writing into a filtered slice of `test`
# would raise SettingWithCopyWarning.
a_train=train[train['label']=='automobile'].copy()
b_train=train[train['label']=='bird'].copy()
c_train=train[train['label']=='cat'].copy()
d_train=train[train['label']=='dog'].copy()
a_test=test[test['label']=='automobile'].copy()
b_test=test[test['label']=='bird'].copy()
c_test=test[test['label']=='cat'].copy()
d_test=test[test['label']=='dog'].copy()

In [24]:
# Single-column deep_features frames for each test label.
a,b,c,d=(subset[['deep_features']] for subset in (a_test,b_test,c_test,d_test))

In [None]:
# Feature matrices for each test label.
# They are built from the *_test frames rather than from a..d themselves:
# once a..d hold arrays, mat() can no longer read them, so the
# self-referencing form failed when the cell was run a second time.
a=np.array(mat(a_test))
b=np.array(mat(b_test))
c=np.array(mat(c_test))
d=np.array(mat(d_test))

## relate all categories distances to each other

In [None]:
# Distance from every automobile test image to its single nearest neighbour
# in each label's model.
# assign() returns a new frame, so nothing is written into a filtered slice
# of `test` (no SettingWithCopyWarning) and the cell can be re-run.
# ravel() flattens the (n, 1) distance array into a 1-D column.
a_test=a_test.assign(
    aa=auto.kneighbors(a,n_neighbors=1)[0].ravel(),
    ab=bird.kneighbors(a,n_neighbors=1)[0].ravel(),
    ac=cat.kneighbors(a,n_neighbors=1)[0].ravel(),
    ad=dog.kneighbors(a,n_neighbors=1)[0].ravel(),
)

In [None]:
a_test.head()

In [None]:
# Distance from every bird test image to its single nearest neighbour in
# each label's model.
# assign() returns a new frame, so nothing is written into a filtered slice
# of `test` (no SettingWithCopyWarning) and the cell can be re-run.
# ravel() flattens the (n, 1) distance array into a 1-D column.
b_test=b_test.assign(
    ba=auto.kneighbors(b,n_neighbors=1)[0].ravel(),
    bb=bird.kneighbors(b,n_neighbors=1)[0].ravel(),
    bc=cat.kneighbors(b,n_neighbors=1)[0].ravel(),
    bd=dog.kneighbors(b,n_neighbors=1)[0].ravel(),
)

In [None]:
# Distance from every cat test image to its single nearest neighbour in
# each label's model.
# assign() returns a new frame, so nothing is written into a filtered slice
# of `test` (no SettingWithCopyWarning) and the cell can be re-run.
# ravel() flattens the (n, 1) distance array into a 1-D column.
c_test=c_test.assign(
    ca=auto.kneighbors(c,n_neighbors=1)[0].ravel(),
    cb=bird.kneighbors(c,n_neighbors=1)[0].ravel(),
    cc=cat.kneighbors(c,n_neighbors=1)[0].ravel(),
    cd=dog.kneighbors(c,n_neighbors=1)[0].ravel(),
)

In [None]:
# Distance from every dog test image to its single nearest neighbour in
# each label's model.
# assign() returns a new frame, so nothing is written into a filtered slice
# of `test` (no SettingWithCopyWarning) and the cell can be re-run.
# ravel() flattens the (n, 1) distance array into a 1-D column.
d_test=d_test.assign(
    da=auto.kneighbors(d,n_neighbors=1)[0].ravel(),
    db=bird.kneighbors(d,n_neighbors=1)[0].ravel(),
    dc=cat.kneighbors(d,n_neighbors=1)[0].ravel(),
    dd=dog.kneighbors(d,n_neighbors=1)[0].ravel(),
)

## distance of dog-dog vs dog-other categories

In [None]:
# 1 where the dog-to-dog distance is strictly smaller than each of the
# dog-to-automobile, dog-to-bird and dog-to-cat distances, else 0.
p=d_test[['da','db','dc']].gt(d_test['dd'],axis=0).all(axis=1)
p=p.astype(int)
p

In [None]:
p.shape

In [None]:
p.sum()

In [None]:
# accuracy for dog-dog vs dog-others: share of dog test images flagged 1 in p.
# It is computed from p itself rather than typed in from an earlier run, so
# it stays correct when the data or the models change.
accuracy=float(p.mean())
accuracy