# Similarity & kNN

## Euclidean distance

In [None]:
import pandas as pd
from sklearn.metrics import euclidean_distances

# Two toy records described by three numeric attributes, written one dict per row.
x = pd.DataFrame([
    {'age': 23, 'year': 2, 'resident': 2},
    {'age': 40, 'year': 10, 'resident': 1},
])

# Pairwise Euclidean distances between the rows of x (symmetric, zero diagonal).
euclidean_distances(x)

array([[ 0.        , 18.81488772],
       [18.81488772,  0.        ]])

## Similarity

### Part 1: Load Data

In [None]:
# Load the bank data from "bank-data.csv"; the file uses ';' as field separator.
BANK_DATA_PATH = '/content/bank-data.csv'
bankData = pd.read_csv(BANK_DATA_PATH, sep=';')

# Preview the first rows to check that the columns were parsed as expected.
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


### Part 2: Preprocess Data

In [None]:
# Binary encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Replace the string labels of each two-valued column with integer codes.
# The same encoder is refit for every column, so after the loop `le` only
# remembers the mapping of the last column processed ('loan').
for binary_col in ['y', 'housing', 'default', 'loan']:
    bankData[binary_col] = le.fit_transform(bankData[binary_col])

bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,0,1476,1,1,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,0,0,1,0,unknown,5,may,226,1,-1,0,unknown,0


In [None]:
# Convert categorical variables into dummy (indicator) columns.
# With `columns=`, get_dummies removes each listed column and appends its
# indicator columns, named "<column>_<category>", after the remaining columns.
categorical_cols = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
bankData = pd.get_dummies(bankData, columns=categorical_cols)

In [None]:
# Summarise the encoded frame: one line per column with its non-null count and
# dtype, to confirm that no string-valued columns remain after the encoding.
bankData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 49 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  4521 non-null   int64
 1   default              4521 non-null   int64
 2   balance              4521 non-null   int64
 3   housing              4521 non-null   int64
 4   loan                 4521 non-null   int64
 5   day                  4521 non-null   int64
 6   duration             4521 non-null   int64
 7   campaign             4521 non-null   int64
 8   pdays                4521 non-null   int64
 9   previous             4521 non-null   int64
 10  y                    4521 non-null   int64
 11  job_admin.           4521 non-null   uint8
 12  job_blue-collar      4521 non-null   uint8
 13  job_entrepreneur     4521 non-null   uint8
 14  job_housemaid        4521 non-null   uint8
 15  job_management       4521 non-null   uint8
 16  job_retired          452

#### Data prep

In [None]:
# Train/test separation (hold-out method)
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and every score computed from it, is identical on
# each run of the notebook.
RANDOM_STATE = 42

y = bankData['y']                # target column
X = bankData.drop(columns='y')   # every remaining column is used as a feature

# Hold out 40% of the rows for testing. stratify=y keeps the class proportions
# of y the same in both partitions, which matters here because class 1 is a
# small minority of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.4,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
3543,39,0,6,1,0,18,122,3,357,4,...,0,0,1,0,0,0,1,0,0,0
4078,30,0,562,0,0,21,137,4,-1,0,...,0,0,0,0,0,0,0,0,0,1
3102,61,0,313,0,1,3,42,1,-1,0,...,0,0,0,0,0,0,0,0,0,1
2477,53,0,732,0,0,27,759,2,-1,0,...,0,0,0,0,1,0,0,0,0,1
2940,61,0,1191,0,0,21,214,4,-1,0,...,0,0,1,0,0,0,0,0,0,1


### Part 3: Data processing

#### 1. Distances

In [None]:
# Euclidean distance is suitable for most numeric data.
# Pairwise distances between the first three rows of the encoded, unscaled frame.
euclidean_distances(bankData.iloc[:3])

array([[   0.        , 3024.4973136 ,  558.4012894 ],
       [3024.4973136 ,    0.        , 3439.19670854],
       [ 558.4012894 , 3439.19670854,    0.        ]])

In [None]:
# Manhattan distance is less sensitive to outliers.
from sklearn.metrics.pairwise import manhattan_distances

# Same three rows as in the Euclidean example, so the two matrices are comparable.
first_three_rows = bankData.iloc[:3]
manhattan_distances(first_three_rows)

array([[   0., 3508.,  894.],
       [3508.,    0., 3502.],
       [ 894., 3502.,    0.]])

In [None]:
# Distances after scaling the data
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()

# Rescale every column of bankData with the min-max scaler, then wrap the
# result in a DataFrame that carries the original column names.
scaled_values = mms.fit_transform(bankData)
bankData_scaled = pd.DataFrame(scaled_values, columns=bankData.columns)

# Pairwise distances between the first three scaled rows.
euclidean_distances(bankData_scaled.iloc[:3])

array([[0.        , 3.20226295, 3.3410116 ],
       [3.20226295, 0.        , 3.00755954],
       [3.3410116 , 3.00755954, 0.        ]])

#### 2. Look-alike

In [None]:
# Distance from every scaled row to the first row, which acts as the reference
# record. Slicing with [0:1] keeps the reference as a one-row frame, so `d`
# comes back as a single-column 2-D array.
d = euclidean_distances(bankData_scaled,
                        bankData_scaled[0:1])

# Attach the distances to a *new* frame. A plain `result = bankData` would only
# alias the original, and writing result['d'] would then add a 'd' column to
# bankData itself, changing what earlier cells see when they are re-run.
result = bankData.assign(d=d.ravel())

# Closest look-alikes first; the reference row itself has distance 0.
result.sort_values(by='d')

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,d
0,30,0,1787,0,0,19,79,1,-1,0,...,0,0,0,1,0,0,0,0,1,0.000000
3083,41,0,819,0,0,9,244,3,-1,0,...,0,0,0,0,0,0,0,0,1,1.463590
2041,73,0,154,0,0,15,103,1,-1,0,...,0,0,0,1,0,0,0,0,1,1.555054
108,56,0,3391,0,0,21,243,1,-1,0,...,0,0,0,0,0,0,0,0,1,1.775964
2043,52,0,255,0,1,10,374,3,-1,0,...,0,0,0,0,0,0,0,0,1,1.790613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4213,30,0,-522,1,1,5,670,2,286,6,...,0,1,0,0,0,1,0,0,0,3.663789
2714,47,0,477,1,0,7,973,1,366,1,...,0,1,0,0,0,1,0,0,0,3.672753
1223,49,0,2370,1,1,17,56,1,103,2,...,0,0,1,0,0,0,1,0,0,3.755431
3652,29,0,1070,1,0,19,30,1,357,1,...,0,1,0,0,0,0,1,0,0,3.764402


#### 3. kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline kNN classifier: 5 neighbours, Euclidean metric, n_jobs=-1.
# NOTE(review): X_train is taken from the unscaled bankData, so wide-range
# columns such as `balance` presumably dominate the distance -- confirm whether
# the classifier should be trained on min-max scaled features instead.
knn_settings = {'n_neighbors': 5, 'metric': 'euclidean', 'n_jobs': -1}
clf = KNeighborsClassifier(**knn_settings)
clf.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the baseline classifier.
res = clf.predict(X_test)

# Confusion table: actual labels as rows, predicted labels as columns.
pd.crosstab(index=y_test, columns=res)

col_0,0,1
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1536,53
1,180,40


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy first, then the per-class precision / recall / F1 table.
acc = accuracy_score(y_test, res)
print("Accuracy:\t %.3f" % acc)
print(classification_report(y_test, res))

Accuracy:	 0.871
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      1589
           1       0.43      0.18      0.26       220

    accuracy                           0.87      1809
   macro avg       0.66      0.57      0.59      1809
weighted avg       0.84      0.87      0.85      1809



#### Weighted voting kNN (n = 5)

In [None]:
# Same 5-neighbour Euclidean classifier as the baseline, but with
# weights='distance' so that neighbours vote in proportion to their closeness
# rather than equally.
weighted_knn_settings = {
    'n_neighbors': 5,
    'metric': 'euclidean',
    'n_jobs': -1,
    'weights': 'distance',
}
clf1 = KNeighborsClassifier(**weighted_knn_settings)
clf1.fit(X_train, y_train)

In [None]:
#test and evaluate
res1 = clf1.predict(X_test)
pd.crosstab(y_test, res1)

col_0,0,1
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1518,71
1,176,44


In [None]:
# Accuracy and per-class report for the weighted-voting predictions in res1.
acc_weighted = accuracy_score(y_test, res1)
print("Accuracy:\t %.3f" % acc_weighted)
print(classification_report(y_test, res1))

Accuracy:	 0.863
              precision    recall  f1-score   support

           0       0.90      0.96      0.92      1589
           1       0.38      0.20      0.26       220

    accuracy                           0.86      1809
   macro avg       0.64      0.58      0.59      1809
weighted avg       0.83      0.86      0.84      1809



#### Weighted voting kNN (n = 10)

In [None]:
# Distance-weighted kNN again, this time with 10 neighbours.
# This rebinds `clf1`, so the previously fitted weighted model is discarded.
weighted_knn_settings = {
    'n_neighbors': 10,
    'metric': 'euclidean',
    'n_jobs': -1,
    'weights': 'distance',
}
clf1 = KNeighborsClassifier(**weighted_knn_settings)
clf1.fit(X_train, y_train)

In [None]:
#test and evaluate
res1 = clf1.predict(X_test)
pd.crosstab(y_test, res1)

col_0,0,1
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1548,41
1,194,26


In [None]:
# Accuracy and per-class report for the predictions currently held in res1.
acc_weighted = accuracy_score(y_test, res1)
print("Accuracy:\t %.3f" % acc_weighted)
print(classification_report(y_test, res1))

Accuracy:	 0.870
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      1589
           1       0.39      0.12      0.18       220

    accuracy                           0.87      1809
   macro avg       0.64      0.55      0.56      1809
weighted avg       0.83      0.87      0.84      1809

