# Similarity & kNN

## Euclidean distance

In [2]:
import pandas as pd
from sklearn.metrics import euclidean_distances

# Two toy records, each described by three numeric attributes.
x = pd.DataFrame(
    [[23, 2, 2],
     [40, 10, 1]],
    columns=['age', 'year', 'resident'],
)
# Pairwise Euclidean distance matrix between the rows of x.
euclidean_distances(x)

array([[ 0.        , 18.81488772],
       [18.81488772,  0.        ]])

## Similarity

### Part 1: Load Data

In [3]:
# Read the bank marketing data set; the file uses ';' as field separator.
bankData = pd.read_csv('bank-data.csv', delimiter=';')
# Display the freshly loaded frame.
bankData

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


### Part 2: Preprocess Data

In [4]:
# Binary encoding: replace the labels of the two-valued columns with integers.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# The same encoder object is re-fitted on each column in turn.
for binary_col in ['default', 'housing', 'loan', 'y']:
    bankData[binary_col] = le.fit_transform(bankData[binary_col])
bankData

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,0,1476,1,1,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,0,0,1,0,unknown,5,may,226,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,0,-333,1,0,cellular,30,jul,329,5,-1,0,unknown,0
4517,57,self-employed,married,tertiary,1,-3313,1,1,unknown,9,may,153,1,-1,0,unknown,0
4518,57,technician,married,secondary,0,295,0,0,cellular,19,aug,151,11,-1,0,unknown,0
4519,28,blue-collar,married,secondary,0,1137,0,0,cellular,6,feb,129,4,211,3,other,0


In [5]:
# One-hot encode the multi-valued categorical columns.
categorical_cols = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
dummies = pd.get_dummies(bankData[categorical_cols])
# Swap the original text columns for their dummy (indicator) columns.
bankData = bankData.drop(columns=categorical_cols).join(dummies)
bankData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 49 columns):
age                    4521 non-null int64
default                4521 non-null int64
balance                4521 non-null int64
housing                4521 non-null int64
loan                   4521 non-null int64
day                    4521 non-null int64
duration               4521 non-null int64
campaign               4521 non-null int64
pdays                  4521 non-null int64
previous               4521 non-null int64
y                      4521 non-null int64
job_admin.             4521 non-null uint8
job_blue-collar        4521 non-null uint8
job_entrepreneur       4521 non-null uint8
job_housemaid          4521 non-null uint8
job_management         4521 non-null uint8
job_retired            4521 non-null uint8
job_self-employed      4521 non-null uint8
job_services           4521 non-null uint8
job_student            4521 non-null uint8
job_technician         4521 non-n

#### Data prep

In [6]:
# Rescale every column with min-max scaling so that features measured on
# large scales do not dominate the distance computations.
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
bankData_scaled = pd.DataFrame(min_max_scaler.fit_transform(bankData), columns= bankData.columns)

# Train/test separation (hold-out method): 30% of the rows are held out.
from sklearn.model_selection import train_test_split

x = bankData_scaled.drop(['y'], axis=1)
y = bankData_scaled['y']
# Fix the seed so the split -- and every metric computed from it in the
# cells below -- is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
# Class counts of the target in each partition.
print(pd.crosstab(y_train,columns = 'count'))
print(pd.crosstab(y_test,columns = 'count'))

col_0  count
y           
0.0     2787
1.0      377
col_0  count
y           
0.0     1213
1.0      144


### Part 3: Data processing

#### 1. Distances

In [7]:
# Pairwise Euclidean distances between the first three rows of the
# unscaled data; Euclidean distance suits most numeric data.
euclidean_distances(bankData.head(3))

array([[   0.        , 3024.4973136 ,  558.4012894 ],
       [3024.4973136 ,    0.        , 3439.19670854],
       [ 558.4012894 , 3439.19670854,    0.        ]])

In [8]:
# Manhattan distance is less sensitive to outliers than Euclidean distance.
from sklearn.metrics.pairwise import manhattan_distances
# Same three unscaled rows as in the Euclidean example.
manhattan_distances(bankData.head(3))

array([[   0., 3508.,  894.],
       [3508.,    0., 3502.],
       [ 894., 3502.,    0.]])

In [9]:
# Euclidean distances between the first three rows after min-max scaling.
euclidean_distances(bankData_scaled.head(3))

array([[0.        , 3.20226295, 3.3410116 ],
       [3.20226295, 0.        , 3.00755954],
       [3.3410116 , 3.00755954, 0.        ]])

#### 2. Look-alike

In [10]:
# Look-alike search: distance from every (scaled) record to the first record.
d = euclidean_distances(bankData_scaled,
                        bankData_scaled[0:1])

# Work on a copy: a plain `result = bankData` only aliases the frame, so
# adding the 'd' column would also modify bankData and leak into any cell
# that is run (or re-run) afterwards.
result = bankData.copy()
result['d'] = d
# Smallest distance first, i.e. the most similar records on top.
result.sort_values(by='d')

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,d
0,30,0,1787,0,0,19,79,1,-1,0,...,0,0,0,1,0,0,0,0,1,0.000000
3083,41,0,819,0,0,9,244,3,-1,0,...,0,0,0,0,0,0,0,0,1,1.463590
2041,73,0,154,0,0,15,103,1,-1,0,...,0,0,0,1,0,0,0,0,1,1.555054
108,56,0,3391,0,0,21,243,1,-1,0,...,0,0,0,0,0,0,0,0,1,1.775964
2043,52,0,255,0,1,10,374,3,-1,0,...,0,0,0,0,0,0,0,0,1,1.790613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4213,30,0,-522,1,1,5,670,2,286,6,...,0,1,0,0,0,1,0,0,0,3.663789
2714,47,0,477,1,0,7,973,1,366,1,...,0,1,0,0,0,1,0,0,0,3.672753
1223,49,0,2370,1,1,17,56,1,103,2,...,0,0,1,0,0,0,1,0,0,3.755431
3652,29,0,1070,1,0,19,30,1,357,1,...,0,1,0,0,0,0,1,0,0,3.764402


#### 3. kNN

<b>kNN model optimized with GridSearchCV</b> <br>
<b>Description:</b> GridSearchCV performs an exhaustive search over the specified parameter values, using cross-validation to pick the combination that gives the best score.

In [28]:
from sklearn.model_selection import GridSearchCV
# Imported here because this is the first cell that uses the classifier;
# without it the cell fails with NameError on a fresh kernel.
from sklearn.neighbors import KNeighborsClassifier

# specify model and parameters
model = KNeighborsClassifier()
n_neighbors= [5,10,15]
n_jobs= [-1]
# NOTE: with the classifier's default p=2, 'minkowski' is the same metric
# as 'euclidean', so these two settings are expected to score identically.
metric= ['euclidean', 'minkowski']
weights= ['distance', 'uniform']

# fit model with GridSearchCV: every combination in param_grid is evaluated
param_grid = dict(n_neighbors= n_neighbors, metric= metric, weights= weights, n_jobs= n_jobs)
grid = GridSearchCV(estimator=model, param_grid= param_grid)
grid_result = grid.fit(x_train, y_train)

# show the best cross-validated score and the parameters that produced it
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))



Best: 0.881795 using {'metric': 'euclidean', 'n_jobs': -1, 'n_neighbors': 15, 'weights': 'distance'}


In [30]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline kNN: 15 neighbours, Euclidean metric, default (uniform) voting.
# fit() returns the estimator itself, so construction and training are chained.
clf = KNeighborsClassifier(metric='euclidean', n_neighbors=15, n_jobs=-1).fit(x_train, y_train)
clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
                     weights='uniform')

In [31]:
# Predict the held-out rows with the baseline model.
res = clf.predict(x_test)
# Confusion matrix: actual classes as rows, predicted classes as columns.
pd.crosstab(index=y_test, columns=res)

col_0,0.0,1.0
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1212,1
1.0,138,6


In [32]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy followed by per-class precision / recall / F1
# for the baseline model's predictions.
print("Accuracy:\t %.4f" % accuracy_score(y_true=y_test, y_pred=res))
print(classification_report(y_true=y_test, y_pred=res))

Accuracy:	 0.8976
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95      1213
         1.0       0.86      0.04      0.08       144

    accuracy                           0.90      1357
   macro avg       0.88      0.52      0.51      1357
weighted avg       0.89      0.90      0.85      1357



#### Weighted voting kNN

In [33]:
# Same configuration as the baseline model, except that neighbours vote
# with weights='distance' instead of the default uniform weighting.
clf1 = KNeighborsClassifier(
    weights='distance',
    metric='euclidean',
    n_neighbors=15,
    n_jobs=-1,
).fit(x_train, y_train)
clf1

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
                     weights='distance')

In [34]:
# Test and evaluate: predict the held-out rows with the weighted-voting model.
res1 = clf1.predict(x_test)
# Confusion matrix: actual classes as rows, predicted classes as columns.
pd.crosstab(index=y_test, columns=res1)

col_0,0.0,1.0
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1209,4
1.0,137,7


In [35]:
# Accuracy and per-class report for the weighted-voting predictions.
print("Accuracy:\t %.3f" % accuracy_score(y_true=y_test, y_pred=res1))
print(classification_report(y_true=y_test, y_pred=res1))

Accuracy:	 0.896
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.94      1213
         1.0       0.64      0.05      0.09       144

    accuracy                           0.90      1357
   macro avg       0.77      0.52      0.52      1357
weighted avg       0.87      0.90      0.85      1357

