In [5]:
# same step to LVQ/KNN
#scikit-learn, matplotlib
#https://blogdozouza.wordpress.com/2019/04/11/introducao-ao-algoritmo-k-nearest-neighbour-codigo-python/

import pandas as pd
df = pd.read_csv('bank-additional-full.csv', sep= ';')
df.replace({'yes': 1, 'no': 0}, inplace=True)

df = df[['age', 'job','marital','education','default','housing', 'loan', 'y']].dropna()
df_dummies = pd.get_dummies(
    df[['job','marital','education', 'loan','default','housing']], prefix=['job','marital','education', 'loan', 'default','housing'])
df_dummies['y'] = df['y']

# unknown as mean value
df_dummies.loc[df_dummies.job_unknown == 1, 'job_entrepreneur'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_housemaid'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_management'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_retired'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_services'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_student'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_technician'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_unemployed'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_admin.'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_blue-collar'] = 1/11.0
df_dummies.loc[df_dummies.job_unknown == 1, 'job_self-employed'] = 1/11.0
df_dummies.drop('job_unknown', axis=1, inplace=True)

df_dummies.loc[df_dummies.marital_unknown == 1, 'marital_divorced'] = 1/3.0
df_dummies.loc[df_dummies.marital_unknown == 1, 'marital_married'] = 1/3.0
df_dummies.loc[df_dummies.marital_unknown == 1, 'marital_single'] = 1/3.0
df_dummies.drop('marital_unknown', axis=1, inplace=True)

df_dummies.loc[df_dummies.education_unknown == 1, 'education_basic.4y'] = 1/7.0
df_dummies.loc[df_dummies.education_unknown == 1, 'education_basic.6y'] = 1/7.0
df_dummies.loc[df_dummies.education_unknown == 1, 'education_basic.9y'] = 1/7.0
df_dummies.loc[df_dummies.education_unknown == 1, 'education_high.school'] = 1/7.0
df_dummies.loc[df_dummies.education_unknown == 1, 'education_illiterate'] = 1/7.0
df_dummies.loc[df_dummies.education_unknown == 1, 'education_professional.course'] = 1/7.0
df_dummies.loc[df_dummies.education_unknown == 1, 'education_university.degree'] = 1/7.0
df_dummies.drop('education_unknown', axis=1, inplace=True)

df_dummies.loc[df_dummies.loan_unknown == 1, 'loan_0'] = 0.5
df_dummies.loc[df_dummies.loan_unknown == 1, 'loan_1'] = 0.5
df_dummies.drop('loan_unknown', axis=1, inplace=True)

df_dummies.loc[df_dummies.default_unknown == 1, 'default_0'] = 0.5
df_dummies.loc[df_dummies.default_unknown == 1, 'default_1'] = 0.5
df_dummies.drop('default_unknown', axis=1, inplace=True)

df_dummies.loc[df_dummies.housing_unknown == 1, 'housing_0'] = 0.5
df_dummies.loc[df_dummies.housing_unknown == 1, 'housing_1'] = 0.5
df_dummies.drop('housing_unknown', axis=1, inplace=True)

df_dummies = df_dummies.apply(pd.to_numeric)

corr=df.corr().abs()[['y']]

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test = train_test_split(df_dummies, test_size = 0.6, shuffle=True)

y_train = x_train["y"]
x_train.drop("y", axis=1, inplace=True)

y_test = x_test["y"]
x_test.drop("y", axis=1, inplace=True)

In [10]:
# Implementação antiga:
#LVQ - código em https://towardsdatascience.com/learning-vector-quantization-ed825f8c807d
import numpy as np

def train_lvq(data, labels, num_epochs, learning_rate, validation_data=None, validation_labels=None):
  #Get unique class labels
  num_dims = data.shape[1] #número de dimensões
  labels=labels.astype(int)
  unique_labels =list(set(labels))

  num_protos = len(unique_labels) #tamanho do grupo
  prototypes = np.empty((num_protos, num_dims)) #matriz com o tamanho da labels unicas e número de dimensões
  proto_labels = []

  #initialize prototypes using class means - o protótipo é inicializado com a media da classe
  for i in unique_labels:
    #class_data = data[labels ==i,:]
    class_data=list(map(lambda s: s[0], filter(lambda s: s[1] == i, zip(data.values,labels))))

    #compute class mean
    mean = np.mean(class_data, axis=0)

    prototypes[i] = mean
    proto_labels.append(i)


  #Loop through data set
  for epoch in range (0, num_epochs):
    for fvec, lbl in zip (data.values,labels):
      #compute distance from each prototype to this point
      print(fvec, lbl)
      distances = list(np.sum(np.subtract(fvec, p)**2) for p in prototypes)
      min_dist_index = distances.index(min(distances))

      #determine winner prototype
      winner =prototypes[min_dist_index]
      winner_label=proto_labels[min_dist_index]

      if winner_label == lbl:
        sign = 1
      else:
        sign = -1

      #update winner prototype
      prototypes[min_dist_index] = np.add(prototypes[min_dist_index], np.subtract(fvec, winner) * learning_rate * sign)

    #use validation set to test the performance
    val_err =0
    if validation_labels is not None:
      for fvec, lbl in zip (validation_data.values, validation_labels):
        distances = list(np.sum(np.subtract(fvec,p)**2) for p in prototypes)
        min_dist_index= distances.index(min(distances))

        #determine winner prototype label
        winner_label = proto_labels[min_dist_index]

        #check if labels match
        if not winner_label ==lbl:
          val_err = val_err +1

          val_err = val_err/len(validation_labels)
          print("Epoch" + str(epoch) + ". Validation error: " + str(val_err))
        else:
          print("Epoch" + str(epoch))

  return(prototypes, proto_labels)

In [None]:
(a, b) = train_lvq(x_train, y_train, 30, 0.04, x_test, y_test)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1.
 0. 0. 1.] 0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1.
 0. 1. 0.] 0
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1.
 0. 1. 0.] 0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1.
 0. 1. 0.] 0
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1.
 0. 1. 0.] 0
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1.
 0. 0. 1.] 0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1.
 0. 0. 1.] 0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1.
 0. 0. 1.] 0
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1.
 0. 1. 0.] 0
[0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.
 0.  0.  0.  1.  0.  0.5 0.5 1.  0. ] 0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0