https://docs.google.com/forms/d/e/1FAIpQLSdI6cWyyHCBkn2h0lUXvZM9iGNX3y1QMRbKT0iSVsrm8Qhx_w/viewform?hr_submission=ChcIrpDc8gMSDwi72JqF0wgSBgiGjdbHJxAB

In [1]:
import pandas as pd
import numpy as np

import urllib.request
import io
import time
import copy


from scipy.io import arff
from sklearn.model_selection import KFold
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
import re
import random

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

# Preparação de dados

* extração do dataframe

In [2]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc2.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
kc2 = pd.DataFrame(data)
kc2 = kc2.sample(frac=1, random_state=20)
kc2.reset_index(drop=True, inplace=True)

# Algoritmos

In [3]:
def  euclidian_distance(x1, x2):
    arr = []
    for i in range(len(x1)-1):
        arr.append((x1[i] - x2[i])** 2)
    return np.sqrt(np.sum(arr))


In [4]:
class KNN:

    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y


    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)
        
    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
          
        k_indices = np.argsort(distances)[:self.k]        
        k_nearest_labels = [self.y_train.iloc[i] for i in k_indices]

        unique, counts = np.unique(k_nearest_labels, return_counts=True)
        predicted = unique[counts.argmax()]

        return predicted

In [15]:
def attr_class(df):
    Y = df["problems"]
    X = df.drop(columns=["problems"]) 
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_scaled = scaler.fit_transform(X)
    X = pd.DataFrame(x_scaled)
    
    Y = Y.apply(str).str.replace("b|'", '')

    
    return X, Y

In [6]:
def _attr_class(df):
    
    x = []
    y = []
    for row in df:
        x.append(row[:-1])
        y.append(row[-1])

    
    return x,y

### LVQ 1

In [7]:
from sklearn.neighbors import KNeighborsClassifier


In [8]:
class LVQ1:   
    
    def __init__(self, n_prototypes):
        self.n_prototypes  = n_prototypes
        self._epochs        = 10
        self._lrate        = 0.25

    
    def random_prototype(self, train):
        n_records  = train.shape[0]
        n_features = train.shape[1]
        prototypes  = []
        records    = []
        random.seed(10)
        for i in range(self.n_prototypes):
            records.append(random.randint(0, n_records-1))
        
        for r in records:
            prototype = [train.iloc[r][i] for i in range(n_features)]
            prototypes.append(prototype)
        
        return prototypes, records
    

    def trainPrototypes(self,train):
        prototypes = []
        records = []
        aux = train
        for i in range(self.n_prototypes):
            prototypes, records = self.random_prototype(train)
        for i in records:
            aux = aux.drop(aux.loc[aux.index==i].index)
        aux = aux.reset_index(drop=True)
        
        x_train, y_train = attr_class(aux)
        x_proto, y_proto = _attr_class(prototypes)

        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(x_train, y_train)
        for epoch in range(self._epochs):            
            rate  = self._lrate * (1.0 - (epoch / float(self._epochs)))
            predicted_classes = knn.predict(x_proto) 
            real_instances = knn.kneighbors(X=x_proto, n_neighbors=1, return_distance=False)
            for idx in range(len(x_proto)):
                prototype = x_proto[idx] # prototipo (ta sendo alterado o valor)
                proto_class = y_proto[idx] #classe do prototipo 
                
                predicted_class = predicted_classes[idx] #classe prevista pra esse prototipo 
                real_instance_index = real_instances[idx][0] #indice instancia real daquele prototipo 
                real_instance = x_train.iloc[real_instance_index] # instancia real
                
                for p in range(len(prototype)): #passa por cada coluna do prototipo
                    proto_val = prototype[p]
                    real_val  = real_instance[p]
                    error = rate * (real_val - proto_val)
                    if proto_class == predicted_class:
                        prototype[p] += error
                    else:
                        prototype[p] -= error

       
        return pd.DataFrame(prototypes, columns = train.columns)

# lvq1 = LVQ1(10)
# df  = lvq1.trainPrototypes(kc2)
# df 

### LVQ 2.1

In [9]:
class LVQ2_1:   
    
    def __init__(self, n_prototypes, window):
        self.n_prototypes  = n_prototypes
        self._epochs        = 10
        self._lrate        = 0.25
        self._window        = window

    
    def inside(self):
        return (1.0 - self._window) / (1.0 + self._window)
    
    def window(self, d1, d2):
        a  = d1/d2
        b  = d2/d1
        minimum = min(a,b)
        return minimum > self.inside()
    
    def updatePrototypes(self, prototype, real, same_class, rate):
        for idx in range(21):
            proto_val = prototype[idx]
            real_val  = real[idx]
            error = rate * (real_val - proto_val)
            if same_class:
                prototype[idx] += error
            else:
                prototype[idx] -= error
        return prototype
    
    def trainPrototypes(self,train, df_prototypes):
        x_train, y_train = attr_class(train)
        x_proto, y_proto = attr_class(df_prototypes)
        knn = KNeighborsClassifier(n_neighbors=2)
        knn.fit(x_proto, y_proto)
        for epoch in range(self._epochs):
            rate  = self._lrate * (1.0 - (epoch / float(self._epochs)))
            x_neighbors = knn.kneighbors(X=x_train, n_neighbors=2, return_distance=False)
            for idx, _ in x_train.iterrows():
                x_instance = x_train.iloc[idx]
                x_class = y_train.iloc[idx]
                
                #mais próximo
                n1_idx   = x_neighbors[idx][0]
                n1       = x_proto.iloc[n1_idx]
                n1       = n1[:21] ##tá adicionando o indice como uma coluna na iteração seguinte
                n1_class = y_proto[n1_idx]
                n1_dist  = euclidian_distance(n1, x_instance) 
                
                #segundo mais próximo
                n2_idx   = x_neighbors[idx][0]
                n2       = x_proto.iloc[n2_idx]
                n2       = n2[:21]
                n2_class = y_proto[n2_idx]
                n2_dist  = euclidian_distance(n2, x_instance)
                
                insideWindow = self.window(n1_dist, n2_dist)
                if insideWindow or n1_class != n2_class:
                    x_proto.iloc[n1_idx] = self.updatePrototypes(n1, x_instance, n1_class == x_class, rate)
                    x_proto.iloc[n2_idx] = self.updatePrototypes(n2, x_instance, n2_class == x_class, rate)
        prototypes = pd.concat([x_proto, y_proto], axis=1)
        prototypes.columns = kc2.columns
        return prototypes 
    
# lvq2 = LVQ2_1(100, 0.25)
# df2  = lvq2.trainPrototypes(kc2, df)
# df2

### LVQ 3

In [10]:
class LVQ3:   
    
    def __init__(self, n_prototypes, window):
        self.n_prototypes  = n_prototypes
        self._epochs       = 10
        self._lrate        = 0.25
        self._window       = window
        self._epsilon      = 0.75
        
    def getNeighborPrototype(self, prototypes, row):
        distances  = []
        final_dist = []
        for _,proto in prototypes.iterrows():
            dist = euclidian_distance(proto, row)
            distances.append((proto, dist))
        distances.sort(key=lambda tup: tup[1])
        final_dist.append(distances[1])
        final_dist.append(distances[2])
        return final_dist
    
    def inside(self):
        return (1.0 - self._window) / (1.0 + self._window)
    
    def window(self, d1, d2):
        a  = d1/d2
        b  = d2/d1
        minimum = min(a,b)
        return minimum > self.inside()
    
    def updatePrototypes(self, prototype, real, same_class, rate):
        for idx in range(21):
            proto_val = prototype[idx]
            real_val  = real[idx]
            error = rate * (real_val - proto_val)
            if same_class:
                prototype[idx] += error
            else:
                prototype[idx] -= error
        return prototype
    
    def trainPrototypes(self,train, df_prototypes):
        x_train, y_train = attr_class(train)
        x_proto, y_proto = attr_class(df_prototypes)
        knn = KNeighborsClassifier(n_neighbors=2)
        knn.fit(x_proto, y_proto)        
        for epoch in range(self._epochs):
            rate = self._lrate * (1.0 - (epoch / float(self._epochs)))
            x_neighbors = knn.kneighbors(X=x_train, n_neighbors=2, return_distance=False)
            for idx, _ in x_train.iterrows():
                x_instance = x_train.iloc[idx]
                x_class = y_train.iloc[idx]

                #mais próximo
                n1_idx   = x_neighbors[idx][0]
                n1       = x_proto.iloc[n1_idx]
                n1       = n1[:21] ##tá adicionando o indice como uma coluna na iteração seguinte
                n1_class = y_proto[n1_idx]
                n1_dist  = euclidian_distance(n1, x_instance) 

                #segundo mais próximo
                n2_idx   = x_neighbors[idx][0]
                n2       = x_proto.iloc[n2_idx]
                n2       = n2[:21]
                n2_class = y_proto[n2_idx]
                n2_dist  = euclidian_distance(n2, x_instance)

                insideWindow = self.window(n1_dist, n2_dist)
                if insideWindow:
                    if n1_class != n2_class:
                        x_proto.iloc[n1_idx] = self.updatePrototypes(n1, x_instance, n1_class == x_class, rate)
                        x_proto.iloc[n2_idx] = self.updatePrototypes(n2, x_instance, n2_class == x_class, rate)
                    else:
                        rate = rate * self._epsilon
                        x_proto.iloc[n1_idx] = self.updatePrototypes(n1, x_instance, True, rate)
                        x_proto.iloc[n2_idx] = self.updatePrototypes(n2, x_instance, True, rate)

        prototypes = pd.concat([x_proto, y_proto], axis=1)
        prototypes.columns = kc2.columns
        return prototypes
    
# lvq3 = LVQ3(10, 0.3)
# res  = lvq3.trainPrototypes(kc1, df)
# df3

In [11]:
# lvq3 = LVQ3(10, 0.3, 0.5)
# res  = lvq3.trainPrototypes(df)
# df3 = pd.DataFrame(res, columns = kc2.columns)
# df3

## KNN 


In [16]:
def run(df, title):
    kf = KFold(n_splits=5)
    accuracies_1 = []
    accuracies_3   = []
    index = 0
    for train, test in kf.split(df):
        #separação conj de teste atributos e classe
        attr, df_class = attr_class(df)

        # 1-NN
        knn  = KNN(1)
        knn.fit(attr, df_class)
        predictions_simples = knn.predict(attr.iloc[test])
        acc_simples = (np.sum(predictions_simples == df_class.iloc[test]) / len(test)) * 100
        accuracies_1.append(acc_simples)

        # 3-NN
        knn  = KNN(3)
        knn.fit(attr, df_class)
        predictions_simples = knn.predict(attr.iloc[test])
        acc_simples = (np.sum(predictions_simples == df_class.iloc[test]) / len(test)) * 100
        accuracies_3.append(acc_simples)

    return accuracies_1, accuracies_3

accuracies_1, accuracies_3 = run(kc2, 'KC2')


In [21]:
def run(df, title, accuracies_1, accuracies_3):
    kf = KFold(n_splits=5)
    results = []
    n_proto = [10, 50, 100, 200]
    for n in n_proto:
        for k in [1,3]:
            accuracies_simples = []
            accuracies_lvq1   = []
            accuracies_lvq2   = []
            accuracies_lvq3   = []
            index = 0
            for train, test in kf.split(df):
                #separação conj de teste atributos e classe
                attr, df_class = attr_class(df)

                # KNN com conjunto normal
                accuracies_simples = accuracies_1 if k == 1 else accuracies_3
                
                #definição do dataset de LVQ1
                lvq1 = LVQ1(n)
                aux = df.iloc[train]
                aux.reset_index(drop=True, inplace=True)
                res  = lvq1.trainPrototypes(aux)
                lvq1_df  = pd.DataFrame(res, columns = df.columns)
                
                #separação do treinamento LVQ1 atributos e classe
                X, Y = attr_class(lvq1_df)
                
                #KNN com conjunto do LVQ1
                knn  = KNN(k)
                knn.fit(X, Y)
                predictions_lvq1 = knn.predict(attr.iloc[test])
                acc_lvq1 = (np.sum(predictions_lvq1 == df_class.iloc[test]) / len(test)) * 100
                accuracies_lvq1.append(acc_lvq1)
                
                #definição do dataset de LVQ2.1
                lvq2 = LVQ2_1(n, 0.5)
                res  = lvq2.trainPrototypes(aux,lvq1_df)
                lvq2_df  = pd.DataFrame(res, columns = df.columns)
                
                #separação do treinamento LVQ2.1 atributos e classe
                X, Y = attr_class(lvq2_df)
                
                #KNN com conjunto do LVQ2.1
                knn  = KNN(k)
                knn.fit(X, Y)
                predictions_lvq2 = knn.predict(attr.iloc[test])
                acc_lvq2 = (np.sum(predictions_lvq2 == df_class.iloc[test]) / len(test)) * 100
                accuracies_lvq2.append(acc_lvq2)
                
                #definição do dataset de LVQ3
                lvq3 = LVQ3(n, 0.5)
                res  = lvq3.trainPrototypes(aux,lvq1_df)
                lvq3_df  = pd.DataFrame(res, columns = df.columns)
                
                #separação do treinamento LVQ3 atributos e classe
                X, Y = attr_class(lvq3_df)
                
                #KNN com conjunto do LVQ3
                knn  = KNN(k)
                knn.fit(X, Y)
                predictions_lvq3 = knn.predict(attr.iloc[test])
                acc_lvq3 = (np.sum(predictions_lvq3 == df_class.iloc[test]) / len(test)) * 100
                accuracies_lvq3.append(acc_lvq3)
            
            print("({}, {}): simples {} lvq1 {} lvq2 {} lvq3 {}".format(k, n, np.mean(accuracies_simples),np.mean(accuracies_lvq1),np.mean(accuracies_lvq2), np.mean(accuracies_lvq3)))
            temp = {
                'dataframe': title,
                '(k, prototypes)':(k, n),
                'acc_simples': np.mean(accuracies_simples),
                'std_simples': np.std(accuracies_simples),
                'acc_LVQ1'   : np.mean(accuracies_lvq1),
                'std_LVQ1'   : np.std(accuracies_lvq1),
                'acc_LVQ2.1' : np.mean(accuracies_lvq2),
                'std_LVQ2.1' : np.std(accuracies_lvq2),
                'acc_LVQ3'   : np.mean(accuracies_lvq3),
                'std_LVQ3'   : np.std(accuracies_lvq3),
            }
            results.append(temp)
    df = pd.DataFrame(results)
    return df

df = run(kc2, 'KC2', accuracies_1, accuracies_3)
df

(1, 10): simples 98.46520146520147 lvq1 75.65201465201467 lvq2 65.84798534798536 lvq3 75.84432234432236
(3, 10): simples 88.13003663003664 lvq1 77.1923076923077 lvq2 79.88461538461539 lvq3 79.6923076923077
(1, 50): simples 98.46520146520147 lvq1 78.52564102564102 lvq2 44.8003663003663 lvq3 77.38095238095238
(3, 50): simples 88.13003663003664 lvq1 80.83699633699634 lvq2 79.88461538461539 lvq3 81.22161172161171
(1, 100): simples 98.46520146520147 lvq1 83.90293040293041 lvq2 70.84432234432234 lvq3 81.21245421245422
(3, 100): simples 88.13003663003664 lvq1 83.71978021978022 lvq2 62.77106227106227 lvq3 83.14652014652015
(1, 200): simples 98.46520146520147 lvq1 77.58424908424908 lvq2 41.92673992673993 lvq3 81.59706959706959
(3, 200): simples 88.13003663003664 lvq1 84.4835164835165 lvq2 55.77289377289378 lvq3 84.67216117216117


Unnamed: 0,dataframe,"(k, prototypes)",acc_simples,std_simples,acc_LVQ1,std_LVQ1,acc_LVQ2.1,std_LVQ2.1,acc_LVQ3,std_LVQ3
0,KC2,"(1, 10)",98.465201,0.771986,75.652015,7.704163,65.847985,25.311351,75.844322,7.328297
1,KC2,"(3, 10)",88.130037,2.721456,77.192308,4.320216,79.884615,3.101169,79.692308,2.464854
2,KC2,"(1, 50)",98.465201,0.771986,78.525641,10.467003,44.800366,29.610129,77.380952,11.554064
3,KC2,"(3, 50)",88.130037,2.721456,80.836996,6.728811,79.884615,3.271954,81.221612,6.632904
4,KC2,"(1, 100)",98.465201,0.771986,83.90293,2.705472,70.844322,21.598867,81.212454,7.365939
5,KC2,"(3, 100)",88.130037,2.721456,83.71978,2.895797,62.771062,25.275883,83.14652,2.727312
6,KC2,"(1, 200)",98.465201,0.771986,77.584249,8.650215,41.92674,28.744571,81.59707,7.281588
7,KC2,"(3, 200)",88.130037,2.721456,84.483516,3.396943,55.772894,29.293371,84.672161,2.923965


menos protótipos > resultado mto ruim > desbalanço do conjunto

## Plot

curva roc?

In [22]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [23]:
def _printComparative(df):
    

    fig = make_subplots(rows=1, cols=1,
    specs=[[{"type": "Scatter"}]],
    subplot_titles=("Acurácia"))

    
    fig.add_trace(go.Scatter(x=df.index, y = df['acc_simples'], name='Simples', mode = 'lines+markers'), col = 1, row = 1) 
    fig.add_trace(go.Scatter(x=df.index, y = df['acc_LVQ1'], name='LVQ1', mode = 'lines+markers'), col = 1, row = 1)
    fig.add_trace(go.Scatter(x=df.index, y = df['acc_LVQ2.1'], name='LVQ2.1', mode = 'lines+markers'), col = 1, row = 1)
    fig.add_trace(go.Scatter(x=df.index, y = df['acc_LVQ3'], name='LVQ3', mode = 'lines+markers'), col = 1, row = 1)
    
    fig.update_layout(height=1000, width=1000)

    return fig


In [24]:
fig = _printComparative(df)
fig.show()