https://docs.google.com/forms/d/e/1FAIpQLSdI6cWyyHCBkn2h0lUXvZM9iGNX3y1QMRbKT0iSVsrm8Qhx_w/viewform?hr_submission=ChcIrpDc8gMSDwi72JqF0wgSBgiGjdbHJxAB

In [1]:
import pandas as pd
import numpy as np

import urllib.request
import io
import time
import copy


from scipy.io import arff
from sklearn.model_selection import KFold
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
import re
from random import randrange


# Preparação de dados

* extração do dataframe

In [None]:
# Download the KC1 ARFF file from the URL below and load it into a DataFrame.
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc1.arff'
# The context manager closes the HTTP response once the payload has been parsed,
# instead of leaving the connection open for the rest of the session.
with urllib.request.urlopen(url) as ftpstream:
    data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
kc1 = pd.DataFrame(data)
# Shuffle all rows with a fixed seed (reproducible order), then renumber the
# index from 0 so positional and label-based row access agree.
kc1 = kc1.sample(frac=1, random_state=20)
kc1.reset_index(drop=True, inplace=True)

In [None]:
# Download the KC2 ARFF file from the URL below and load it into a DataFrame.
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc2.arff'
# The context manager closes the HTTP response once the payload has been parsed,
# instead of leaving the connection open for the rest of the session.
with urllib.request.urlopen(url) as ftpstream:
    data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
kc2 = pd.DataFrame(data)
# Shuffle all rows with a fixed seed (reproducible order), then renumber the
# index from 0 so positional and label-based row access agree.
kc2 = kc2.sample(frac=1, random_state=20)
kc2.reset_index(drop=True, inplace=True)

# Algoritmos

In [None]:
def euclidian_distance(x1, x2):
    """Return the Euclidean distance between two records, skipping ``x1``'s last slot.

    Only the first ``len(x1) - 1`` positions of both records are compared; the
    last position of ``x1`` is never read (the LVQ code in this file stores the
    class label there).

    Args:
        x1: Sequence or pandas Series; its length decides how many positions
            are compared.
        x2: Sequence or pandas Series with at least ``len(x1) - 1`` positions.

    Returns:
        The distance as a NumPy float (``0.0`` when no position is compared).

    Raises:
        IndexError: If ``x2`` has fewer than ``len(x1) - 1`` positions.
    """
    n_compared = max(len(x1) - 1, 0)
    # dtype=object keeps a trailing non-numeric label from coercing the numeric
    # values to strings; the slices are then taken by POSITION, so this works
    # for label-indexed Series without relying on integer-key fallback.
    left = np.asarray(x1, dtype=object)[:n_compared]
    right = np.asarray(x2, dtype=object)[:n_compared]
    if len(right) < n_compared:
        raise IndexError(
            "x2 has %d positions but %d are required" % (len(right), n_compared)
        )
    diff = left.astype(float) - right.astype(float)
    return np.sqrt(np.sum(diff ** 2))


## Learning Vector Quantization

### LVQ 1

In [None]:
class LVQ1:
    """LVQ1 prototype generator.

    Every record handled by this class is expected to hold its class label in
    the LAST position; all other positions are treated as numeric features.
    """

    def __init__(self, n_prototypes, epochs=10, l_rate=0.25):
        """Store the training configuration.

        Args:
            n_prototypes: Number of prototypes to generate.
            epochs: Number of passes over the training data.
            l_rate: Initial learning rate; it decays linearly over the epochs.
        """
        self.n_prototypes = n_prototypes
        self.epochs = epochs
        self.l_rate = l_rate

    def getNeighborPrototype(self, prototypes, row):
        """Return the prototype closest to ``row`` (the first one on ties).

        The returned object is the very element stored in ``prototypes``, so
        the caller can update it in place.

        Raises:
            ValueError: If ``prototypes`` is empty.
        """
        return min(prototypes, key=lambda proto: euclidian_distance(proto, row))

    def random_prototype(self, train):
        """Build one prototype by drawing each column from a random record.

        A fresh random row is drawn for every column (label column included),
        so the prototype is generally a mix of several records.

        Args:
            train: DataFrame of training records.

        Returns:
            A list with one value per column of ``train``.

        Raises:
            ValueError: If ``train`` has no rows (raised by ``randrange``).
        """
        n_records, n_features = train.shape
        # .iloc[row, col] is purely positional, so column labels never matter.
        return [train.iloc[randrange(n_records), col] for col in range(n_features)]

    def trainPrototypes(self, train):
        """Train ``n_prototypes`` prototypes on ``train`` with the LVQ1 rule.

        For every record the nearest prototype is moved towards the record
        when their labels match, and away from it otherwise.

        Args:
            train: DataFrame whose last column is the class label.

        Returns:
            A list of prototypes, each a list laid out like a row of ``train``.
        """
        prototypes = [self.random_prototype(train) for _ in range(self.n_prototypes)]
        # Plain tuples give positional access to every value without going
        # through label-based Series indexing.
        records = list(train.itertuples(index=False, name=None))
        for epoch in range(self.epochs):
            # Linear decay: full rate in the first epoch, approaching 0 at the end.
            rate = self.l_rate * (1 - (epoch / float(self.epochs)))
            for record in records:
                nearest = self.getNeighborPrototype(prototypes, record)
                # The label check does not depend on the feature, so do it once.
                direction = 1.0 if record[-1] == nearest[-1] else -1.0
                for i in range(len(record) - 1):
                    nearest[i] += direction * rate * (record[i] - nearest[i])
        return prototypes

    

In [None]:
# Demo: train 10 LVQ1 prototypes on KC2.
lvq1 = LVQ1(10)
res  = lvq1.trainPrototypes(kc2)

# Label the prototypes with KC2's column names; the bare `df` on the last line
# makes the notebook render the frame as the cell output.
df = pd.DataFrame(res, columns = kc2.columns)
df

### LVQ 2.1

In [None]:
class LVQ2_1:
    """LVQ2.1 prototype refinement.

    Every record handled by this class is expected to hold its class label in
    the LAST position; all other positions are treated as numeric features.
    """

    def __init__(self, n_prototypes, window, epochs=10, l_rate=0.25):
        """Store the training configuration.

        Args:
            n_prototypes: Stored for reference only; training uses one
                prototype per row of the frame given to ``trainPrototypes``.
            window: Relative window width ``w`` used by ``inside``.
            epochs: Number of passes over the training data.
            l_rate: Initial learning rate; it decays linearly over the epochs.
        """
        self.n_prototypes = n_prototypes
        self._epochs = epochs
        self._lrate = l_rate
        self._window = window

    def getNeighborPrototype(self, prototypes, row):
        """Return the 2nd and 3rd nearest prototypes to ``row``.

        NOTE(review): the nearest prototype is skipped on purpose to preserve
        the original selection. Prototypes start as a copy of the training
        rows, so the nearest one is presumably the sample itself - confirm
        this is the intended behaviour.

        Args:
            prototypes: A list of records (returned by reference, so they can
                be updated in place) or a DataFrame (rows are then returned as
                Series COPIES; changing them does not change the frame).
            row: The sample to compare against.

        Returns:
            ``[(prototype, distance), (prototype, distance)]`` ordered by
            increasing distance.

        Raises:
            IndexError: If fewer than three prototypes are available.
        """
        if isinstance(prototypes, pd.DataFrame):
            candidates = [proto for _, proto in prototypes.iterrows()]
        else:
            candidates = prototypes
        distances = [(proto, euclidian_distance(proto, row)) for proto in candidates]
        if len(distances) < 3:
            raise IndexError("LVQ2_1 needs at least three prototypes")
        distances.sort(key=lambda tup: tup[1])
        return [distances[1], distances[2]]

    def inside(self):
        """Return the window threshold ``(1 - w) / (1 + w)``."""
        return (1.0 - self._window) / (1.0 + self._window)

    def window(self, neighbors):
        """Tell whether a sample falls inside the window of two prototypes.

        The test is ``min(di/dj, dj/di) > inside()``, where ``di`` and ``dj``
        are the two distances in ``neighbors``.

        Args:
            neighbors: ``[(prototype, di), (prototype, dj)]``.

        Returns:
            ``True`` when the sample is inside the window.
        """
        di = neighbors[0][1]
        dj = neighbors[1][1]
        # For non-negative distances min(di/dj, dj/di) == smaller / larger.
        # Equal distances (including 0 and 0) give a ratio of exactly 1, and
        # otherwise the larger one is > 0, so no division by zero can occur.
        ratio = 1.0 if di == dj else min(di, dj) / max(di, dj)
        return bool(ratio > self.inside())

    def trainPrototypes(self, train):
        """Refine one prototype per training row with the LVQ2.1 rule.

        Prototypes start as a copy of ``train``. A sample triggers an update
        only when its two selected prototypes have different labels, exactly
        one of them carries the sample's label, and the sample falls inside
        the window. The matching prototype is then moved towards the sample
        and the other one away from it.

        Args:
            train: DataFrame whose last column is the class label.

        Returns:
            A new DataFrame with the columns and index of ``train`` holding
            the refined prototypes; ``train`` itself is not modified.
        """
        # Mutable lists are used for the prototypes because rows yielded by
        # DataFrame.iterrows() are copies: updates applied to them are lost.
        samples = list(train.itertuples(index=False, name=None))
        prototypes = [list(sample) for sample in samples]
        n_features = len(train.columns) - 1
        for epoch in range(self._epochs):
            # Linear decay: full rate in the first epoch, approaching 0 at the end.
            rate = self._lrate * (1 - (epoch / float(self._epochs)))
            for sample in samples:
                neighbors = self.getNeighborPrototype(prototypes, sample)
                n1 = neighbors[0][0]
                n2 = neighbors[1][0]
                if n1[-1] == n2[-1] or not self.window(neighbors):
                    continue
                if n1[-1] == sample[-1]:
                    correct, wrong = n1, n2
                elif n2[-1] == sample[-1]:
                    correct, wrong = n2, n1
                else:
                    # Neither prototype shares the sample's label: no update.
                    continue
                for i in range(n_features):
                    correct[i] += rate * (sample[i] - correct[i])
                    wrong[i] -= rate * (sample[i] - wrong[i])
        return pd.DataFrame(prototypes, columns=train.columns, index=train.index)

In [None]:
# Demo: run LVQ2.1 with a window width of 0.25 and display the result.
lvq2 = LVQ2_1(10, 0.25)
# The original cell trained on `test`, a name that is never defined at notebook
# level (NameError when run top to bottom); KC2 is used instead, matching the
# LVQ1 demo cell.
res  = lvq2.trainPrototypes(kc2)
df = pd.DataFrame(res, columns = kc2.columns)
df

### LVQ 3

## KNN 


In [None]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance over ALL features.

    Distances are computed here rather than through the file's
    ``euclidian_distance`` helper, because that helper skips the last position
    of a record (it expects a trailing class label), whereas this classifier
    is fitted on label-free attribute matrices.
    """

    def __init__(self, k):
        """Store the number of neighbours ``k`` consulted per prediction."""
        self.k = k

    def fit(self, X, y):
        """Memorise the training data.

        Args:
            X: DataFrame or 2-D array-like of numeric features, one row per sample.
            y: Series or 1-D array-like with one label per row of ``X``.
        """
        self.X_train = X
        self.y_train = y
        # Converted once here so each prediction is a single vectorised pass.
        self._train_matrix = np.asarray(X, dtype=float)
        self._train_labels = np.asarray(y, dtype=object)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: DataFrame or 2-D array-like with the same columns as the
                training features.

        Returns:
            NumPy array with one predicted label per row.
        """
        queries = np.asarray(X, dtype=float)
        return np.array([self.predict_func(query) for query in queries])

    def predict_func(self, x):
        """Predict the label of a single sample by majority vote.

        Args:
            x: Series or 1-D array-like of feature values.

        Returns:
            The most frequent label among the ``k`` nearest training rows;
            on a tie, the label that sorts first.
        """
        query = np.asarray(x, dtype=float)
        distances = np.sqrt(((self._train_matrix - query) ** 2).sum(axis=1))
        # A stable sort makes equidistant rows resolve by training order.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_nearest_labels = [self._train_labels[i] for i in k_indices]

        # np.unique returns labels sorted, so argmax picks the first label
        # reaching the highest count.
        unique, counts = np.unique(k_nearest_labels, return_counts=True)
        return unique[counts.argmax()]

In [None]:
def attr_class(df, _class):
    """Split ``df`` into min-max scaled attributes and the class column.

    Args:
        df: DataFrame holding the attributes and the class column.
        _class: Name of the class column.

    Returns:
        ``(X, Y)``. ``X`` is a new DataFrame with every attribute rescaled to
        [0, 1]; it has default integer column names and a default index.
        ``Y`` is the untouched class column of ``df``.
    """
    attributes = df.drop(columns=[_class])
    # The scaler is fitted on, and applied to, the very same rows.
    normalised = MinMaxScaler(feature_range=(0, 1)).fit_transform(attributes)
    return pd.DataFrame(normalised), df[_class]

In [None]:
def _fold_accuracy(predictions, truth):
    """Return the percentage of predictions equal to the matching true label."""
    truth = np.asarray(truth, dtype=object)
    hits = sum(1 for predicted, expected in zip(predictions, truth) if predicted == expected)
    return hits / len(truth) * 100


def _prototype_accuracy(prototypes, _class, k, test_attr, test_labels):
    """Fit a k-NN on a prototype frame and return its accuracy (%) on the test fold."""
    # Integer column names keep the prototype attributes laid out exactly like
    # the scaled test attributes.
    proto_attr = pd.DataFrame(prototypes.drop(columns=[_class]).to_numpy(dtype=float))
    proto_labels = prototypes[_class].reset_index(drop=True)
    knn = KNN(k)
    knn.fit(proto_attr, proto_labels)
    return _fold_accuracy(knn.predict(test_attr), test_labels)


def run(df, _class, title):
    """Compare plain k-NN with k-NN over LVQ1 and LVQ2.1 prototypes (5-fold CV).

    Args:
        df: Dataset whose column ``_class`` holds the class label.
        _class: Name of the class column.
        title: Dataset name copied into the ``'dataframe'`` result column.

    Returns:
        DataFrame with one row per (prototype count, k) pair, holding the mean
        and standard deviation of the fold accuracies (in %) for each method.
    """
    n_proto = [10, 25, 50, 100, 200]
    neighbours = [1, 3]

    # Scale the attributes once. NOTE(review): attr_class fits its scaler on
    # the whole frame, so test rows still influence the scaling.
    attr, df_class = attr_class(df, _class)
    df_class = df_class.reset_index(drop=True)

    # Scaled copy of the dataset with the class column LAST, as the LVQ code
    # expects. Prototypes trained on it live in the same space as the test
    # attributes, so they must not be rescaled afterwards.
    scaled = attr.copy()
    scaled.columns = [column for column in df.columns if column != _class]
    scaled[_class] = df_class.to_numpy()

    folds = list(KFold(n_splits=5).split(scaled))

    # Plain k-NN does not depend on the prototype count: evaluate it once per
    # k, fitting on the training fold only so test rows never reach the model.
    accuracies_simples = {k: [] for k in neighbours}
    for train, test in folds:
        for k in neighbours:
            knn = KNN(k)
            knn.fit(attr.iloc[train], df_class.iloc[train])
            predictions = knn.predict(attr.iloc[test])
            accuracies_simples[k].append(_fold_accuracy(predictions, df_class.iloc[test]))

    results = []
    for n in n_proto:
        accuracies_lvq1 = {k: [] for k in neighbours}
        accuracies_lvq2 = {k: [] for k in neighbours}
        for train, test in folds:
            test_attr = attr.iloc[test]
            test_labels = df_class.iloc[test]

            # LVQ1 prototypes learned from the training fold.
            res = LVQ1(n).trainPrototypes(scaled.iloc[train])
            lvq1_df = pd.DataFrame(res, columns=scaled.columns)

            # LVQ2.1 refines the LVQ1 prototypes. The whole prototype frame is
            # passed: it has only n rows, so it cannot be indexed with the
            # dataset-level fold positions.
            # NOTE(review): confirm LVQ2.1 is meant to be trained on the
            # prototypes themselves rather than on the training fold.
            res = LVQ2_1(n, 0.25).trainPrototypes(lvq1_df)
            lvq2_df = pd.DataFrame(res, columns=scaled.columns)

            # Both k values share the same prototypes, which keeps the
            # comparison fair and avoids retraining.
            for k in neighbours:
                accuracies_lvq1[k].append(
                    _prototype_accuracy(lvq1_df, _class, k, test_attr, test_labels))
                accuracies_lvq2[k].append(
                    _prototype_accuracy(lvq2_df, _class, k, test_attr, test_labels))

        for k in neighbours:
            results.append({
                'dataframe': title,
                '(k, prototypes)': (k, n),
                'acc_simples': np.mean(accuracies_simples[k]),
                'std_simples': np.std(accuracies_simples[k]),
                'acc_LVQ1': np.mean(accuracies_lvq1[k]),
                'std_LVQ1': np.std(accuracies_lvq1[k]),
                'acc_LVQ2.1': np.mean(accuracies_lvq2[k]),
                'std_LVQ2.1': np.std(accuracies_lvq2[k]),
            })
    return pd.DataFrame(results)

# Run the comparison on KC2, using 'problems' as the class column. The bare
# `df` makes the notebook render the results table as the cell output.
df = run(kc2, 'problems', 'KC2')
df

## Plot

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [None]:
def _printComparative(df):
    """Build a line chart comparing the accuracy columns of a results frame.

    Args:
        df: DataFrame with the columns ``'acc_simples'``, ``'acc_LVQ1'`` and
            ``'acc_LVQ2.1'``; its index is used as the x axis.

    Returns:
        The plotly figure (not shown; call ``.show()`` on it).
    """
    fig = make_subplots(
        rows=1,
        cols=1,
        specs=[[{"type": "Scatter"}]],
        # The trailing comma matters: ("Acurácia") is a plain string, i.e. a
        # sequence of single characters, not a one-element sequence of titles.
        subplot_titles=("Acurácia",),
    )

    # (results column, legend label) for each curve, in drawing order.
    curves = (
        ('acc_simples', 'Simples'),
        ('acc_LVQ1', 'LVQ1'),
        ('acc_LVQ2.1', 'LVQ2.1'),
    )
    for column, label in curves:
        fig.add_trace(
            go.Scatter(x=df.index, y=df[column], name=label, mode='lines+markers'),
            col=1,
            row=1,
        )

    fig.update_layout(height=1000, width=1000)

    return fig


In [None]:
# Build the accuracy comparison chart from the results frame `df` and render it.
fig = _printComparative(df)
fig.show()