https://docs.google.com/forms/d/e/1FAIpQLSdI6cWyyHCBkn2h0lUXvZM9iGNX3y1QMRbKT0iSVsrm8Qhx_w/viewform?hr_submission=ChcIrpDc8gMSDwi72JqF0wgSBgiGjdbHJxAB

In [1]:
import pandas as pd
import numpy as np

import urllib.request
import io
import time
import copy


from scipy.io import arff
from sklearn.model_selection import KFold
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
import re
from random import randrange


# Preparação de dados

* extração do dataframe

In [2]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc1.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
kc1 = pd.DataFrame(data)
kc1 = kc1.sample(frac=1, random_state=20)
kc1.reset_index(drop=True, inplace=True)

In [3]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc2.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
kc2 = pd.DataFrame(data)
kc2 = kc2.sample(frac=1, random_state=20)
kc2.reset_index(drop=True, inplace=True)

# Algoritmos

In [4]:
def  euclidian_distance(x1, x2):
    arr = []
    for i in range(len(x1)-1):
        arr.append((x1[i] - x2[i])** 2)
    return np.sqrt(np.sum(arr))


## Learning Vector Quantization

### LVQ 1

In [5]:
class LVQ1:   
    
    def __init__(self, n_prototypes):
        self.n_prototypes  = n_prototypes
        self.epochs        = 10
        self.l_rate        = 0.25
        
    def getNeighborPrototype(self, prototypes, row):
        distances = []
        for proto in prototypes:
            dist = euclidian_distance(proto, row)
            distances.append((proto, dist))
        distances.sort(key=lambda tup: tup[1])
        return distances[0][0]
    
    def random_prototype(self, train):
        n_records  = train.shape[0]
        n_features = train.shape[1]
        prototype = []
        prototype = [train.iloc[randrange(n_records)][i] for i in range(n_features)]
        return prototype            
    
    def trainPrototypes(self,train):
        prototypes = [self.random_prototype(train) for i in range(self.n_prototypes)]
        for epoch in range(self.epochs):
            rate = self.l_rate * (1 - (epoch/float(self.epochs) ) )
            for _, row in train.iterrows():
                n_proto = self.getNeighborPrototype(prototypes, row)
                for i in range(len(row) -1):
                    error = (row[i]) - (n_proto[i])
                    if(row[-1] == n_proto[-1]):
                        n_proto[i] += rate * error
                    else:
                        n_proto[i] -= rate * error
        return prototypes

    

In [6]:
lvq1 = LVQ1(10)
res  = lvq1.trainPrototypes(kc2)

df = pd.DataFrame(res, columns = kc2.columns)
df

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,lOCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,problems
0,-354.322886,24.143254,6.43038,9.321776,-1125.643995,39687.614017,-3.218654,12.325464,-317.37205,-89416.75,...,372.064428,17.286992,142.30286,-2.403674,35.350395,-214.043266,695.721318,-599.829642,-42.20296,b'yes'
1,-92.277097,-15.296767,-2.287849,-9.461425,-309.349604,-1991.154614,0.833046,-16.476355,-44.378064,-84376.67,...,-75.650139,-5.683456,-9.380122,-3.008971,-4.259133,-30.145506,-178.90435,-129.33569,-32.712264,b'no'
2,8451.696972,2481.145411,87.632546,24.38873,-10617.83972,-57758.64286,103.398226,-1540.758437,-4867.456481,-56088.96,...,1421.107407,7.694363,-593.471549,14.605504,-734.820127,-2562.638314,8845.189453,-4663.032743,192.174484,b'yes'
3,-110.648277,-18.835094,-7.431491,-9.26056,-287.662483,-2168.023083,0.378138,-4.644287,-26.954867,-119172.8,...,-83.543892,-8.126048,-11.99098,-1.568911,3.585468,-24.152901,-179.121212,-122.15321,-37.078839,b'no'
4,2849.722134,-58.702746,-1.075496,-36.250042,-1073.54749,27870.728955,9.212639,-80.671998,560.208487,-77916.82,...,229.836248,54.165785,-19.209856,120.542262,27.36005,487.382013,-713.424354,97.423804,-20.454265,b'yes'
5,-99.37916,-16.945821,-6.671703,-8.263759,-257.598395,-1947.508863,0.373917,-3.549713,-23.906106,-106720.1,...,-74.398714,-7.223819,-10.714868,-1.443906,3.826924,-21.76547,-161.818876,-110.278893,-33.193605,b'no'
6,-87.053223,50.967773,-25.767822,-46.76123,-1198.205811,-7510.884897,0.167007,-21.12686,-192.453689,-133872.8,...,-361.928955,-28.415527,-46.67627,-5.705566,-17.383545,-88.506836,-365.65625,-470.15332,-117.566895,b'no'
7,786.69632,112.381961,71.873738,87.686861,2427.264275,19552.334382,0.013485,81.366251,249.674457,1339630.0,...,659.259598,37.953915,77.709499,7.100726,35.3062,200.491773,1490.13916,937.832017,221.309296,b'yes'
8,-13.127739,-1.687852,-0.602748,-0.180753,-43.963087,-327.534254,0.432394,0.737386,0.239205,-11235.11,...,-11.286685,-0.930843,-1.568404,-0.152504,3.748877,-4.119629,-27.341208,-20.259045,-4.082923,b'no'
9,-87.947271,-15.03103,-5.896068,-7.249341,-227.315737,-1725.91205,0.369751,-2.460258,-20.732885,-94262.97,...,-65.220017,-6.337277,-9.417536,-1.315021,4.059203,-19.311612,-144.23926,-98.19692,-29.290469,b'no'


### LVQ 2.1

In [7]:
class LVQ2_1:   
    
    def __init__(self, n_prototypes, window):
        self.n_prototypes  = n_prototypes
        self._epochs        = 10
        self._lrate        = 0.25
        self._window        = window
        
    def getNeighborPrototype(self, prototypes, row):
        distances  = []
        final_dist = []
        for _,proto in prototypes.iterrows():
            dist = euclidian_distance(proto, row)
            distances.append((proto, dist))
        distances.sort(key=lambda tup: tup[1])
        final_dist.append(distances[1])
        final_dist.append(distances[2])
        return final_dist
    
    def inside(self):
        return (1.0 - self._window) / (1.0 + self._window)
    
    def window(self, neighbors):
        di = neighbors[0][1]
        dj = neighbors[1][1]
        a  = di/dj
        b  = dj/di
        minimum = min(a,b)
        return minimum > self.inside()
    
    def trainPrototypes(self,train):
        prototypes = copy.deepcopy(train)
        for epoch in range(self._epochs):
            rate = self._lrate * (1 - (epoch / float(self._epochs) ) )
            for _, row in train.iterrows():
                n_proto = self.getNeighborPrototype(prototypes, row)
                
                n1 = n_proto[0][0]
                n2 = n_proto[1][0]

                isWindow = self.window(n_proto)
                if isWindow or n1[-1] != n2[-1]:
                    for i in range(len(row) - 2):
                        error = (row[i]) - (n1[i])
                        if n1[-1] == row[-1]:
                            n1[i] += rate * error
                        else:
                            n1[i] -= rate * error


        return prototypes

In [9]:
lvq2 = LVQ2_1(10, 0.25)
res  = lvq2.trainPrototypes(df)
df = pd.DataFrame(res, columns = kc2.columns)
df

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,lOCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,problems
0,-354.322886,24.143254,6.43038,9.321776,-1125.643995,39687.614017,-3.218654,12.325464,-317.37205,-89416.75,...,372.064428,17.286992,142.30286,-2.403674,35.350395,-214.043266,695.721318,-599.829642,-42.20296,b'yes'
1,-92.277097,-15.296767,-2.287849,-9.461425,-309.349604,-1991.154614,0.833046,-16.476355,-44.378064,-84376.67,...,-75.650139,-5.683456,-9.380122,-3.008971,-4.259133,-30.145506,-178.90435,-129.33569,-32.712264,b'no'
2,8451.696972,2481.145411,87.632546,24.38873,-10617.83972,-57758.64286,103.398226,-1540.758437,-4867.456481,-56088.96,...,1421.107407,7.694363,-593.471549,14.605504,-734.820127,-2562.638314,8845.189453,-4663.032743,192.174484,b'yes'
3,-110.648277,-18.835094,-7.431491,-9.26056,-287.662483,-2168.023083,0.378138,-4.644287,-26.954867,-119172.8,...,-83.543892,-8.126048,-11.99098,-1.568911,3.585468,-24.152901,-179.121212,-122.15321,-37.078839,b'no'
4,2849.722134,-58.702746,-1.075496,-36.250042,-1073.54749,27870.728955,9.212639,-80.671998,560.208487,-77916.82,...,229.836248,54.165785,-19.209856,120.542262,27.36005,487.382013,-713.424354,97.423804,-20.454265,b'yes'
5,-99.37916,-16.945821,-6.671703,-8.263759,-257.598395,-1947.508863,0.373917,-3.549713,-23.906106,-106720.1,...,-74.398714,-7.223819,-10.714868,-1.443906,3.826924,-21.76547,-161.818876,-110.278893,-33.193605,b'no'
6,-87.053223,50.967773,-25.767822,-46.76123,-1198.205811,-7510.884897,0.167007,-21.12686,-192.453689,-133872.8,...,-361.928955,-28.415527,-46.67627,-5.705566,-17.383545,-88.506836,-365.65625,-470.15332,-117.566895,b'no'
7,786.69632,112.381961,71.873738,87.686861,2427.264275,19552.334382,0.013485,81.366251,249.674457,1339630.0,...,659.259598,37.953915,77.709499,7.100726,35.3062,200.491773,1490.13916,937.832017,221.309296,b'yes'
8,-13.127739,-1.687852,-0.602748,-0.180753,-43.963087,-327.534254,0.432394,0.737386,0.239205,-11235.11,...,-11.286685,-0.930843,-1.568404,-0.152504,3.748877,-4.119629,-27.341208,-20.259045,-4.082923,b'no'
9,-87.947271,-15.03103,-5.896068,-7.249341,-227.315737,-1725.91205,0.369751,-2.460258,-20.732885,-94262.97,...,-65.220017,-6.337277,-9.417536,-1.315021,4.059203,-19.311612,-144.23926,-98.19692,-29.290469,b'no'


### LVQ 3

## KNN 


In [10]:
class KNN:

    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y


    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)
        
    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
          
        k_indices = np.argsort(distances)[:self.k]        
        k_nearest_labels = [self.y_train.iloc[i] for i in k_indices]

        unique, counts = np.unique(k_nearest_labels, return_counts=True)
        predicted = unique[counts.argmax()]

        return predicted

In [11]:
def attr_class(df, _class):
    Y = df[_class]
    X = df.drop(columns=[_class]) 
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_scaled = scaler.fit_transform(X)
    X = pd.DataFrame(x_scaled)
    
    return X, Y

In [18]:
def run(df, _class, title):
    kf = KFold(n_splits=5)
    results = []
    n_proto = [10, 50, 100, 200]
    for n in n_proto:
        for k in [1,3]:
            accuracies_simples = []
            accuracies_lvq1   = []
            accuracies_lvq2   = []
            index = 0
            for train, test in kf.split(df):
                #separação conj de teste atributos e classe
                attr, df_class = attr_class(df, _class)

                # KNN com conjunto normal
                knn  = KNN(k)
                knn.fit(attr, df_class)
                predictions_simples = knn.predict(attr.iloc[test])
                acc_simples = (np.sum(predictions_simples == df_class.iloc[test]) / len(test)) * 100
                accuracies_simples.append(acc_simples)
                
                #definição do dataset de LVQ1
                lvq1 = LVQ1(n)
                res  = lvq1.trainPrototypes(df.iloc[train])
                lvq1_df  = pd.DataFrame(res, columns = df.columns)
                
                #separação do treinamento LVQ1 atributos e classe
                X, Y = attr_class(lvq1_df, _class)
                
                #KNN com conjunto do LVQ1
                knn  = KNN(k)
                knn.fit(X, Y)
                predictions_lvq1 = knn.predict(attr.iloc[test])
                acc_lvq1 = (np.sum(predictions_lvq1 == df_class.iloc[test]) / len(test)) * 100
                accuracies_lvq1.append(acc_lvq1)
                
                #definição do dataset de LVQ2.1
                lvq2 = LVQ2_1(n, 0.25)
                res  = lvq2.trainPrototypes(lvq1_df)
                lvq2_df  = pd.DataFrame(res, columns = df.columns)
                
                #separação do treinamento LVQ2.1 atributos e classe
                X, Y = attr_class(lvq2_df, _class)
                
                #KNN com conjunto do LVQ2.1
                knn  = KNN(k)
                knn.fit(X, Y)
                predictions_lvq2 = knn.predict(attr.iloc[test])
                acc_lvq2 = (np.sum(predictions_lvq2 == df_class.iloc[test]) / len(test)) * 100
                accuracies_lvq2.append(acc_lvq2)
            
            print("({}, {}): simples {} lvq1 {} lvq2 {}".format(k, n, np.mean(accuracies_simples),np.mean(accuracies_lvq1),np.mean(accuracies_lvq2)))
            temp = {
                'dataframe': title,
                '(k, prototypes)':(k, n),
                'acc_simples': np.mean(accuracies_simples),
                'std_simples': np.std(accuracies_simples),
                'acc_LVQ1': np.mean(accuracies_lvq1),
                'std_LVQ1': np.std(accuracies_lvq1),
                'acc_LVQ2.1': np.mean(accuracies_lvq2),
                'std_LVQ2.1': np.std(accuracies_lvq2),
            }
            results.append(temp)
    df = pd.DataFrame(results)
    return df

df = run(kc2, 'problems', 'KC2')
df

(1, 10): simples 98.46520146520147 lvq1 68.26739926739927 lvq2 68.26739926739927
(3, 10): simples 88.13003663003664 lvq1 79.88644688644688 lvq2 79.88644688644688
(1, 50): simples 98.46520146520147 lvq1 80.07509157509158 lvq2 80.07509157509158
(3, 50): simples 88.13003663003664 lvq1 79.88644688644688 lvq2 79.88644688644688
(1, 100): simples 98.46520146520147 lvq1 79.69413919413918 lvq2 79.69413919413918
(3, 100): simples 88.13003663003664 lvq1 79.88461538461539 lvq2 79.88461538461539
(1, 200): simples 98.46520146520147 lvq1 79.69413919413918 lvq2 79.69413919413918
(3, 200): simples 88.13003663003664 lvq1 79.5018315018315 lvq2 79.5018315018315


Unnamed: 0,dataframe,"(k, prototypes)",acc_simples,std_simples,acc_LVQ1,std_LVQ1,acc_LVQ2.1,std_LVQ2.1
0,KC2,"(1, 10)",98.465201,0.771986,68.267399,24.846289,68.267399,24.846289
1,KC2,"(3, 10)",88.130037,2.721456,79.886447,3.481658,79.886447,3.481658
2,KC2,"(1, 50)",98.465201,0.771986,80.075092,3.188909,80.075092,3.188909
3,KC2,"(3, 50)",88.130037,2.721456,79.886447,3.481658,79.886447,3.481658
4,KC2,"(1, 100)",98.465201,0.771986,79.694139,3.175095,79.694139,3.175095
5,KC2,"(3, 100)",88.130037,2.721456,79.884615,3.271954,79.884615,3.271954
6,KC2,"(1, 200)",98.465201,0.771986,79.694139,3.175095,79.694139,3.175095
7,KC2,"(3, 200)",88.130037,2.721456,79.501832,3.191471,79.501832,3.191471


## Plot

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [None]:
def _printComparative(df):
    

    fig = make_subplots(rows=1, cols=1,
    specs=[[{"type": "Scatter"}]],
    subplot_titles=("Acurácia"))

    
    fig.add_trace(go.Scatter(x=df.index, y = df['acc_simples'], name='Simples', mode = 'lines+markers'), col = 1, row = 1) 
    fig.add_trace(go.Scatter(x=df.index, y = df['acc_LVQ1'], name='LVQ1', mode = 'lines+markers'), col = 1, row = 1)
    fig.add_trace(go.Scatter(x=df.index, y = df['acc_LVQ2.1'], name='LVQ2.1', mode = 'lines+markers'), col = 1, row = 1)
    
    fig.update_layout(height=1000, width=1000)

    return fig


In [None]:
fig = _printComparative(df)
fig.show()

10:15
11:37 - faltam os 200 (mas já ta rodando eles tem um time