https://docs.google.com/forms/d/e/1FAIpQLSc7AeSivMlUKS490NAsBtZ9fweQvBQAZGlR-aKbGq2GIdS-_A/viewform?hr_submission=ChcIrpDc8gMSDwjt08HinAgSBgiGjdbHJxAB&authuser=1

In [1]:
import pandas as pd
import numpy as np

import urllib.request
import io
import time


from scipy.io import arff
from sklearn.model_selection import KFold
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
import re

# Preparação de dados

* extração do dataframe

In [2]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/pc1.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
pc1 = pd.DataFrame(data)

Y_pc1 = pc1['defects']
X_pc1 = pc1.drop(columns=['defects'])

#Transforming in boolean class
Y_pc1 = Y_pc1.apply(str).str.replace("b|'", '')
Y_pc1 = Y_pc1.apply(lambda x: True if x == 'true' else False)

#Normalizing values
scaler = MinMaxScaler(feature_range=(0, 1))
x_scaled = scaler.fit_transform(X_pc1)
X_pc1 = pd.DataFrame(x_scaled)

In [3]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc2.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
kc2 = pd.DataFrame(data)

Y_kc2 = kc2['problems']
X_kc2 = kc2.drop(columns=['problems'])

#Transforming in boolean class
Y_kc2 = Y_kc2.apply(str).str.replace("b|'", '')
Y_kc2 = Y_kc2.apply(lambda x: True if x == 'yes' else False)

#Normalizing values
scaler = MinMaxScaler(feature_range=(0, 1))
x_scaled = scaler.fit_transform(X_kc2)
X_kc2 = pd.DataFrame(x_scaled)

# Algoritmos

In [4]:
def  euclidian_distance(x1, x2):
    x = [x1.iloc[i] for i in range(len(x1))]
    return np.sqrt(np.sum((x-x2)**2))


In [36]:
def define_knn(algorithm, k):
    if algorithm == 'adaptive':
        return adaptive_KNN(k=k)
    if algorithm ==  'weight':
        return weight_KNN(k=k)
    return KNN(k=k)

In [6]:
def run(X, Y, algorithm, df):
    kf = KFold(n_splits=5)

    results = []
    totalTime = 0

    for k in [1,2,3,5,7,9,11,13,15]:
        begin = time.time()

        accuracies = []
        knn = define_knn(algorithm, k)

        for train, test in kf.split(X):
            knn.fit(X.iloc[train], Y.iloc[train])
            predictions = knn.predict(X.iloc[test])

            acc = (np.sum(predictions == Y.iloc[test]) / len(test)) * 100
            accuracies.append(acc)
        end = time.time()
        totalTime += end-begin
        temp= {'k': k,
            'mean': np.mean(accuracies),
            'std': np.std(accuracies),
            'time':end - begin,
            'df': df}

        results.append(temp)


    print("Tempo total para df {}: {}".format(df, totalTime))

    return pd.DataFrame(results)
  

## KNN 


In [7]:
class KNN:

    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y


    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)
        
    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
          
        k_indices = np.argsort(distances)[:self.k]        
        k_nearest_labels = [self.y_train.iloc[i] for i in k_indices]

        unique, counts = np.unique(k_nearest_labels, return_counts=True)
        predicted = unique[counts.argmax()]

        return predicted

In [8]:
kc2_df_knn = run(X_kc2, Y_kc2, 'knn', 'KC2')
kc2_df_knn


Tempo total para df KC2: 2549.276913881302


Unnamed: 0,k,mean,std,time,df
0,1,65.09707,25.407862,252.807636,KC2
1,2,72.74359,32.226015,290.484875,KC2
2,3,68.919414,29.443651,298.203528,KC2
3,5,72.53663,32.617817,220.148889,KC2
4,7,73.68315,34.073695,216.911322,KC2
5,9,73.108059,34.234855,227.077673,KC2
6,11,72.915751,35.158931,356.304306,KC2
7,13,71.959707,34.668995,328.961653,KC2
8,15,71.767399,34.558081,358.377032,KC2


In [9]:
pc1_df_knn = run(X_pc1, Y_pc1,'knn', 'PC1')
pc1_df_knn


Tempo total para df PC1: 8578.659676551819


Unnamed: 0,k,mean,std,time,df
0,1,86.124088,11.536515,1470.327176,PC1
1,2,90.27027,13.001096,1415.996934,PC1
2,3,88.737923,12.394484,1371.914071,PC1
3,5,90.089682,12.708468,756.489119,PC1
4,7,90.990583,12.986461,742.431059,PC1
5,9,92.162162,13.455129,727.684392,PC1
6,11,92.252252,13.505703,720.398937,PC1
7,13,92.342342,13.540514,724.80206,PC1
8,15,92.342342,13.540514,648.615929,PC1


## KNN com peso


In [37]:
class weight_KNN:

    def __init__(self, k):
        self.k = k
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def weight(self,weighted, k_nearest_labels):
        pos = 0
        neg = 0
        for i in range(len(weighted)):  
            if k_nearest_labels[i] == True:
                pos += weighted[i]
            else:
                neg += weighted[i]
        return True if pos > neg else False
    
    def distance(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
        return distances

    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)

    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
            
        k_indices = np.argsort(distances)[:self.k]        
        k_nearest_labels = [self.y_train.iloc[i] for i in k_indices]

        weighted = [(1/(k**2) if k!= 0 else 0) for k in k_indices]
        predicted = self.weight(weighted, k_nearest_labels)
        
        return predicted

In [38]:
kc2_df_weight = run(X_kc2, Y_kc2, 'weight','KC2')
kc2_df_weight
#tava kc2_df

Tempo total para df KC2: 1014.8650913238525


Unnamed: 0,k,mean,std,time,df
0,1,65.09707,25.407862,107.472769,KC2
1,2,74.283883,28.733271,108.481937,KC2
2,3,79.258242,29.117308,109.198128,KC2
3,5,80.212454,29.064423,109.281673,KC2
4,7,81.362637,29.182398,110.024766,KC2
5,9,82.126374,29.059932,115.532238,KC2
6,11,83.084249,28.100586,115.784961,KC2
7,13,83.276557,27.716248,115.997867,KC2
8,15,83.85348,27.513875,123.090753,KC2


In [74]:
pc1_df_weight = run(X_pc1, Y_pc1,'weight', 'pc1')
pc1_df_weight
#tava pc1_df

Tempo total para df pc1: 5821.450756072998


Unnamed: 0,k,mean,std,time,df
0,1,86.124088,11.536515,709.933398,pc1
1,2,81.797725,10.363201,752.11466,pc1
2,3,78.373487,10.176777,854.730891,pc1
3,5,73.327219,11.364935,594.230946,pc1
4,7,69.359993,11.76468,579.655435,pc1
5,9,66.294892,12.304727,582.318637,pc1
6,11,63.409156,12.204109,591.438612,pc1
7,13,59.620073,12.465629,571.512746,pc1
8,15,57.365374,12.614876,585.515432,pc1


## KNN Adaptativo


In [18]:
import random
max_dist = 99999

class adaptive_KNN:

    def __init__(self, k):
        self.k = k
        
    def adaptive_rule(self):
        epsilon = 0.00000000001
        radius = []
        for idx in range(len(self.y_train)):
            temp = []
            for jdx in range(len(self.y_train)):
                if self.y_train.iloc[idx] != self.y_train.iloc[jdx]:
                    dist = euclidian_distance(self.X_train.iloc[idx], self.X_train.iloc[jdx]) - epsilon
                    if dist >= 0:
                        temp.append(dist)
            if len(temp) == 0:
                temp.append(max_dist)
            radius.append(min(temp))
        return radius
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.radius = self.adaptive_rule()


    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)
        
    def predict_label(self, k_nearest_radius, pos=0, neg=0):
        for k in range(len(k_nearest_radius)):
            if k_nearest_radius[k] == 99999:
                pass
            elif self.y_train.iloc[k] == True:
                ++pos
            elif self.y_train.iloc[k] == False:
                ++neg
        if pos > neg:
            return True
        elif neg > pos:
            return False
        else: 
            return True if random.randint(1,100) % 2 == 0 else False
        
    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            calc = euclidian_distance(x, row)
            distances.append(calc)
        
        rad_distance = []
        max_dist = 99999
        for d in range(len(distances)):
            aux = distances[d] / self.radius[d]
            if(aux > 1):
                rad_distance.append(max_dist)
            else:
                rad_distance.append(aux)

        k_indices = np.argsort(rad_distance)[:self.k]        
        k_nearest_radius = [self.y_train.iloc[i] for i in k_indices]

        return self.predict_label(k_nearest_radius,0,0)

In [19]:
kc2_df_adaptive  = run(X_kc2, Y_kc2, 'adaptive', 'KC2')
kc2_df_adaptive

Tempo total para df KC2: 3198.0213174819946


Unnamed: 0,k,mean,std,time,df
0,1,54.776557,3.169771,353.22078,KC2
1,2,52.29304,6.397991,359.799462,KC2
2,3,53.252747,3.111642,353.010682,KC2
3,5,50.39011,3.100423,352.378041,KC2
4,7,50.954212,3.796794,352.794893,KC2
5,9,51.152015,1.938084,358.470111,KC2
6,11,48.659341,3.862813,359.656264,KC2
7,13,47.692308,5.527559,353.957198,KC2
8,15,47.686813,4.290004,354.733886,KC2


In [20]:
pc1_df_adaptive = run(X_pc1, Y_pc1, 'adaptive', 'pc1')
pc1_df_adaptive


Tempo total para df pc1: 9451.279965162277


Unnamed: 0,k,mean,std,time,df
0,1,49.2324,1.423229,1054.545997,pc1
1,2,49.319636,3.477774,1040.310866,pc1
2,3,49.684889,2.432478,1020.838834,pc1
3,5,48.245485,2.60848,1046.273393,pc1
4,7,49.683666,1.622132,1055.360662,pc1
5,9,50.407647,2.028823,1060.335409,pc1
6,11,50.043618,2.317209,1063.335384,pc1
7,13,49.234846,2.259838,1043.274689,pc1
8,15,51.307325,1.177927,1067.004731,pc1


# Graficos


In [75]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [92]:
def format_df(df): 
    df['rate'] = df['mean'] / df['time']
    df.set_index('k', inplace=True)

    return df

In [93]:
# kc2_df_1 = format_df(kc2_df_knn)
# pc1_df_1 = format_df(pc1_df_knn)

#kc2_df_2 = format_df(kc2_df_weight)
pc1_df_2 = format_df(pc1_df_weight)

# kc2_df_3 = format_df(kc2_df_adaptive)
# pc1_df_3 = format_df(pc1_df_adaptive)



KeyError: "None of ['k'] are in the columns"

In [95]:
print(kc2_df_1)
print(pc1_df_1)
print(kc2_df_2)
print(pc1_df_2)
print(kc2_df_3)
print(pc1_df_3)


         mean        std        time   df      rate
k                                                  
1   65.097070  25.407862  252.807636  KC2  0.257496
2   72.743590  32.226015  290.484875  KC2  0.250421
3   68.919414  29.443651  298.203528  KC2  0.231115
5   72.536630  32.617817  220.148889  KC2  0.329489
7   73.683150  34.073695  216.911322  KC2  0.339693
9   73.108059  34.234855  227.077673  KC2  0.321952
11  72.915751  35.158931  356.304306  KC2  0.204645
13  71.959707  34.668995  328.961653  KC2  0.218748
15  71.767399  34.558081  358.377032  KC2  0.200257
         mean        std         time   df      rate
k                                                   
1   86.124088  11.536515  1470.327176  PC1  0.058575
2   90.270270  13.001096  1415.996934  PC1  0.063750
3   88.737923  12.394484  1371.914071  PC1  0.064682
5   90.089682  12.708468   756.489119  PC1  0.119089
7   90.990583  12.986461   742.431059  PC1  0.122558
9   92.162162  13.455129   727.684392  PC1  0.126651
11  

# Comparative

In [104]:
def _printComparative(df_1, df_2, df_3):

    fig = make_subplots(rows=3, cols=1,
    row_heights=[0.3, 0.3, 0.3],column_widths=[1],
    specs=[[{"type": "Scatter"}], 
           [{"type": "Scatter"}], 
           [{"type": "Scatter"}]],
    subplot_titles=("Média", "Desvio Padrão", "Tempo"))

    
    fig.add_trace(go.Scatter(x=df_1.index, y = df_1['mean'], name='KNN', mode = 'lines+markers'), col = 1, row = 1) 
    fig.add_trace(go.Scatter(x=df_2.index, y = df_2['mean'], name='Weight', mode = 'lines+markers'), col = 1, row = 1)
    fig.add_trace(go.Scatter(x=df_3.index, y = df_3['mean'], name='Adaptativo', mode = 'lines+markers'), col = 1, row = 1)
     
    fig.add_trace(go.Scatter(x=df_1.index, y = df_1['std'], name='KNN', mode = 'lines+markers'), col = 1, row = 2) 
    fig.add_trace(go.Scatter(x=df_2.index, y = df_2['std'], name='Weight', mode = 'lines+markers'), col = 1, row = 2)
    fig.add_trace(go.Scatter(x=df_3.index, y = df_3['std'], name='Adaptativo', mode = 'lines+markers'), col = 1, row = 2)
     
    fig.add_trace(go.Scatter(x=df_1.index, y = df_1['time'], name='KNN', mode = 'lines+markers'), col = 1, row = 3)
    fig.add_trace(go.Scatter(x=df_2.index, y = df_2['time'], name='Weight', mode = 'lines+markers'), col = 1, row = 3)
    fig.add_trace(go.Scatter(x=df_3.index, y = df_3['time'], name='Adaptativo', mode = 'lines+markers'), col = 1, row = 3)
    
    fig.update_layout(height=1000, width=1000)

    return fig

In [105]:
kc2_fig = _printComparative(kc2_df_1, kc2_df_2, kc2_df_3)
pc1_fig = _printComparative(pc1_df_, pc1_df_2, pc1_df_3)

In [106]:
kc2_fig.show()

In [107]:
pc1_fig.show()

# Relação Tempo vs Acurácia Média

In [100]:
def _printRelation(df_1, df_2, df_3):
    
    fig  =  make_subplots(rows=3, cols=1,
            row_heights=[0.3, 0.3, 0.3],column_widths=[1],
            specs=[[{"type": "xy", "secondary_y": True}], 
                   [{"type": "xy", "secondary_y": True}], 
                   [{"type": "xy", "secondary_y": True}]],
                   subplot_titles=("KNN", "Com peso", "Adaptative"))

    
    fig.add_trace(go.Scatter(x=df_1.index, y = df_1['mean'], name='Média', mode = 'lines+markers'),col = 1, row = 1)
    fig.add_trace(go.Scatter(x=df_1.index, y = df_1['time'], name='Tempo', mode = 'lines+markers'),col = 1, row = 1, secondary_y=True,)
            
    fig.add_trace(go.Scatter(x=df_2.index, y = df_2['mean'], name='Média', mode = 'lines+markers'), col = 1, row = 2) 
    fig.add_trace(go.Scatter(x=df_2.index, y = df_2['time'], name='Tempo', mode = 'lines+markers'),col = 1, row = 2, secondary_y=True,)
            
            
    fig.add_trace(go.Scatter(x=df_3.index, y = df_3['mean'], name='Média', mode = 'lines+markers'), col = 1, row = 3) 
    fig.add_trace(go.Scatter(x=df_3.index, y = df_3['time'], name='Tempo', mode = 'lines+markers'),col = 1, row = 3, secondary_y=True,)
    
    fig.update_layout(height=1000, width=1000, title="Relação Tempo vs Acurácia")

    return fig


In [101]:
kc2_fig = _printRelation(kc2_df_1, kc2_df_2, kc2_df_3)
pc1_fig = _printRelation(pc1_df_3, pc1_df_2, pc1_df_3)

In [102]:
kc2_fig.show()

In [103]:
pc1_fig.show()