https://docs.google.com/forms/d/e/1FAIpQLSc7AeSivMlUKS490NAsBtZ9fweQvBQAZGlR-aKbGq2GIdS-_A/viewform?hr_submission=ChcIrpDc8gMSDwjt08HinAgSBgiGjdbHJxAB&authuser=1

In [1]:
import pandas as pd
import numpy as np

import urllib.request
import io
import time


from scipy.io import arff
from sklearn.model_selection import KFold
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
import re

# Preparação de dados

* extração do dataframe

In [2]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/pc1.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
pc1 = pd.DataFrame(data)

Y_pc1 = pc1['defects']
X_pc1 = pc1.drop(columns=['defects'])

#Transforming in boolean class
Y_pc1 = Y_pc1.apply(str).str.replace("b|'", '')
Y_pc1 = Y_pc1.apply(lambda x: True if x == 'true' else False)

#Normalizing values
scaler = MinMaxScaler(feature_range=(0, 1))
x_scaled = scaler.fit_transform(X_pc1)
X_pc1 = pd.DataFrame(x_scaled)

In [3]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc2.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
kc2 = pd.DataFrame(data)

Y_kc2 = kc2['problems']
X_kc2 = kc2.drop(columns=['problems'])

#Transforming in boolean class
Y_kc2 = Y_kc2.apply(str).str.replace("b|'", '')
Y_kc2 = Y_kc2.apply(lambda x: True if x == 'yes' else False)

#Normalizing values
scaler = MinMaxScaler(feature_range=(0, 1))
x_scaled = scaler.fit_transform(X_kc2)
X_kc2 = pd.DataFrame(x_scaled)

# Algoritmos

In [4]:
def  euclidian_distance(x1, x2):
    x = [x1.iloc[i] for i in range(len(x1))]
    return np.sqrt(np.sum((x-x2)**2))


In [37]:
def define_knn(algorithm, k):
    if algorithm == 'adaptive':
        return adaptive_KNN(k=k)
    if algorithm ==  'weight':
        return weight_KNN(k=k)
    return KNN(k=k)

In [6]:
def run(X, Y, algorithm, df):
    kf = KFold(n_splits=10)

    results = []
    totalTime = 0

    for k in [1,2,3,5,7,9,11,13,15]:
        begin = time.time()

        accuracies = []
        knn = define_knn(algorithm, k)

        for train, test in kf.split(X):
            knn.fit(X.iloc[train], Y.iloc[train])
            predictions = knn.predict(X.iloc[test])

            acc = (np.sum(predictions == Y.iloc[test]) / len(test)) * 100
            accuracies.append(acc)
        end = time.time()
        totalTime += end-begin
        temp= {'k': k,
            'mean': np.mean(accuracies),
            'std': np.std(accuracies),
            'time':end - begin,
            'df': df}

        results.append(temp)


    print("Tempo total para df {}: {}".format(df, totalTime))

    return pd.DataFrame(results)
  

## KNN 


In [7]:
class KNN:

    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y


    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)
        
    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
          
        k_indices = np.argsort(distances)[:self.k]        
        k_nearest_labels = [self.y_train.iloc[i] for i in k_indices]

        unique, counts = np.unique(k_nearest_labels, return_counts=True)
        predicted = unique[counts.argmax()]

        return predicted

In [8]:
kc2_df_knn = run(X_kc2, Y_kc2, 'knn', 'KC2')
kc2_df_knn


Tempo total para df KC2: 2060.221460342407


Unnamed: 0,k,mean,std,time,df
0,1,70.667634,19.064782,213.206993,KC2
1,2,76.186502,27.794459,220.603117,KC2
2,3,74.484761,22.780157,269.800265,KC2
3,5,78.127721,26.246144,234.947282,KC2
4,7,78.501451,26.629693,219.454844,KC2
5,9,78.87881,26.536029,211.373539,KC2
6,11,77.9209,27.900475,257.128017,KC2
7,13,76.959361,29.559018,213.338469,KC2
8,15,76.37881,28.889339,220.368934,KC2


In [9]:
pc1_df_knn = run(X_pc1, Y_pc1,'knn', 'PC1')
pc1_df_knn


Tempo total para df PC1: 11465.906616926193


Unnamed: 0,k,mean,std,time,df
0,1,87.56593,19.19822,976.369106,PC1
1,2,91.351351,20.318062,901.059239,PC1
2,3,89.099099,19.743711,856.642864,PC1
3,5,90.720721,20.108665,1250.178361,PC1
4,7,91.261261,20.257453,1240.413508,PC1
5,9,91.981982,20.482176,1314.742437,PC1
6,11,92.072072,20.523147,1679.3209,PC1
7,13,92.072072,20.523147,1687.926765,PC1
8,15,92.072072,20.527102,1559.253436,PC1


## KNN com peso


In [38]:
class weight_KNN:

    def __init__(self, k):
        self.k = k
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def weight(self,weighted, k_nearest_labels):
        pos = 0
        neg = 0
        for i in range(len(weighted)):  
            if k_nearest_labels[i] == True:
                pos += weighted[i]
            else:
                neg += weighted[i]
        return True if pos > neg else False
    
    def distance(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
        return distances

    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)

    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
            
        k_indices = np.argsort(distances)[:self.k]        
        k_nearest_labels = [self.y_train.iloc[i] for i in k_indices]

        weighted = [(1/(k**2) if k!= 0 else 0) for k in k_indices]
        predicted = self.weight(weighted, k_nearest_labels)
        
        return predicted

In [39]:
kc2_df_weight = run(X_kc2, Y_kc2, 'weight','KC2')
kc2_df_weight
#tava kc2_df

Tempo total para df KC2: 2290.6523818969727


Unnamed: 0,k,mean,std,time,df
0,1,70.667634,19.064782,163.839108,KC2
1,2,76.190131,27.122421,223.706293,KC2
2,3,80.036284,26.314917,229.428169,KC2
3,5,81.360668,29.525742,209.589949,KC2
4,7,81.364296,29.937332,246.290663,KC2
5,9,81.941219,29.92648,259.106779,KC2
6,11,82.510885,29.827803,236.214994,KC2
7,13,82.510885,29.840199,358.258228,KC2
8,15,83.276488,29.2452,364.2182,KC2


In [47]:
pc1_df_weight = run(X_pc1, Y_pc1,'weight', 'pc1')
pc1_df_weight

Tempo total para df pc1: 5889.178629398346


Unnamed: 0,k,mean,std,time,df
0,1,87.56593,19.19822,798.530815,pc1
1,2,83.330876,18.169756,674.359384,pc1
2,3,80.357084,17.755536,676.017991,pc1
3,5,74.679771,17.678081,680.25683,pc1
4,7,69.722359,17.723678,665.444334,pc1
5,9,65.665029,17.364219,629.344182,pc1
6,11,62.059787,17.149154,588.551615,pc1
7,13,59.447174,17.635201,589.551254,pc1
8,15,56.559378,17.434812,587.122225,pc1


## KNN Adaptativo


In [20]:
import random

max_dist = 99999

class adaptive_KNN:

    def __init__(self, k):
        self.k = k
        
    def adaptive_rule(self):
        epsilon = 0.00000000001
        radius = []
        for idx in range(len(self.y_train)):
            temp = []
            for jdx in range(len(self.y_train)):
                if self.y_train.iloc[idx] != self.y_train.iloc[jdx]:
                    dist = euclidian_distance(self.X_train.iloc[idx], self.X_train.iloc[jdx]) - epsilon
                    if dist >= 0:
                        temp.append(dist)
            if len(temp) == 0:
                temp.append(max_dist)
            radius.append(min(temp))
        return radius
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.radius = self.adaptive_rule()


    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)
        
    def predict_label(self, k_nearest_radius, pos=0, neg=0):
        for k in range(len(k_nearest_radius)):
            if k_nearest_radius[k] == 99999:
                pass
            elif self.y_train.iloc[k] == True:
                ++pos
            elif self.y_train.iloc[k] == False:
                ++neg
        if pos > neg:
            return True
        elif neg > pos:
            return False
        else: 
            return True if random.randint(1,100) % 2 == 0 else False
        
    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            calc = euclidian_distance(x, row)
            distances.append(calc)
        
        rad_distance = []
        max_dist = 99999
        for d in range(len(distances)):
            aux = distances[d] / self.radius[d]
            if(aux > 1):
                rad_distance.append(max_dist)
            else:
                rad_distance.append(aux)

        k_indices = np.argsort(rad_distance)[:self.k]        
        k_nearest_radius = [self.y_train.iloc[i] for i in k_indices]

        return self.predict_label(k_nearest_radius,0,0)

In [21]:
kc2_df_adaptive  = run(X_kc2, Y_kc2, 'adaptive', 'KC2')
kc2_df_adaptive

Tempo total para df KC2: 6670.185385465622


Unnamed: 0,k,mean,std,time,df
0,1,50.761974,4.584698,724.515939,KC2
1,2,49.034833,3.211461,742.638317,KC2
2,3,49.401306,7.282109,744.234755,KC2
3,5,48.272859,6.030632,749.587077,KC2
4,7,50.965167,5.569878,747.844589,KC2
5,9,49.027576,4.023547,746.985497,KC2
6,11,49.052975,4.061639,751.480814,KC2
7,13,48.095065,6.550452,735.703267,KC2
8,15,51.164731,6.924774,727.195131,KC2


In [22]:
pc1_df_adaptive = run(X_pc1, Y_pc1, 'adaptive', 'pc1')
pc1_df_adaptive


Tempo total para df pc1: 15583.800250291824


Unnamed: 0,k,mean,std,time,df
0,1,50.495495,2.7507,1845.983769,pc1
1,2,49.23751,3.815599,1889.33604,pc1
2,3,51.400491,3.0088,1872.38404,pc1
3,5,48.338247,4.636656,1808.495654,pc1
4,7,50.400491,4.925641,1641.389966,pc1
5,9,48.331695,4.522763,1642.428043,pc1
6,11,49.592957,2.48821,1640.094107,pc1
7,13,50.044226,3.503615,1641.703053,pc1
8,15,50.043407,3.62941,1601.985579,pc1


# Graficos


In [48]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [101]:
def format_df(df): 
    df['rate'] = df['mean']/df['time']
    df.set_index('k', inplace=True)

    return df

In [112]:
kc2_df_1 = format_df(kc2_df_knn)
pc1_df_1 = format_df(pc1_df_knn)

kc2_df_2 = format_df(kc2_df_weight)
pc1_df_2 = format_df(pc1_df_weight)

kc2_df_3 = format_df(kc2_df_adaptive)
pc1_df_3 = format_df(pc1_df_adaptive)



In [113]:
print(kc2_df_1)
print(pc1_df_1)
print(kc2_df_2)
print(pc1_df_2)
print(kc2_df_3)
print(pc1_df_3)


         mean        std        time   df      rate
k                                                  
1   70.667634  19.064782  213.206993  KC2  0.331451
2   76.186502  27.794459  220.603117  KC2  0.345356
3   74.484761  22.780157  269.800265  KC2  0.276074
5   78.127721  26.246144  234.947282  KC2  0.332533
7   78.501451  26.629693  219.454844  KC2  0.357711
9   78.878810  26.536029  211.373539  KC2  0.373173
11  77.920900  27.900475  257.128017  KC2  0.303043
13  76.959361  29.559018  213.338469  KC2  0.360738
15  76.378810  28.889339  220.368934  KC2  0.346595
         mean        std         time   df      rate
k                                                   
1   87.565930  19.198220   976.369106  PC1  0.089685
2   91.351351  20.318062   901.059239  PC1  0.101382
3   89.099099  19.743711   856.642864  PC1  0.104010
5   90.720721  20.108665  1250.178361  PC1  0.072566
7   91.261261  20.257453  1240.413508  PC1  0.073573
9   91.981982  20.482176  1314.742437  PC1  0.069962
11  

# Comparativo

In [136]:
def _printComparative(df_1, df_2, df_3):

    fig = make_subplots(rows=3, cols=1,
    row_heights=[0.3, 0.3, 0.3],column_widths=[1],
    specs=[[{"type": "Scatter"}], 
           [{"type": "Scatter"}], 
           [{"type": "Scatter"}]],
    subplot_titles=("Média", "Desvio Padrão", "Tempo"))

    
    fig.add_trace(go.Scatter(x=df_1.index, y = df_1['mean'], name='KNN', mode = 'lines+markers'), col = 1, row = 1) 
    fig.add_trace(go.Scatter(x=df_1.index, y = df_2['mean'], name='Weight', mode = 'lines+markers'), col = 1, row = 1)
    fig.add_trace(go.Scatter(x=df_1.index, y = df_3['mean'], name='Adaptativo', mode = 'lines+markers'), col = 1, row = 1)
     
    fig.add_trace(go.Scatter(x=df_2.index, y = df_1['std'], name='KNN', mode = 'lines+markers'), col = 1, row = 2) 
    fig.add_trace(go.Scatter(x=df_2.index, y = df_2['std'], name='Weight', mode = 'lines+markers'), col = 1, row = 2)
    fig.add_trace(go.Scatter(x=df_2.index, y = df_3['std'], name='Adaptativo', mode = 'lines+markers'), col = 1, row = 2)
     
    fig.add_trace(go.Scatter(x=df_3.index, y = df_1['time'], name='KNN', mode = 'lines+markers'), col = 1, row = 3)
    fig.add_trace(go.Scatter(x=df_3.index, y = df_2['time'], name='Weight', mode = 'lines+markers'), col = 1, row = 3)
    fig.add_trace(go.Scatter(x=df_3.index, y = df_3['time'], name='Adaptativo', mode = 'lines+markers'), col = 1, row = 3)
    
    fig.update_layout(height=1000, width=1000)

    return fig

In [137]:
kc2_fig = _printComparative(kc2_df_1, kc2_df_2, kc2_df_3)
pc1_fig = _printComparative(pc1_df_1, pc1_df_2, pc1_df_3)

In [138]:
kc2_fig.show()

In [139]:
pc1_fig.show()

# Relação Tempo vs Acurrácia média

In [130]:
def _printRelation(df_1, df_2, df_3):
    
    fig  =  make_subplots(rows=3, cols=1,
            row_heights=[0.3, 0.3, 0.3],column_widths=[1],
            specs=[[{"type": "xy", "secondary_y": True}], 
                   [{"type": "xy", "secondary_y": True}], 
                   [{"type": "xy", "secondary_y": True}]],
                   subplot_titles=("KNN", "Com peso", "Adaptative"))

    
    fig.add_trace(go.Scatter(x=df_1.index, y = df_1['mean'], name='Média', mode = 'lines+markers'),col = 1, row = 1)
    fig.add_trace(go.Scatter(x=df_1.index, y = df_1['time'], name='Tempo', mode = 'lines+markers'),col = 1, row = 1, secondary_y=True,)
            
    fig.add_trace(go.Scatter(x=df_2.index, y = df_2['mean'], name='Média', mode = 'lines+markers'), col = 1, row = 2) 
    fig.add_trace(go.Scatter(x=df_2.index, y = df_2['time'], name='Tempo', mode = 'lines+markers'),col = 1, row = 2, secondary_y=True,)
            
            
    fig.add_trace(go.Scatter(x=df_3.index, y = df_3['mean'], name='Média', mode = 'lines+markers'), col = 1, row = 3) 
    fig.add_trace(go.Scatter(x=df_3.index, y = df_3['time'], name='Tempo', mode = 'lines+markers'),col = 1, row = 3, secondary_y=True,)
    
    fig.update_layout(height=1000, width=1000, title="Relação Tempo vs Acurácia")

    return fig


In [134]:
kc2_fig = _printRelation(kc2_df_1, kc2_df_2, kc2_df_3)
pc1_fig = _printRelation(pc1_df_1, pc1_df_2, pc1_df_3)

In [135]:
kc2_fig.show()

In [133]:
pc1_fig.show()