https://docs.google.com/forms/d/e/1FAIpQLSc7AeSivMlUKS490NAsBtZ9fweQvBQAZGlR-aKbGq2GIdS-_A/viewform?hr_submission=ChcIrpDc8gMSDwjt08HinAgSBgiGjdbHJxAB&authuser=1

In [1]:
import pandas as pd
import numpy as np

import urllib.request
import io
import time


from scipy.io import arff
from sklearn.model_selection import KFold
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
import re

# Preparação de dados

* extração do dataframe

In [2]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/pc1.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
pc1 = pd.DataFrame(data)

Y_pc1 = pc1['defects']
X_pc1 = pc1.drop(columns=['defects'])

#Transforming in boolean class
Y_pc1 = Y_pc1.apply(str).str.replace("b|'", '')
Y_pc1 = Y_pc1.apply(lambda x: True if x == 'true' else False)

#Normalizing values
scaler = MinMaxScaler(feature_range=(0, 1))
x_scaled = scaler.fit_transform(X_pc1)
X_pc1 = pd.DataFrame(x_scaled)

In [3]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc2.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
kc2 = pd.DataFrame(data)

Y_kc2 = kc2['problems']
X_kc2 = kc2.drop(columns=['problems'])

#Transforming in boolean class
Y_kc2 = Y_kc2.apply(str).str.replace("b|'", '')
Y_kc2 = Y_kc2.apply(lambda x: True if x == 'yes' else False)

#Normalizing values
scaler = MinMaxScaler(feature_range=(0, 1))
x_scaled = scaler.fit_transform(X_kc2)
X_kc2 = pd.DataFrame(x_scaled)

# Algoritmos

In [4]:
def  euclidian_distance(x1, x2):
    x = [x1.iloc[i] for i in range(len(x1))]
    return np.sqrt(np.sum((x-x2)**2))


## KNN 


In [5]:
class KNN:

    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y


    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)
        
    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
          
        k_indices = np.argsort(distances)[:self.k]        
        k_nearest_labels = [self.y_train.iloc[i] for i in k_indices]

        unique, counts = np.unique(k_nearest_labels, return_counts=True)
        predicted = unique[counts.argmax()]

        return predicted

In [6]:
def run(X, Y, df, 1):
    kf = KFold(n_splits=10)

    results = []
    totalTime = 0

    for k in [1,2,3,5,7,9,11,13,15]:
        begin = time.time()

        accuracies = []
        knn = KNN(k=k)

        for train, test in kf.split(X):
            knn.fit(X.iloc[train], Y.iloc[train])
            predictions = knn.predict(X.iloc[test])

            acc = (np.sum(predictions == Y.iloc[test]) / len(test)) * 100
            
            accuracies.append(acc)
            end = time.time()
            totalTime += end-begin
            temp= {'k': k,
                'mean': np.mean(accuracies),
                'std': np.std(accuracies),
                'time':end - begin,
                'df': df}

        results.append(temp)


    print("Tempo total para df {}: {}".format(df, totalTime))

    return pd.DataFrame(results)
  

In [7]:
kc2_df_knn = run(X_kc2, Y_kc2, 'KC2')
kc2_df_knn


Tempo total para df KC2: 9456.348000526428


Unnamed: 0,k,mean,std,time,df
0,1,70.667634,19.064782,187.24513,KC2
1,2,76.186502,27.794459,181.572228,KC2
2,3,74.484761,22.780157,180.815501,KC2
3,5,78.127721,26.246144,183.4653,KC2
4,7,78.501451,26.629693,184.169111,KC2
5,9,78.87881,26.536029,217.081368,KC2
6,11,77.9209,27.900475,225.125783,KC2
7,13,76.959361,29.559018,190.711529,KC2
8,15,76.37881,28.889339,187.469172,KC2


In [8]:
pc1_df_knn = run(X_pc1, Y_pc1, 'PC1')
pc1_df_knn


Tempo total para df PC1: 29379.468264579773


Unnamed: 0,k,mean,std,time,df
0,1,87.56593,19.19822,770.739493,PC1
1,2,91.351351,20.318062,576.887565,PC1
2,3,89.099099,19.743711,560.583727,PC1
3,5,90.720721,20.108665,558.40554,PC1
4,7,91.261261,20.257453,560.298419,PC1
5,9,91.981982,20.482176,558.72684,PC1
6,11,92.072072,20.523147,562.071414,PC1
7,13,92.072072,20.523147,565.197404,PC1
8,15,92.072072,20.527102,567.341444,PC1


## KNN com peso


In [9]:
class weight_KNN:

    def __init__(self, k):
        self.k = k
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def weight(self,weighted, k_nearest_labels):
        pos = 0
        neg = 0
        for i in range(len(weighted)):  
            if k_nearest_labels[i] == True:
                pos += weighted[i]
            else:
                neg += weighted[i]
        return True if pos > neg else False
    
    def distance(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
        return distances

    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)

    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            distances.append(euclidian_distance(x, row))
            
        k_indices = np.argsort(distances)[:self.k]        
        k_nearest_labels = [self.y_train.iloc[i] for i in k_indices]

        weighted = [(1/(k**2) if k!= 0 else 0) for k in k_indices]
        predicted = self.weight(weighted, k_nearest_labels)
        
        return predicted

In [10]:
def run_weight(X, Y, df):
    kf = KFold(n_splits=10)

    results = []
    totalTime = 0

    for k in [1,2,3,5,7,9,11,13,15]:
        begin = time.time()

        accuracies = []
        knn = weight_KNN(k=k)

        for train, test in kf.split(X):
            knn.fit(X.iloc[train], Y.iloc[train])
            predictions = knn.predict(X.iloc[test])

            acc = (np.sum(predictions == Y.iloc[test]) / len(test)) * 100
            accuracies.append(acc)
            end = time.time()
            totalTime += end-begin
            temp= {'k': k,
                'mean': np.mean(accuracies),
                'std': np.std(accuracies),
                'time':end - begin,
                'df': df}

        results.append(temp)


    print("Tempo total para df {}: {}".format(df, totalTime))

    return pd.DataFrame(results)
  

In [11]:
kc2_df_weight = run_weight(X_kc2, Y_kc2, 'KC2')
kc2_df_weight


Tempo total para df KC2: 6177.620866060257


Unnamed: 0,k,mean,std,time,df
0,1,70.667634,19.064782,123.916963,KC2
1,2,76.190131,27.122421,123.332981,KC2
2,3,80.036284,26.314917,123.849781,KC2
3,5,81.360668,29.525742,123.969609,KC2
4,7,81.364296,29.937332,124.557069,KC2
5,9,81.941219,29.92648,125.288208,KC2
6,11,82.510885,29.827803,125.126495,KC2
7,13,82.510885,29.840199,125.281602,KC2
8,15,83.276488,29.2452,124.309816,KC2


In [12]:
pc1_df = run_weight(X_pc1, Y_pc1, 'pc1')
pc1_df


Tempo total para df pc1: 27857.931182861328


Unnamed: 0,k,mean,std,time,df
0,1,87.56593,19.19822,567.144455,pc1
1,2,83.330876,18.169756,564.602772,pc1
2,3,80.357084,17.755536,564.402553,pc1
3,5,74.679771,17.678081,560.845669,pc1
4,7,69.722359,17.723678,566.622364,pc1
5,9,65.665029,17.364219,555.47315,pc1
6,11,62.059787,17.149154,557.994298,pc1
7,13,59.447174,17.635201,562.600748,pc1
8,15,56.559378,17.434812,563.249611,pc1


## KNN Adaptativo


In [16]:
import random
class Adaptive_KNN:

    def __init__(self, k):
        self.k = k
        
    def adaptive_rule(self):
        radius = []
        for idx in range(len(self.y_train)):
            temp = []
            for jdx in range(len(self.y_train)):
                if self.y_train.iloc[idx] != self.y_train.iloc[jdx]:
                    dist = euclidian_distance(self.X_train.iloc[idx], self.X_train.iloc[jdx]) - 0.00000000001
                    if dist >= 0:
                        temp.append(dist)
            radius.append(min(temp))
        return radius
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.radius = self.adaptive_rule()


    def predict(self, X):
        predicted_labels = []
        for _, row in X.iterrows():
            predicted_labels.append(self.predict_func(row))

        return np.array(predicted_labels)
        
    def predict_label(self, k_nearest_radius, pos=0, neg=0):
        for k in range(len(k_nearest_radius)):
            if k_nearest_radius[k] == 99999:
                pass
            elif self.y_train.iloc[k] == True:
                ++pos
            elif self.y_train.iloc[k] == False:
                ++neg
        if pos > neg:
            return True
        elif neg > pos:
            return False
        else: 
            return True if random.randint(1,100) % 2 == 0 else False
        
    def predict_func(self, x):
        distances = []
        for _, row in self.X_train.iterrows():
            calc = euclidian_distance(x, row)
            distances.append(calc)
        
        rad_distance = []
        max_dist = 99999
        for d in range(len(distances)):
            aux = distances[d] / self.radius[d]
            if(aux > 1):
                rad_distance.append(max_dist)
            else:
                rad_distance.append(aux)

        k_indices = np.argsort(rad_distance)[:self.k]        
        k_nearest_radius = [self.y_train.iloc[i] for i in k_indices]

        return self.predict_label(k_nearest_radius,0,0)

In [17]:
def run_3(X, Y, df):
    kf = KFold(n_splits=10)

    results = []
    totalTime = 0

    for k in [1,2,3,5,7,9,11,13,15]:
        begin = time.time()

        accuracies = []
        knn = Adaptive_KNN(k=k)

        for train, test in kf.split(X):
            knn.fit(X.iloc[train], Y.iloc[train])
            predictions = knn.predict(X.iloc[test])

            acc = (np.sum(predictions == Y.iloc[test]) / len(test)) * 100
            accuracies.append(acc)
        end = time.time()
        totalTime += end-begin
        temp= {'k': k,
            'mean': np.mean(accuracies),
            'std': np.std(accuracies),
            'time':end - begin,
            'df': df}  


        results.append(temp)


    print("Tempo total para df {}: {}".format(df, totalTime))

    return pd.DataFrame(results)
  

In [None]:
kc2_df_3  = run_3(X_kc2, Y_kc2, 'KC2')
kc2_df_3

In [None]:
kc2_df_3  = run_3(X_kc2, Y_kc2, 'KC2')
kc2_df_3

# Graficos


In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [None]:
def format_df(df): 
    df['max'] = df['mean'] + df['std']
    df['min'] = df['mean'] - df['std']
    df_1.set_index('k', inplace=True)

    return df

In [None]:
kc2_df_1 = format_df(kc2_df_1)
pc1_df_1 = format_df(pc1_df_1)

kc2_df_2 = format_df(kc2_df_2)
pc1_df_2 = format_df(pc1_df_2)

kc2_df_3 = format_df(kc2_df_3)
pc1_df_3 = format_df(pc1_df_3)



In [None]:
def _print2(df_1, df_2, df_3)

    fig = make_subplots(rows=2, cols=3,
    column_widths=[0.3, 0.3, 0.3],row_heights=[0.5, 0.5],
    specs=[[{"type": "Scatter"}, {"type": "Scatter"}, {"type": "Scatter"}],
           [{"type": "Scatter"}, {"type": "Scatter"}, {"type": "Scatter"}]])

    fig = go.Figure()
    
    fig.add_trace(go.Scatter(y = df_1['mean'], name='média - KNN', mode = 'lines+markers'), row = 1, col = 1)
    fig.add_trace(go.Scatter(y = df_1['max'], name='std - KNN',mode = 'lines',line=dict(width=1, dash='dash')), row = 1, col = 1)
    fig.add_trace(go.Scatter(y = df_1['min'], name='std - KNN', mode = 'lines',line=dict(width=1, dash='dash')), row = 1, col = 1)
    
    fig.add_trace(go.Scatter(y = df_2['mean'], name='média - com peso', mode = 'lines+markers'), row = 1, col = 1)
    fig.add_trace(go.Scatter(y = df_2['max'], name='std - com peso',mode = 'lines',line=dict(width=1, dash='dash')), row = 1, col = 2)
    fig.add_trace(go.Scatter(y = df_2['min'], name='std - com peso', mode = 'lines',line=dict(width=1, dash='dash')), row = 1, col = 2)
    
    fig.add_trace(go.Scatter(y = df_3['mean'], name='média - adaptativo', mode = 'lines+markers'), row = 1, col = 1)
    fig.add_trace(go.Scatter(y = df_3['max'], name='std - adaptativo',mode = 'lines',line=dict(width=1, dash='dash')), row = 1, col = 3)
    fig.add_trace(go.Scatter(y = df_3['min'], name='std - adaptativo', mode = 'lines',line=dict(width=1, dash='dash')), row = 1, col = 3)
    
    fig.add_trace(go.Scatter(y = df_3['time'], name='tempo - KNN', mode = 'lines+markers'), row = 2, col = 1)
    fig.add_trace(go.Scatter(y = df_3['time'], name='tempo - com peso', mode = 'lines+markers'), row = 2, col = 2)
    fig.add_trace(go.Scatter(y = df_3['time'], name='tempo - adaptativo', mode = 'lines+markers'), row = 2, col = 3)

    return fig

In [None]:
kc2_fig = _print2(kc2_df_1, kc2_df_2, kc2_df_3)
pc1_fig = _print2(pc1_df_3, pc1_df_2, pc1_df_3)

In [None]:
kc2_fig.show()

In [None]:
pc1_fig.show()