In [1]:
! pip install kneed

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
import pandas as pd
import plotly.express as px

import io
import urllib.request
from scipy.io import arff

from sklearn.preprocessing import MinMaxScaler
import math
import re

from kneed import KneeLocator
from sklearn.metrics import silhouette_score
from scipy.stats import hmean

from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

In [3]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc2.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
df = pd.DataFrame(data)
column = 'problems'

#Transforming in boolean class
df[column] = df[column].apply(str).str.replace("b|'", '')
df[column] = df[column].apply(lambda x: 1 if x == 'yes' else 0)

#Normalizing values
df

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,lOCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,problems
0,1.1,1.4,1.4,1.4,1.3,1.30,1.30,1.30,1.30,1.30,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,0
1,1.0,1.0,1.0,1.0,1.0,1.00,1.00,1.00,1.00,1.00,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
2,415.0,59.0,50.0,51.0,1159.0,8411.31,0.01,103.53,81.24,870848.58,...,359.0,35.0,9.0,10.0,47.0,106.0,692.0,467.0,106.0,1
3,230.0,33.0,10.0,16.0,575.0,3732.82,0.03,39.82,93.74,148644.06,...,174.0,15.0,34.0,5.0,23.0,67.0,343.0,232.0,65.0,1
4,175.0,26.0,12.0,13.0,500.0,3123.96,0.03,29.48,105.96,92103.07,...,142.0,7.0,19.0,4.0,18.0,58.0,310.0,190.0,51.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,4.0,1.0,1.0,1.0,5.0,11.61,0.50,2.00,5.80,23.22,...,2.0,0.0,0.0,0.0,4.0,1.0,4.0,1.0,1.0,1
518,4.0,1.0,1.0,1.0,4.0,8.00,0.67,1.50,5.33,12.00,...,2.0,0.0,0.0,0.0,3.0,1.0,3.0,1.0,1.0,1
519,4.0,1.0,1.0,1.0,4.0,8.00,0.67,1.50,5.33,12.00,...,2.0,0.0,0.0,0.0,3.0,1.0,3.0,1.0,1.0,1
520,4.0,1.0,1.0,1.0,5.0,11.61,0.67,1.50,7.74,17.41,...,2.0,0.0,0.0,0.0,3.0,2.0,3.0,2.0,1.0,1


In [4]:
df_true = df.loc[df[column] == True]
df_false = df.loc[df[column]== False]

df = pd.concat([df_false, df_true])

df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,lOCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,problems
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,0
1,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0
2,14.0,2.0,1.0,2.0,22.0,88.0,0.17,5.79,15.21,509.14,...,8.0,0.0,1.0,0.0,9.0,7.0,13.0,9.0,3.0,0
3,10.0,2.0,1.0,2.0,18.0,64.53,0.14,7.0,9.22,451.71,...,8.0,0.0,0.0,0.0,8.0,4.0,11.0,7.0,3.0,0
4,8.0,1.0,1.0,1.0,10.0,31.7,0.5,2.0,15.85,63.4,...,3.0,0.0,0.0,1.0,4.0,5.0,5.0,5.0,1.0,0


In [5]:
def attr_class(df, column):
    Y_df = df[column]
    X_df = df.drop(columns=[column])
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_scaled = scaler.fit_transform(X_df)
    X_df = pd.DataFrame(x_scaled)
    return X_df, Y_df


In [6]:
def reduce_dimensions(df, n_reduced, _column):
    df.reset_index(inplace=True, drop=True)

    pca = PCA(n_components=n_reduced)
    x, y = attr_class(df, _column)
    df_reduced = pca.fit_transform(x[x.columns].values)
    df_reduced = pd.DataFrame(df_reduced, columns=['dim_1', 'dim_2'])
    
    df_reduced['class'] = y
    return df_reduced

def plot_reduced(df, _column, title):
    df_plot = reduce_dimensions(df, 2, _column)
    fig = px.scatter(df_plot, x='dim_1', y='dim_2', color='class', title=title)
#     fig.show()
    

In [7]:
plot_reduced(df, column, "dataframe inicial")


In [8]:
#functions related to elbow definition
def plot_elbow(wcss, title):
    
    df = pd.DataFrame(wcss, columns=['wcss'])
    df['clusters'] = [2,3,4,5,6]
    fig = px.line(df, x = 'clusters', y='wcss', title = title)
#     fig.show()

def get_best_k(k_range, sse):
    kl = KneeLocator(k_range, sse, curve="convex", direction="decreasing")
    return kl.elbow


In [9]:
#define dataframe to true and false classes
def true_false_dfs(df):
    df_true = df.loc[df[column] == True]
    df_true =  df_true.drop(columns=column)

    df_false = df.loc[df[column]== False]
    df_false = df_false.drop(columns=column)
    
    return df_true, df_false

In [10]:
def elbow(df, title):
    print(title)
    print(df.shape)
    wsse = []
    for k in [2,3,4,5,6]:
        if(k <= len(df)):
            kmeans = KMeans(n_clusters=k, random_state=0)
            predict = kmeans.fit_transform(df)
            sse = kmeans.inertia_
            wsse.append(sse)
        else:
            wsse.append(0)
    plot_elbow(wsse, "elbow - {}".format(title))
    elbow = get_best_k([2,3,4,5,6], wsse)
    print("{}: {}".format(title, elbow))
    return elbow
        

In [11]:
def kmeans_func(df, elbow, title):
    kmeans = KMeans(n_clusters=elbow, random_state=0)
    predict = kmeans.fit_predict(df)
    df['meta_class'] = predict
    plot_reduced(df, 'meta_class', "Data Frame - {}".format(title)) 
    return df

In [12]:
def df_metaclass_func(df):
    
    df_true, df_false = true_false_dfs(df)
    
    if(len(df_true != 0)):
        elbow_true = elbow(df_true, "True")
        df_true = kmeans_func(df_true, elbow_true, "True")
        df_true['meta_class'] = "t_" + df_true['meta_class'].astype(str)
    else:
        df_true['meta_class'] = []
        
    if(len(df_false != 0)):
        elbow_false = elbow(df_false, "False")
        df_false = kmeans_func(df_false, elbow_false, "False")
        df_false['meta_class'] = "f_" + df_false['meta_class'].astype(str)
    else:
        df_false['meta_class'] = []

    df_res = pd.concat([df_false, df_true])
    
    plot_reduced(df_res, 'meta_class', "dataframe - metaclass")
    return df_res

In [13]:
def back_values(df):
    df = pd.Series(df)
    df = df.replace(regex='f_.*', value=0)
    df = df.replace(regex='t_.*', value=1)
    return df.to_numpy()

def dictionary(title, arr):
    dic = {
    "algoritmo": title,
    "media":  np.mean(arr),
    "std":  np.std(arr),
    "media harmonica": hmean(arr)
    }
    return dic

In [14]:
def knn(X, Y, train, test):
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X.iloc[train], Y.iloc[train])
    knn_pred = knn.predict(X.iloc[test])
    acc = accuracy(knn_pred, Y.iloc[test])
    return acc

def nb(X, Y, train, test):
    nb_normal = GaussianNB()
    nb_normal.fit(X.iloc[train], Y.iloc[train])
    nb_pred = nb_normal.predict(X.iloc[test])
    acc = accuracy(nb_pred, Y.iloc[test])
    return acc


In [15]:
#general functions

def accuracy(predicted, real):
    return (np.sum(predicted == real) / len(real)) * 100


In [16]:
def run(df_original):
    knn_acc = []
    nb_acc  = []
    nbk_acc = []
    nbk_acc2 = []
    fold = []

    index = 1
    kf = KFold(n_splits=5)
    
    for train, test in kf.split(df_original):
        print("_____{}_____".format(index))
        df = df_original.sample(frac=1, random_state=1)
        
        X, Y = attr_class(df, column)
        
        acc1 = knn(X, Y, train, test)
        knn_acc.append(acc1)

        acc2 = nb(X, Y, train, test)
        nb_acc.append(acc2)
        
        df_meta = df_metaclass_func(df_original.iloc[train])
        df_meta = df_meta.sample(frac=1, random_state=1)

        X_meta, Y_meta = attr_class(df_meta, 'meta_class')
        
        nb_normal = GaussianNB()
        nb_normal.fit(X_meta, Y_meta)
        nb_pred = nb_normal.predict(X.iloc[test])
        nb_pred_binary = back_values(nb_pred)
        acc3 = accuracy(nb_pred_binary, Y.iloc[test])
        nbk_acc2.append(acc3)

        fold.append({
            "fold": index,
            'knn acc': acc1,
            'nb acc': acc2,
            'kmeans_nb acc': acc3
        })
        index = index+1

    results = []

    results.append(dictionary('1-NN', knn_acc))
    results.append(dictionary("NB (sem KMeans)", nb_acc))
    results.append(dictionary("NB clean(com KMeans)", nbk_acc2))


    accuracies = pd.DataFrame(fold)
    df_result = pd.DataFrame(results)
    return df_result

In [17]:
df_res = run(df)


_____1_____
True
(107, 21)
True: 3
False
(310, 21)
False: 4
_____2_____
True
(107, 21)
True: 3
False
(310, 21)
False: 4
_____3_____
True
(107, 21)
True: 3
False
(311, 21)
False: 3
_____4_____
True
(104, 21)
True: 3
False
(314, 21)
False: 4
_____5_____
True
(3, 21)
True: 3
False
(415, 21)
False: 4


In [18]:
df_res

Unnamed: 0,algoritmo,media,std,media harmonica
0,1-NN,80.46337,4.818864,80.168203
1,NB (sem KMeans),83.708791,4.996591,83.385496
2,NB clean(com KMeans),81.985348,3.159855,81.860995
