In [66]:
import numpy as np
import pandas as pd

import math
import io
import urllib.request
from scipy.io import arff

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

In [67]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc2.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
df = pd.DataFrame(data)
column = 'problems'

#Transforming in boolean class
df[column] = df[column].apply(str).str.replace("b|'", '')
df[column] = df[column].apply(lambda x: 1 if x == 'yes' else 0)

#Normalizing values
df

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,lOCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,problems
0,1.1,1.4,1.4,1.4,1.3,1.30,1.30,1.30,1.30,1.30,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,0
1,1.0,1.0,1.0,1.0,1.0,1.00,1.00,1.00,1.00,1.00,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
2,415.0,59.0,50.0,51.0,1159.0,8411.31,0.01,103.53,81.24,870848.58,...,359.0,35.0,9.0,10.0,47.0,106.0,692.0,467.0,106.0,1
3,230.0,33.0,10.0,16.0,575.0,3732.82,0.03,39.82,93.74,148644.06,...,174.0,15.0,34.0,5.0,23.0,67.0,343.0,232.0,65.0,1
4,175.0,26.0,12.0,13.0,500.0,3123.96,0.03,29.48,105.96,92103.07,...,142.0,7.0,19.0,4.0,18.0,58.0,310.0,190.0,51.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,4.0,1.0,1.0,1.0,5.0,11.61,0.50,2.00,5.80,23.22,...,2.0,0.0,0.0,0.0,4.0,1.0,4.0,1.0,1.0,1
518,4.0,1.0,1.0,1.0,4.0,8.00,0.67,1.50,5.33,12.00,...,2.0,0.0,0.0,0.0,3.0,1.0,3.0,1.0,1.0,1
519,4.0,1.0,1.0,1.0,4.0,8.00,0.67,1.50,5.33,12.00,...,2.0,0.0,0.0,0.0,3.0,1.0,3.0,1.0,1.0,1
520,4.0,1.0,1.0,1.0,5.0,11.61,0.67,1.50,7.74,17.41,...,2.0,0.0,0.0,0.0,3.0,2.0,3.0,2.0,1.0,1


In [68]:
df_true = df.loc[df[column] == True]
df_true =  df_true.drop(columns=column)

df_false = df.loc[df[column]== False]
df_false = df_false.drop(columns=column)


In [69]:
import plotly.express as px

def plot(wcss, title):
    
    df = pd.DataFrame(wcss, columns=['dist'])
    df['clusters'] = [2,3,4,5,6]
    fig = px.line(df, x = 'clusters', y='dist', title = title)
    fig.show()


def silhouette(kmeans, df):
    labels = kmeans.labels_
    score = silhouette_score(df, labels, metric='euclidean')
    return score

In [70]:
result  = []
elbow_true = []
elbow_false = []


for k in [2,3,4,5,6]:
    
    
    kmeans = KMeans(n_clusters=k, random_state=0)
    predict_true = kmeans.fit_predict(df_true)
    elb_true = kmeans.inertia_
    elbow_true.append(elb_true)
    sil_true = silhouette(kmeans, df_true)
    

    kmeans = KMeans(n_clusters=k, random_state=20)
    predict_false = kmeans.fit_predict(df_false)
    elb_false = kmeans.inertia_
    elbow_false.append(elb_false)
    sil_false = silhouette(kmeans, df_false)
    
    result.append({ 'k': k,
                    'silhouette_true': sil_true,
                    'silhouette_false': sil_false})
    
    df_false['meta_class'] = predict_false




In [71]:
plot(elbow_true, "elbow - True")

plot(elbow_false, "elbow - False")

In [72]:
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,k,silhouette_true,silhouette_false
0,2,0.940079,0.932693
1,3,0.923942,0.865007
2,4,0.739896,0.851099
3,5,0.742481,0.805772
4,6,0.675257,0.80562


In [73]:
kmeans = KMeans(n_clusters=3, random_state=0)
predict_true = kmeans.fit_predict(df_true)
df_true['meta_class'] = predict_true

kmeans = KMeans(n_clusters=2, random_state=0)
predict_false = kmeans.fit_predict(df_false)
df_false['meta_class'] = predict_false

In [74]:
df_false['meta_class'] = "f_" + df_false['meta_class'].astype(str)
df_true['meta_class'] = "t_" + df_true['meta_class'].astype(str)


In [75]:
df_res = pd.concat([df_false, df_true])

df_res = df_res.sample(frac=1, random_state=20)
df = df.sample(frac=1, random_state=20)


In [76]:
df_res['meta_class'].unique()

array(['f_0', 't_0', 'f_1', 't_1', 't_2'], dtype=object)

In [77]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [78]:
def attr_class(df, column):
    Y_df = df[column]
    X_df = df.drop(columns=[column])
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_scaled = scaler.fit_transform(X_df)
    X_df = pd.DataFrame(x_scaled)
    return X_df, Y_df


def accuracy(predicted, real):
    return (np.sum(predicted == real) / len(real)) * 100


In [79]:
knn_acc = []
nb_acc  = []
nbk_acc = []


kf = KFold(n_splits=5)

for train, test in kf.split(df_res):
    X, Y = attr_class(df, column)
    
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X.iloc[train], Y.iloc[train])
    knn_pred = knn.predict(X.iloc[test])
    acc = accuracy(knn_pred, Y.iloc[test])
    knn_acc.append(acc)
    
    nb_normal = GaussianNB()
    nb_normal.fit(X.iloc[train], Y.iloc[train])
    nb_pred = nb_normal.predict(X.iloc[test])
    acc = accuracy(nb_pred, Y.iloc[test])
    nb_acc.append(acc)
    
    X, Y = attr_class(df_res, "meta_class")    
    nb_k = GaussianNB()
    nb_k.fit(X.iloc[train], Y.iloc[train])
    nbk_pred = nb_k.predict(X.iloc[test])
    acc = accuracy(nbk_pred, Y.iloc[test])
    nbk_acc.append(acc)
    
    

results = []

results.append({
    "algoritmo": "1-NN",
    "acuracia":  np.mean(knn_acc)
})

results.append({
    "algoritmo": "NB (sem KMeans)",
    "acuracia":  np.mean(nb_acc)
})

results.append({
    "algoritmo": "NB (com KMeans)",
    "acuracia":   np.mean(nbk_acc)
})


df_result = pd.DataFrame(results)
df_result

Unnamed: 0,algoritmo,acuracia
0,1-NN,79.893773
1,NB (sem KMeans),83.335165
2,NB (com KMeans),79.501832
