In [1]:
import numpy as np
import pandas as pd

import io
import urllib.request
from scipy.io import arff

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

from sklearn.metrics import confusion_matrix


In [2]:
# Download the KC1 dataset (ARFF format) and parse it into a DataFrame.
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc1.arff'
# The context manager closes the HTTP response even if reading or decoding fails.
with urllib.request.urlopen(url) as ftpstream:
    data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
df = pd.DataFrame(data)
# Shuffle all rows once with a fixed seed so the later splits are reproducible.
df = df.sample(frac=1, random_state=20)

# Encode the class label as 1 for 'false' and -1 for anything else.
# Byte values are decoded explicitly instead of regex-stripping the "b'...'"
# repr: Series.str.replace treats its pattern literally by default on
# pandas >= 2.0, which would leave the repr intact and map every row to -1.
df['defects'] = df['defects'].map(
    lambda value: value.decode('utf-8') if isinstance(value, bytes) else str(value)
)
df['defects'] = df['defects'].apply(lambda x: 1 if x == 'false' else -1)

# Display the shuffled, label-encoded frame (feature scaling is done later by attr_class).
df

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
590,22.0,2.0,1.0,2.0,56.0,253.32,0.14,7.07,35.82,1791.33,...,16.0,1.0,2.0,0.0,9.0,14.0,34.0,22.0,3.0,1
307,23.0,1.0,1.0,1.0,46.0,218.72,0.32,3.14,69.59,687.42,...,15.0,0.0,2.0,0.0,6.0,21.0,24.0,22.0,1.0,-1
993,5.0,1.0,1.0,1.0,4.0,8.00,1.00,1.00,8.00,8.00,...,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,1.0,1
1838,17.0,2.0,1.0,2.0,37.0,159.91,0.17,6.00,26.65,959.47,...,14.0,0.0,0.0,0.0,10.0,10.0,25.0,12.0,3.0,1
362,31.0,2.0,1.0,2.0,104.0,529.10,0.13,7.74,68.36,4095.20,...,21.0,1.0,4.0,0.0,9.0,25.0,61.0,43.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,60.0,11.0,5.0,11.0,155.0,795.04,0.05,18.38,43.27,14608.84,...,50.0,4.0,4.0,0.0,15.0,20.0,106.0,49.0,21.0,1
1607,1.0,1.0,1.0,1.0,4.0,8.00,0.67,1.50,5.33,12.00,...,0.0,0.0,0.0,0.0,3.0,1.0,3.0,1.0,1.0,1
1814,4.0,1.0,1.0,1.0,5.0,11.61,0.50,2.00,5.80,23.22,...,2.0,0.0,0.0,0.0,4.0,1.0,4.0,1.0,1.0,1
1428,11.0,1.0,1.0,1.0,18.0,57.06,0.27,3.75,15.22,213.97,...,6.0,3.0,0.0,0.0,5.0,4.0,12.0,6.0,1.0,1


In [3]:
def attr_class(df):
    """Split a frame into min-max scaled features and the 'defects' label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'defects' column; every other column is treated
        as a feature.

    Returns
    -------
    (X_df, Y_df) : tuple
        X_df is a DataFrame of the features rescaled to [0, 1]; Y_df is the
        untouched 'defects' column.

    Notes
    -----
    The scaler is fitted on the frame passed in, so two calls on different
    frames use different min/max values.
    """
    Y_df = df['defects']
    X_df = df.drop(columns=['defects'])

    scaler = MinMaxScaler(feature_range=(0, 1))
    x_scaled = scaler.fit_transform(X_df)
    # Keep the original row index and column names: the scaler returns a bare
    # ndarray, and rebuilding the frame without them would leave X_df on a
    # fresh 0..n-1 index that no longer lines up with Y_df's labels.
    X_df = pd.DataFrame(x_scaled, index=X_df.index, columns=X_df.columns)
    return X_df, Y_df

In [4]:
class OneClassNB:
    """One-class Gaussian naive Bayes scorer.

    A single class is described by the per-feature mean and variance of the
    training frame. A sample is labelled 1 when the sum of its per-feature
    Gaussian log-densities exceeds ``threshold`` and -1 otherwise.

    Parameters
    ----------
    threshold : float, default 0
        Cut-off applied to the summed log-density.
    """

    # Class-level default so instances created without __init__ still work.
    threshold = 0

    def __init__(self, threshold=0):
        self.threshold = threshold

    def fit(self, X):
        """Estimate per-feature mean and variance from the DataFrame ``X``.

        Returns ``self`` so calls can be chained.
        """
        self._classes = 1

        self._mean = X.mean(axis=0)
        variance = X.var(axis=0)
        # Features whose variance is zero (or undefined) carry no usable
        # Gaussian; mark them NaN so they are explicitly left out of the
        # score instead of producing 0/0 divisions at prediction time.
        self._var = variance.where(variance > 0)
        return self

    def predict(self, X):
        """Label every row of the DataFrame ``X``; returns an array of 1 / -1."""
        # Row-wise sum; pandas skips NaN entries, i.e. the excluded features.
        scores = self._log_density(X).sum(axis=1)
        return np.where(scores.to_numpy() > self.threshold, 1, -1)

    def _predict(self, x):
        """Label a single sample given as a Series indexed by feature."""
        proba = self._log_density(x).sum()
        return 1 if proba > self.threshold else -1

    def _pdf(self, x):
        """Per-feature Gaussian density of ``x`` under the fitted parameters."""
        return np.exp(self._log_density(x))

    def _log_density(self, X):
        """Per-feature Gaussian log-density for a Series or DataFrame.

        Computed directly in log space so samples far from the mean give a
        large negative value rather than underflowing to log(0).
        """
        var = self._var
        return -((X - self._mean) ** 2) / (2 * var) - 0.5 * np.log(2 * np.pi * var)
        

In [5]:

def _precision(tp, fp):
    return tp/(tp + fp)

def _recall(tp, p):
    return tp/p

def _fmeasure(tp, fp, p):
    precision   = _precision(tp, fp)
    recall      = _recall(tp, p)
    denominator = (1/precision) + (1/recall)
    return 2/ denominator

def _accuracy(predicted, real):
    (np.sum(predicted == real) / len(real)) * 100

In [6]:
def run(df, n_splits=5, train_fractions=(0.3, 0.4, 0.5)):
    """Cross-validate OneClassNB for several training-set sizes.

    For every fraction in ``train_fractions`` and every KFold split, the first
    ``fraction`` of the training fold (rows labelled 1 only) is used to fit
    the model; the remainder of the training fold plus the test fold form the
    evaluation set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'defects' column holding 1 / -1 labels.
    n_splits : int, default 5
        Number of KFold splits.
    train_fractions : iterable of float, default (0.3, 0.4, 0.5)
        Fractions of each training fold used for fitting.

    Returns
    -------
    pandas.DataFrame
        One row per training fraction with mean accuracy, TP rate, FP rate
        and F1 (all in percent) plus the summed confusion-matrix counts.
    """
    kf = KFold(n_splits=n_splits)
    results = []

    for tamanho in train_fractions:
        accuracies     = []
        true_positive  = []
        false_positive = []
        f1_measure     = []
        tp_arr = []
        fp_arr = []
        tn_arr = []
        fn_arr = []
        for train, test in kf.split(df):

            _train = int(len(train) * tamanho)

            df_res   = df.iloc[train]
            df_train = df_res.iloc[:_train]
            df_valid = df_res.iloc[_train:]
            df_test  = df.iloc[test]

            # One-class training: fit only on rows labelled 1.
            df_train = df_train[df_train['defects'] == 1]

            # Everything not used for fitting is evaluated.
            df_eval = pd.concat([df_valid, df_test])

            # NOTE(review): attr_class fits a separate scaler on each frame,
            # so training and evaluation features are normalised with
            # different min/max values. Left unchanged to keep results
            # comparable; consider fitting the scaler on the training data only.
            X_train, _ = attr_class(df_train)
            X_test, Y_test = attr_class(df_eval)

            clf = OneClassNB()
            clf.fit(X_train)
            predictions = clf.predict(X_test)

            # Passing the labels explicitly guarantees a 2x2 matrix even when
            # a fold contains (or the model predicts) only one class;
            # otherwise the four-way unpacking below would fail.
            tn, fp, fn, tp = confusion_matrix(
                Y_test, predictions, labels=[-1, 1]
            ).ravel()

            p = tp + fn  # actual positives (label 1)
            n = tn + fp  # actual negatives (label -1)

            # Guard the rates against folds with no positives / no negatives.
            tp_rate = tp / p if p else 0.0
            fp_rate = fp / n if n else 0.0
            true_positive.append(tp_rate)
            false_positive.append(fp_rate)

            tp_arr.append(tp)
            fp_arr.append(fp)
            tn_arr.append(tn)
            fn_arr.append(fn)

            f1 = _fmeasure(tp, fp, p)
            f1_measure.append(f1)

            # Positional comparison of predictions against the true labels.
            acc = np.mean(predictions == Y_test.to_numpy())
            accuracies.append(acc)

        temp = {
            'train size': "{}%".format(tamanho*100),
            'accuracy': np.mean(accuracies) * 100,
            'tp rate': np.mean(true_positive) * 100,
            'fp rate': np.mean(false_positive) * 100,
            'f1 measure': np.mean(f1_measure) * 100,
            'tp sum': np.sum(tp_arr),
            'tn sum': np.sum(tn_arr),
            'fp sum': np.sum(fp_arr),
            'fn sum': np.sum(fn_arr),
        }

        results.append(temp)
    return pd.DataFrame(results)

# Evaluate on the loaded dataset and display the per-training-size summary table.
result = run(df)
result

Unnamed: 0,train size,accuracy,tp rate,fp rate,f1 measure,tp sum,tn sum,fp sum,fn sum
0,30.0%,85.077979,97.938323,86.275255,91.750438,6651,168,1056,140
1,40.0%,85.280208,97.634989,84.134747,91.843824,5946,172,912,144
2,50.0%,85.005626,97.570004,84.086738,91.672355,5225,155,819,130
