In [1]:
import numpy as np
import pandas as pd

import io
import urllib.request
from scipy.io import arff

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

from sklearn.metrics import confusion_matrix


In [2]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc2.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
df = pd.DataFrame(data)
df = df.sample(frac=1, random_state=20)

#Transforming in boolean class
df['problems'] = df['problems'].apply(str).str.replace("b|'", '')
df['problems'] = df['problems'].apply(lambda x: 1 if x == 'no' else -1)

#Normalizing values
df

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,lOCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,problems
95,8.0,1.0,1.0,1.0,1.0,0.00,0.00,0.00,0.00,0.00,...,1.0,0.0,0.0,0.0,4.0,4.0,4.0,6.0,1.0,1
56,26.0,4.0,1.0,3.0,53.0,249.12,0.11,9.00,27.68,2242.11,...,19.0,0.0,3.0,0.0,13.0,13.0,35.0,18.0,7.0,1
121,8.0,1.0,1.0,1.0,10.0,30.00,0.30,3.33,9.00,100.00,...,4.0,0.0,1.0,0.0,5.0,3.0,6.0,4.0,1.0,1
423,130.0,14.0,6.0,11.0,336.0,1951.27,0.05,22.00,88.69,42927.97,...,104.0,4.0,18.0,2.0,14.0,42.0,204.0,132.0,27.0,-1
29,4.0,1.0,1.0,1.0,6.0,15.51,0.40,2.50,6.20,38.77,...,2.0,0.0,0.0,0.0,5.0,1.0,5.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,4.0,1.0,1.0,1.0,4.0,8.00,0.67,1.50,5.33,12.00,...,2.0,0.0,0.0,0.0,3.0,1.0,3.0,1.0,1.0,1
223,6.0,3.0,1.0,1.0,24.0,88.81,0.21,4.71,18.84,418.68,...,4.0,0.0,0.0,0.0,6.0,7.0,13.0,11.0,5.0,1
271,4.0,1.0,1.0,1.0,5.0,11.61,0.67,1.50,7.74,17.41,...,2.0,0.0,0.0,0.0,3.0,2.0,3.0,2.0,1.0,1
474,104.0,17.0,5.0,11.0,356.0,2159.53,0.04,22.27,96.95,48102.42,...,86.0,5.0,11.0,0.0,16.0,51.0,214.0,142.0,33.0,-1


In [3]:
def attr_class(df):
    Y_df = df['problems']
    X_df = df.drop(columns=['problems'])
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_scaled = scaler.fit_transform(X_df)
    X_df = pd.DataFrame(x_scaled)
    return X_df, Y_df

In [8]:
class OneClassNB:
    
    def fit(self, X):
        n_samples, n_features = X.shape
        self._classes = 1
       
        self._mean = X.mean(axis=0)
        self._var  = X.var(axis=0)
        
    def predict(self, X):
        y_pred = []
        for _, row in X.iterrows():
            y = self._predict(row)
            y_pred.append(y)
        return np.array(y_pred)        
    
    def _predict(self, x):
        proba = np.sum(np.log(self._pdf(x)))
        return 1 if proba > 0 else -1
    
    def _pdf(self, x):
        mean = self._mean
        var  = self._var
        numerator   = np.exp(- (x-mean)**2 / (2* var))
        denominator = np.sqrt(2 * np.pi * var)
        return (numerator/denominator)
        

In [9]:
def _precision(tp, fp):
    return tp/(tp + fp)

def _recall(tp, p):
    return tp/p

def _fmeasure(tp, fp, p):
    precision   = _precision(tp, fp)
    recall      = _recall(tp, p)
    denominator = (1/precision) + (1/recall)
    return 2/ denominator

def _accuracy(predicted, real):
    (np.sum(predicted == real) / len(real)) * 100

In [16]:
def run(df):
    kf = KFold(n_splits=5)
    tamanhos = [0.3, 0.4, 0.5]
    results = []
    
    for tamanho in tamanhos:
        accuracies     = []
        true_positive  = []
        false_positive = []
        f1_measure     = []
        tp_arr = []
        fp_arr = []
        tn_arr = []
        fn_arr = []
        idx = 1
        for train, test in kf.split(df):
            
            _train = int(len(train) * tamanho)
            _test = len(train) - _train
            
            df_res   = df.iloc[train]
            
            df_bug = df_res[df_res['problems'] == -1]
            df_res = df_res[df_res['problems'] == 1]

            df_train = df_res.head(_train)
            df_valid = df_res.tail(_test)
            df_test  = df.iloc[test]   
            
            
            df_train_full = pd.concat([df_valid, df_test, df_bug])
            
            X_train, Y_train = attr_class(df_train)
            X_test, Y_test = attr_class(df_train_full)
            

            clf = OneClassNB()
            clf.fit(X_train)
            predictions = clf.predict(X_test)
            
            tn, fp, fn, tp = confusion_matrix(Y_test, predictions).ravel()
            
            p = len(Y_test[Y_test == 1])
            n = len(Y_test[Y_test == -1])
            
            tp_rate = tp/p
            fp_rate = fp/n
            true_positive.append(tp_rate)
            false_positive.append(fp_rate)
            
            tp_arr.append(tp)
            fp_arr.append(fp)
            tn_arr.append(tn)
            fn_arr.append(fn)
            
            f1 = _fmeasure(tp, fp, p)
            f1_measure.append(f1)
            
            acc = (np.sum(predictions == Y_test) / len(Y_test))
            accuracies.append(acc)
            idx += 1
            
            
        temp= {
            'train size': "{}%".format(tamanho*100),
            'accuracy': np.mean(accuracies) * 100,
            'tp rate': np.mean(true_positive) * 100,
            'fp rate': np.mean(false_positive) * 100,
            'f1 measure': np.mean(f1_measure) * 100,
            'tp sum': np.sum(tp_arr),
            'tn sum': np.sum(tn_arr),
            'fp sum': np.sum(fp_arr),
            'fn sum': np.sum(fn_arr),
        }

        results.append(temp)
    return pd.DataFrame(results)

result = run(df)
result

Unnamed: 0,train size,accuracy,tp rate,fp rate,f1 measure,tp sum,tn sum,fp sum,fn sum
0,30.0%,78.822162,99.893474,95.140187,88.012444,1876,26,509,2
1,40.0%,76.82427,99.641045,94.392523,86.688374,1664,30,505,6
2,50.0%,74.283774,99.520463,94.579439,84.994072,1453,29,506,7
