In [1]:
import numpy as np
import pandas as pd

import io
import urllib.request
from scipy.io import arff

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold



In [None]:
url = 'http://promise.site.uottawa.ca/SERepository/datasets/kc1.arff'
ftpstream = urllib.request.urlopen(url)
data, meta = arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
df = pd.DataFrame(data)
df = df.sample(frac=1, random_state=20)

#Transforming in boolean class
df['defects'] = df['defects'].apply(str).str.replace("b|'", '')
df['defects'] = df['defects'].apply(lambda x: 1 if x == 'false' else -1)

#Normalizing values
df

In [None]:
def attr_class(df):
    Y_df = df['defects']
    X_df = df.drop(columns=['defects'])
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_scaled = scaler.fit_transform(X_df)
    X_df = pd.DataFrame(x_scaled)
    return X_df, Y_df

In [None]:
class NaiveBayes:
    def __init__(self, nu=0.01):
        self.nu = nu
    
    def fit(self, X):
        n_samples, n_features = X.shape
        self._classes = 1
        n_classes = 1
        
        self._mean = np.zeros(n_features, dtype=np.float64)
        self._var  = np.zeros(n_features, dtype=np.float64)
        
        self._mean = X.mean(axis=0)
        self._var  = X.var(axis=0)
        self._priors = self.nu 
        
    def predict(self, X):
        y_pred = []
        for _, row in X.iterrows():
            y = self._predict(row)
            y_pred.append(y)
        return np.array(y_pred)
    
    def _predict(self, x):
        prior = np.log(self._priors)
        posterior = np.sum(np.log(self._pdf(x)))
        posterior = prior + posterior
        return 1 if posterior > 0 else -1
    
    def _pdf(self, x):
        mean = self._mean[1]
        var  = self._var[1]
        numerator   = np.exp(- (x-mean)**2 / (2* var))
        denominator = np.sqrt(2 * np.pi * var)
        return (numerator/denominator)
        

In [None]:
def run(df):
    kf = KFold(n_splits=5)
    tamanhos = [0.3, 0.4, 0.5]
    nu = len(df[df['defects']== 1]) / len(df)
    for tamanho in tamanhos:
        accuracies = []
        for train, test in kf.split(df):
            
            _train = int(len(train) * tamanho)
            _test = len(train) - _train
            
            df_res   = df.iloc[train]
            df_train = df_res.head(_train)
            df_valid = df_res.tail(_test)
            df_test  = df.iloc[test]   
            
            df_train = df_train[df_train['defects'] == 1]
            
            df_train_full = pd.concat([df_valid, df_test])
            
            X_train, Y_train = attr_class(df_train)
            X_valid, Y_valid = attr_class(df_valid)
            X_test, Y_test = attr_class(df_train_full)
            

            clf = NaiveBayes(nu)
            clf.fit(X_train)
            predictions = clf.predict(X_test)
            acc = (np.sum(predictions == Y_test) / len(df_train_full)) * 100
            accuracies.append(acc)
        print('ruido: {}%'.format(tamanho))
        print(df_train.shape)
        print(np.mean(accuracies))
        
run(df)

ruido: 0.3%
(96, 22)
80.45340050377834
ruido: 0.4%
(129, 22)
80.92324734926413
ruido: 0.5%
(166, 22)
80.91756374514152