In [487]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from Scripts.dirty_accuracy import injection
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Helper functions

Naive Bayes Classification implementation

In [488]:
def NBClassification(X_train, y_train, X_test, y_test):
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    training_score = nb.score(X_train, y_train)
    test_score = nb.score(X_test, y_test)
    return [training_score, test_score]

Decision Tree Classification implementation

In [489]:
def DTClassification(X_train, y_train, X_test, y_test):
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    training_score = dtc.score(X_train, y_train)
    test_score = dtc.score(X_test, y_test)
    return [training_score, test_score]

Data Classification implementation that turns all categorical columns into numerical values

In [490]:
def DCmap(df):
    dataset = df.copy()
    for col in dataset:
        if dataset[col].dtype == object:
            to_replace = dataset[col].unique()
            value = []
            for i in range (len(to_replace)):
                value.append(i)
            dataset[col] = dataset[col].replace(to_replace=to_replace, value=value)
    return dataset

Print scores

In [491]:
def printScores(scores):
    for score in scores:
        print("Dataset dirty at: ", score['dirty'], "%")
        print(score['scores'][0],':', score['scores'][1])

IRQ detection and correction

In [492]:
def IRQ(data):
    Q1, Q3 = np.percentile(data, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    outliers = data[((data < lower_range) | (data > upper_range))]
    for i in range(len(data)):
        if data[i] in outliers:
            data[i] = (lower_range + upper_range)/2
    return data

DBSCAN detection and correction

In [493]:
from scipy.spatial.distance import cdist


def KMeansDetectionColumn(data, col):
    km = KMeans(n_clusters=2)
    clusters = km.fit_predict(data)
    centroids = km.cluster_centers_
    points = np.empty((0,len(data[col])), float)
    distances = np.empty((0,len(data[col])), float)
    for i, center_elem in enumerate(centroids):
        distances = np.append(distances, cdist([center_elem],data[clusters == i], 'euclidean'))
        points = np.append(points, data[clusters == i], axis=0)
    percentile = 80
    outliers = points[np.where(distances > np.percentile(distances, percentile))]
    print(outliers)
    return data[col]

In [494]:
def KMeansDetection(data):
    columns = ["Sex","Housing","Saving_accounts","Checking_account","Purpose"]
    for col in data.columns:
        if col not in columns:
            columns.append(col)
            data[col] = KMeansDetectionColumn(data[columns], col)
    return data

## Pipeline implementation

Read the dataset from CSV file

In [495]:
german = pd.read_csv("datasets/german.csv", sep=',')
ss = StandardScaler()
literal_cols = ["Sex","Housing","Saving_accounts","Checking_account","Purpose", "Risk"]
numerical_cols = ["Age", "Job", "Credit_amount", "Duration"]
german_str = german[literal_cols]
german_num = german[numerical_cols]
german_num = pd.DataFrame(ss.fit_transform(german_num), columns=numerical_cols)
german = pd.concat([german_str, german_num], axis=1)

Perform the injections of outliers
and append the original clean dataset to the list

In [496]:
german_list = injection(df_pandas=german, seed=10, name='german', name_class='Risk')
german_list.append(german)
for i in range(len(german_list)):
    german_list[i] = DCmap(german_list[i])

saved german-accuracy50%
saved german-accuracy60%
saved german-accuracy70%
saved german-accuracy80%
saved german-accuracy90%


Reverse the list on order to have
0 - Original Dataset
1 - 10% Dirty dataset
2 - 20% Dirty dataset
3 - 30% Dirty dataset
4 - 40% Dirty dataset
5 - 50% Dirty dataset

In [497]:
german_list.reverse()

### Functions to be repeated for each dataframe

In [498]:
def computeScoresDT(dataframe_array):
    y = german['Risk']
    scores = []
    i = 0
    for df in dataframe_array:
        X = df.drop('Risk', axis=1, errors='ignore')
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
        sc = DTClassification(X_train, y_train, X_test, y_test)
        scores.append({'dirty': i, 'scores': sc})
        i = i + 10
    return scores

In [499]:
def computeScoresNB(dataframe_array):
    y = german['Risk']
    scores = []
    i = 0
    for df in dataframe_array:
        X = df.drop('Risk', axis=1, errors='ignore')
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
        sc = NBClassification(X_train, y_train, X_test, y_test)
        scores.append({'dirty': i, 'scores': sc})
        i = i + 10
    return scores

In [500]:
def cleanDatasetsIRQ(datasets):
    cleaned_list = []
    for df in datasets:
        cleaned = df.copy()
        for col in cleaned.columns:
            if col not in ["Sex","Housing","Saving_accounts","Checking_account","Purpose", "Risk"]:
                cleaned[col] = IRQ(cleaned[col].values)
        cleaned_list.append(cleaned)
    return cleaned_list

In [501]:
def cleanDatasetKMeans(datasets):
    cleaned_list = []
    for df in datasets:
        cleaned = df.copy()
        cleaned = cleaned.drop('Risk', axis=1)
        cleaned = KMeansDetection(cleaned)
        cleaned_list.append(cleaned)
    return cleaned_list

Compute scores

In [502]:
DTscores = computeScoresDT(german_list)
printScores(DTscores)

Dataset dirty at:  0 %
1.0 : 0.6466666666666666
Dataset dirty at:  10 %
1.0 : 0.6533333333333333
Dataset dirty at:  20 %
1.0 : 0.6
Dataset dirty at:  30 %
1.0 : 0.58
Dataset dirty at:  40 %
1.0 : 0.5966666666666667
Dataset dirty at:  50 %
1.0 : 0.6066666666666667


In [503]:
NBscore = computeScoresNB(german_list)
printScores(NBscore)

Dataset dirty at:  0 %
0.6885714285714286 : 0.7266666666666667
Dataset dirty at:  10 %
0.6657142857142857 : 0.6633333333333333
Dataset dirty at:  20 %
0.6914285714285714 : 0.6566666666666666
Dataset dirty at:  30 %
0.6814285714285714 : 0.6366666666666667
Dataset dirty at:  40 %
0.6942857142857143 : 0.67
Dataset dirty at:  50 %
0.6828571428571428 : 0.6666666666666666


Find column by column outliers using ZScore and DBSCAN and replace them with mean value

In [504]:
cleaned_zs_lists = cleanDatasetsIRQ(german_list)
cleaned_km_list = cleanDatasetKMeans(german_list)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1000 and the array at index 1 has size 6

Perform again the classification after outliers detection and print results

In [None]:
cleaned_DTscores_zs = computeScoresDT(cleaned_zs_lists)
cleaned_NBscores_zs = computeScoresNB(cleaned_zs_lists)
cleaned_DTscores_km = computeScoresDT(cleaned_km_list)
cleaned_NBscores_km = computeScoresNB(cleaned_km_list)

In [None]:
printScores(cleaned_DTscores_zs)

In [None]:
printScores(cleaned_NBscores_zs)

In [None]:
printScores(cleaned_DTscores_km)

In [None]:
printScores(cleaned_NBscores_km)

### Plot data preparation

Transform the score object array into an array of test scores

In [None]:
def toArray(score):
    t = []
    for s in score:
        t.append(s['scores'][1])
    return t

Preparation of plot DataFrame

In [None]:
plotData = {
    'dirty': [0, 10, 20, 30, 40, 50],
    'DTc_pre': toArray(DTscores),
    'NB_pre': toArray(NBscore),
    'DTc_zs_post': toArray(cleaned_DTscores_zs),
    'NB_zs_post': toArray(cleaned_NBscores_zs),
    'DTc_km_post': toArray(cleaned_DTscores_km),
    'NB_km_post': toArray(cleaned_NBscores_km)
}
plotDF = pd.DataFrame(plotData)

#### Comparison between pre and post detection scores for each combination of algorithms

In [None]:
sns.set()
fig, axes = plt.subplots(4, 1, sharex=False, figsize=(7,14))

sns.lineplot(ax=axes[0], x='dirty', y='DTc_pre', data=plotDF)
sns.lineplot(ax=axes[0], x='dirty', y='DTc_zs_post', data=plotDF)
axes[0].legend(['Pre detection', 'Post detection'])
axes[0].set(xlabel='Dirty percentage', ylabel='Performance')
axes[0].set_ylim(0.2,1.1)
axes[0].set_title('DTC + ZScore')

sns.lineplot(ax=axes[1], x='dirty', y='NB_pre', data=plotDF)
sns.lineplot(ax=axes[1], x='dirty', y='NB_zs_post', data=plotDF)
axes[1].legend(['Pre detection', 'Post detection'])
axes[1].set(xlabel='Dirty percentage', ylabel='Performance')
axes[1].set_ylim(0.2,1.1)
axes[1].set_title('NB + ZScore')

sns.lineplot(ax=axes[2], x='dirty', y='DTc_pre', data=plotDF)
sns.lineplot(ax=axes[2], x='dirty', y='DTc_km_post', data=plotDF)
axes[2].legend(['Pre detection', 'Post detection'])
axes[2].set(xlabel='Dirty percentage', ylabel='Performance')
axes[2].set_ylim(0.2,1.1)
axes[2].set_title('DTC + K-Means')

sns.lineplot(ax=axes[3], x='dirty', y='NB_pre', data=plotDF)
sns.lineplot(ax=axes[3], x='dirty', y='NB_km_post', data=plotDF)
axes[3].legend(['Pre detection', 'Post detection'])
axes[3].set(xlabel='Dirty percentage', ylabel='Performance')
axes[3].set_ylim(0.2,1.1)
axes[3].set_title('NB + K-Means')
fig.tight_layout()

#### Comparison between the two classification algorithms pre-detection and post-detection (for each outlier detection algorithm)

In [None]:
sns.set()
fig, axes = plt.subplots(3, 1, sharex=False, figsize=(7,18))

sns.lineplot(ax=axes[0], x='dirty', y='DTc_pre', data=plotDF)
sns.lineplot(ax=axes[0], x='dirty', y='NB_pre', data=plotDF)
axes[0].legend(['Pre detection DTC', 'Pre detection NB'])
axes[0].set(xlabel='Dirty percentage', ylabel='Performance')
axes[0].set_ylim(0.2,1.1)
axes[0].set_title('Pre detection performances')

sns.lineplot(ax=axes[1], x='dirty', y='DTc_zs_post', data=plotDF)
sns.lineplot(ax=axes[1], x='dirty', y='NB_zs_post', data=plotDF)
axes[1].legend(['Post detection DTC', 'Post detection NB'])
axes[1].set(xlabel='Dirty percentage', ylabel='Performance')
axes[1].set_ylim(0.2,1.1)
axes[1].set_title('Post detection (ZS) performances')

sns.lineplot(ax=axes[2], x='dirty', y='DTc_km_post', data=plotDF)
sns.lineplot(ax=axes[2], x='dirty', y='NB_km_post', data=plotDF)
axes[2].legend(['Post detection DTC', 'Post detection NB'])
axes[2].set(xlabel='Dirty percentage', ylabel='Performance')
axes[2].set_ylim(0.2,1.1)
axes[2].set_title('Post detection K-Means performances')

#### Comparison between detection algorithms for each post-detection one

In [None]:
sns.set()
fig, axes = plt.subplots(2, 1, sharex=False, figsize=(7,18))

sns.lineplot(ax=axes[0], x='dirty', y='DTc_zs_post', data=plotDF)
sns.lineplot(ax=axes[0], x='dirty', y='DTc_km_post', data=plotDF)
axes[0].legend(['Z-Score', 'K-Means'])
axes[0].set(xlabel='Dirty percentage', ylabel='Performance')
axes[0].set_ylim(0.2,1.1)
axes[0].set_title('DTC Post-detection performances')

sns.lineplot(ax=axes[1], x='dirty', y='NB_zs_post', data=plotDF)
sns.lineplot(ax=axes[1], x='dirty', y='NB_km_post', data=plotDF)
axes[1].legend(['Z-Score', 'K-Means'])
axes[1].set(xlabel='Dirty percentage', ylabel='Performance')
axes[1].set_ylim(0.2,1.1)
axes[1].set_title('NB Post-detection performances')