In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import math 
import matplotlib.pyplot as plt
import pandas as pd

#%pip install seaborn
import seaborn as sns


In [2]:
def non_null_count(df: pd.DataFrame):
    """Plot and print the percentage of non-null values for every column.

    When ``df`` contains at least one null, the per-column non-null
    percentage is computed, columns that are entirely null (0 %) are dropped,
    and the rest is shown as a horizontal bar chart and printed as a markdown
    table, sorted from most to least complete.

    When ``df`` contains no nulls at all, only a short notice is printed.

    Args:
        df: Frame to inspect.

    Returns:
        None. Output goes to the active matplotlib backend and to stdout.
    """
    if df.isnull().sum().sum() == 0:
        print('No missing data found')
        return

    # 100 - missing% == non-null%, so the column is labelled accordingly.
    non_null_pct = 100 - (df.isnull().sum() / len(df)) * 100
    non_null_pct = non_null_pct[non_null_pct != 0].sort_values(ascending=False)

    non_null_table = pd.DataFrame({'Non-Null Ratio %': non_null_pct})
    non_null_table.plot(kind="barh", figsize=(10, 40))
    plt.show()
    print(non_null_table.to_markdown())


In [3]:
def field_analysis_float(buckets_count, df_dt, baseField, tmpFieldName, exclude_percentile_offset=-1):
    """Histogram a continuous field and show the positive rate per bucket.

    Draws two panels (bucket counts, and share of rows with
    ``baseField == 1`` per bucket) and prints a markdown table with the same
    numbers, plus the Pearson correlation between the two fields computed on
    the full frame.

    Args:
        buckets_count: Number of bin *edges*; the histogram has
            ``buckets_count - 1`` bins.
        df_dt: Source DataFrame.
        baseField: Name of the target column; rows equal to 1 are positives.
        tmpFieldName: Name of the column to analyse.
        exclude_percentile_offset: When > 0, rows at or outside the
            ``offset`` / ``100 - offset`` percentiles are excluded from the
            histograms. Values <= 0 disable trimming.

    Returns:
        None. Nothing is done when ``baseField == tmpFieldName``.
    """
    if baseField == tmpFieldName:
        return

    plt.figure(figsize=(20, 10))

    column = df_dt[tmpFieldName]
    keep = np.ones(len(df_dt), dtype=bool)

    if exclude_percentile_offset > 0:
        # nanpercentile: a single missing value must not turn both bounds into NaN.
        p1 = np.nanpercentile(column, exclude_percentile_offset, method='midpoint')
        p99 = np.nanpercentile(column, 100-exclude_percentile_offset, method='midpoint')
        print('Range excluding outliers (',100-2*exclude_percentile_offset,'%):',p1,'->',p99)
        inside = ((column > p1) & (column < p99)).fillna(False).to_numpy(dtype=bool)

        if inside.any():
            keep = inside
        else:
            print("Error Calculating Outliers, rollback to full data")

    # The same row selection feeds both histograms so the ratio is consistent.
    is_positive = (df_dt[baseField] == 1).fillna(False).to_numpy(dtype=bool)
    localvalues = column[keep].dropna()
    positives = column[keep & is_positive].dropna()

    print("---------------------------------------------------------")
    print("CORRELATION: ", tmpFieldName, df_dt[baseField].corr(df_dt[tmpFieldName]))

    # Count on exact edges; rounding the edges first could push the first or
    # last edge past the real min/max and drop values. Round for display only.
    bins_tmp = np.linspace(localvalues.min(), localvalues.max(), buckets_count)
    upper_labels = np.around(bins_tmp[1:], decimals=2)

    f_hist = np.histogram(localvalues, bins=bins_tmp)

    plt.subplot(1, 2, 1)
    plt.xticks(rotation=90, fontsize=5)
    plt.bar(x=upper_labels.astype(str), height=f_hist[0])

    p_hist = np.histogram(positives, bins=bins_tmp)
    # Empty buckets stay NaN instead of raising a 0/0 RuntimeWarning.
    p_factor = np.divide(
        p_hist[0],
        f_hist[0],
        out=np.full(len(f_hist[0]), np.nan),
        where=f_hist[0] > 0,
    )

    plt.subplot(1, 2, 2)
    plt.scatter(upper_labels, p_factor, marker = ".")

    plt.show()

    df_table = pd.DataFrame({"upper": upper_labels, "total":f_hist[0], "posi": p_hist[0], "%": p_factor})

    # Kept from the original: this changes numpy's global print options.
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    print(df_table.to_markdown())


In [4]:
def field_analysis_category(df_dt, baseField, tmpFieldName):
    """Show category frequencies and the positive rate per category.

    Draws a bar chart of category counts and a scatter of the share of rows
    with ``baseField == 1`` per category, then prints the factorized Pearson
    correlation, the counts, and a (category, count, rate) table.

    Args:
        df_dt: Source DataFrame.
        baseField: Name of the target column; rows equal to 1 are positives.
        tmpFieldName: Name of the categorical column to analyse.

    Returns:
        None. Nothing is done when the column has no countable values.
    """
    counts_t = df_dt[tmpFieldName].value_counts()
    if len(counts_t) == 0:
        return

    plt.subplot(1, 2, 1)
    counts_t.plot(kind='bar')

    corr = df_dt[[baseField, tmpFieldName]].apply(lambda x : pd.factorize(x)[0]).corr(method='pearson', min_periods=1)
    print("---------------------------------------------------------")
    print("CORRELATION: ", corr)
    print("Categories Count: ", counts_t)

    # Categories, totals and rates all come from value_counts() so the three
    # columns of the final table describe the same category on each row.
    # Zero-count entries (unobserved levels of a categorical dtype) are
    # skipped: they have no rate.
    observed = counts_t[counts_t > 0]
    cat_list = observed.index.to_numpy()
    cat_total = observed.to_numpy()

    is_positive = df_dt[baseField] == 1
    cat_rate = []
    for u_cat, total in zip(cat_list, cat_total):
        hits = ((df_dt[tmpFieldName] == u_cat) & is_positive).sum()
        cat_rate.append(hits / total)

    plt.subplot(1, 2, 2)
    plt.xticks(rotation=90)
    plt.scatter(cat_list.astype(str), cat_rate, marker = ".")
    plt.show()

    # Kept from the original: this changes numpy's global print options.
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    print(np.array((cat_list, cat_total, cat_rate)).T)

In [5]:
def categories_unique_count(df_dt):
    """Print, as a markdown table, the number of distinct values of every
    ``category``-dtype column of ``df_dt``, largest first."""
    only_categoricals = df_dt.select_dtypes(include=["category"])
    distinct_per_column = only_categoricals.apply(pd.Series.nunique)
    ranked = distinct_per_column.sort_values(ascending=False)
    print(ranked.to_markdown())

In [6]:
def field_analysis_integer(buckets_count, df_dt, baseField, tmpFieldName, exclude_percentile_offset=-1):
    """Histogram an integer-valued field and show the positive rate per bucket.

    Missing values are placed in their own bucket by substituting
    ``min - 1``. Draws two panels (bucket counts, and share of rows with
    ``baseField == 1`` per bucket) and prints a markdown table with the same
    numbers.

    Args:
        buckets_count: Upper limit for the number of bin *edges*; it is
            reduced to ``max - min + 1`` when the value range is smaller.
        df_dt: Source DataFrame. It is not modified.
        baseField: Name of the target column; rows equal to 1 are positives.
        tmpFieldName: Name of the column to analyse.
        exclude_percentile_offset: When > 0, rows at or outside the
            ``offset`` / ``100 - offset`` percentiles are excluded. Rows with
            a missing value fail that comparison and are excluded too.
            Values <= 0 disable trimming.

    Returns:
        None. Nothing is done when ``baseField == tmpFieldName``.
    """
    if baseField == tmpFieldName:
        return

    plt.figure(figsize=(20, 10))

    column = df_dt[tmpFieldName]
    keep = np.ones(len(df_dt), dtype=bool)

    if exclude_percentile_offset > 0:
        # nanpercentile: a single missing value must not turn both bounds into NaN.
        p1 = np.nanpercentile(column, exclude_percentile_offset, method='midpoint')
        p99 = np.nanpercentile(column, 100-exclude_percentile_offset, method='midpoint')
        print('Range excluding outliers (',100-2*exclude_percentile_offset,'%):',p1,'->',p99)
        inside = ((column > p1) & (column < p99)).fillna(False).to_numpy(dtype=bool)

        if inside.any():
            keep = inside
        else:
            print("Error Calculating Outliers, rollback to full data")

    # The same row selection feeds both histograms so the ratio is consistent.
    is_positive = (df_dt[baseField] == 1).fillna(False).to_numpy(dtype=bool)
    localvalues = column[keep]
    positives = column[keep & is_positive]

    local_min = localvalues.min()
    nullValue = local_min-1
    print("NullValue: ", nullValue)
    print("NullCount: ", localvalues.isna().sum())

    # fillna returns new Series: the caller's DataFrame must stay untouched.
    # Positives get the same substitution so the null bucket can count them.
    localvalues = localvalues.fillna(nullValue)
    positives = positives.fillna(nullValue)

    local_buckets_count = int(localvalues.max() - localvalues.min()+1)
    buckets_count = min(buckets_count, local_buckets_count)

    print('#Buckets : ', buckets_count)

    # NOTE(review): linspace yields `buckets_count` edges, i.e. one bin fewer
    # than the number printed above - confirm whether that is intended.
    bins_tmp = np.linspace(localvalues.min(), localvalues.max(), buckets_count)
    bins_tmp = np.around(bins_tmp, decimals=0)

    f_hist = np.histogram(localvalues, bins=bins_tmp)

    plt.subplot(1, 2, 1)
    plt.xticks(rotation=90, fontsize=5)
    plt.bar(x=f_hist[1][1:].astype(str), height=f_hist[0])

    p_hist = np.histogram(positives, bins=bins_tmp)
    # Empty buckets stay NaN instead of raising a 0/0 RuntimeWarning.
    p_factor = np.divide(
        p_hist[0],
        f_hist[0],
        out=np.full(len(f_hist[0]), np.nan),
        where=f_hist[0] > 0,
    )

    plt.subplot(1, 2, 2)
    plt.scatter(f_hist[1][1:], p_factor, marker = ".")

    plt.show()

    df_table = pd.DataFrame({"upper": f_hist[1][1:], "total":f_hist[0], "posi": p_hist[0], "%": p_factor})

    # Kept from the original: this changes numpy's global print options.
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    print(df_table.to_markdown())


In [7]:
def export_prediction(y_test, y_predicted_proba):
    """Select the positive-class scores from ``y_predicted_proba``.

    A 1-D input is taken as is; otherwise column 1 is taken. ``y_test`` is
    not used.

    NOTE(review): the selected scores are neither returned nor written
    anywhere, so the function currently always returns None - confirm what
    the export step was meant to do.
    """
    is_flat = y_predicted_proba.ndim == 1
    y_predicted_f = y_predicted_proba if is_flat else y_predicted_proba[:,1]
    

In [8]:
from math import pi
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score

def evaluate_metrics(y_test, y_predicted_proba, threshold):
    """Print and plot binary-classification metrics at a decision threshold.

    Shows a radar chart of accuracy, precision, recall, TNR, F1, MCC and AUC
    (all scaled to 0-100), a confusion-matrix plot, and prints the raw values.

    Args:
        y_test: True labels, expected to be 0/1 (or boolean).
        y_predicted_proba: Positive-class scores, either 1-D or 2-D with the
            positive class in column 1.
        threshold: Scores ``>= threshold`` are predicted positive.

    Returns:
        None.
    """
    scores = np.asarray(y_predicted_proba)
    if scores.ndim != 1:
        scores = scores[:, 1]
    y_predicted = (scores >= threshold).astype(int)

    # Fixing the label set keeps the matrix 2x2 even when y_test or the
    # predictions contain a single class, so the 4-way unpacking cannot fail.
    cm = confusion_matrix(y_test, y_predicted, labels=[0, 1])

    tn, fp, fn, tp = cm.ravel()

    def _ratio(numerator, denominator):
        # An undefined ratio (empty denominator) is reported as 0.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(tp, tp+fp)
    recall = _ratio(tp, tp+fn)
    tnr = _ratio(tn, tn+fp)
    accuracy = _ratio(tp+tn, tp+tn+fp+fn)
    f1 = _ratio(2*precision*recall, precision+recall)
    mcc = matthews_corrcoef(y_test, y_predicted)
    # ROC AUC is a ranking metric: it needs the continuous scores, not the
    # thresholded labels. It is undefined when y_test holds a single class.
    if len(np.unique(y_test)) > 1:
        auc = roc_auc_score(y_test, scores)
    else:
        auc = float('nan')

    ### Radar Graph
    df = pd.DataFrame({
    'group': ['Metrics'],
    'Accuracy: '+"{:.2f}".format(100*accuracy): [100*accuracy],
    'Precision: '+"{:.2f}".format(100*precision): [100*precision],
    'Recall: '+"{:.2f}".format(100*recall): [100*recall],
    'TNR: '+"{:.2f}".format(100*tnr): [100*tnr],
    'F1: '+"{:.2f}".format(100*f1): [100*f1],
    'MCC: '+"{:.2f}".format(100*mcc): [100*mcc],
    'AUC: '+"{:.2f}".format(100*auc): [100*auc]
    })

    categories=list(df)[1:]
    N = len(categories)

    # Repeat the first point at the end so the polygon closes.
    values=df.loc[0].drop('group').values.flatten().tolist()
    values += values[:1]

    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    ax = plt.subplot(111, polar=True)

    plt.xticks(angles[:-1], categories, color='black', size=8)
    plt.yticks([10,20,30,40,50,60,70,80,90], ["","","","","","","","",""], color="grey", size=7)

    plt.ylim(0,100)

    ax.plot(angles, values, linewidth=1, linestyle='solid')

    ax.fill(angles, values, 'b', alpha=0.1)

    plt.show()

    # End Radar

    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()

    print("========= Prediction on Test Set =========")
    print(cm)
    print("-----------")
    print(accuracy, " <-- Accuracy (all correct / all)")
    print(precision, " <-- Precision: (true positives / predicted positives)")
    print(recall, " <-- Recall/TPR (true positives / all actual positives):")
    print(tnr, " <-- TNR (true negatives / all actual negatives):")
    print(f1, " <-- F1" )
    print(mcc, " <-- MCC [-1..0..1]" )
    print(auc, " <-- AUC" )
    plt.show()


In [9]:
def false_negative_list(y_test, y_predicted_proba, threshold, print_values_list=False):
    """Plot and count actual positives scored below ``threshold``.

    Args:
        y_test: True labels; rows equal to 1 are positives.
        y_predicted_proba: Positive-class scores, either 1-D or 2-D with the
            positive class in column 1.
        threshold: Scores strictly below this value count as predicted
            negative.
        print_values_list: When True, also print the rows as a markdown table.

    Returns:
        None.
    """
    scores = np.asarray(y_predicted_proba)
    if scores.ndim != 1:
        scores = scores[:, 1]

    tmp = pd.DataFrame({'y': y_test, 'predicted': scores}).sort_values(by=['predicted'])
    tmp = tmp[(tmp["predicted"] < threshold) & (tmp["y"] == 1)]

    plt.bar(height=tmp["predicted"], x=np.arange(start=1, stop=len(tmp)+1, step=1), color ='maroon', width = 0.4)
    plt.show()
    print("Count: ", len(tmp))

    if print_values_list:
        print(tmp.to_markdown())

In [10]:
def false_positive_list(y_test, y_predicted_proba, threshold, print_values_list=False):
    """Plot and count actual negatives scored at or above ``threshold``.

    Args:
        y_test: True labels; rows equal to 0 are negatives.
        y_predicted_proba: Positive-class scores, either 1-D or 2-D with the
            positive class in column 1.
        threshold: Scores ``>= threshold`` count as predicted positive.
        print_values_list: When True, also print the rows as a markdown table.

    Returns:
        None.
    """
    scores = np.asarray(y_predicted_proba)
    if scores.ndim != 1:
        scores = scores[:, 1]

    tmp = pd.DataFrame({'y': y_test, 'predicted': scores}).sort_values(by=['predicted'])
    # `>=` matches the decision rule used by evaluate_metrics, so a score
    # sitting exactly on the threshold is listed here as well.
    tmp = tmp[(tmp["predicted"] >= threshold) & (tmp["y"] == 0)]

    plt.bar(height=tmp["predicted"], x=np.arange(start=1, stop=len(tmp)+1, step=1), color ='maroon', width = 0.4)
    plt.show()
    print("Count: ", len(tmp))

    if print_values_list:
        print(tmp.to_markdown())

In [11]:
def real_positive_list(y_test, y_predicted_proba, print_values_list=False, range_start=0, range_end=10):
    """Plot the scores of all actual positives, highest first, and count them.

    Args:
        y_test: True labels; rows equal to 1 are positives.
        y_predicted_proba: Positive-class scores, either 1-D or 2-D with the
            positive class in column 1.
        print_values_list: When True, print rows ``range_start:range_end`` of
            the score-sorted table as markdown.
        range_start: First row (inclusive) of the printed slice.
        range_end: Last row (exclusive) of the printed slice.

    Returns:
        None.
    """
    scores = np.asarray(y_predicted_proba)
    if scores.ndim != 1:
        scores = scores[:, 1]

    tmp = pd.DataFrame({'y': y_test, 'predicted': scores}).sort_values(by=['predicted'], ascending=False)
    tmp_p = tmp[tmp["y"]==1]

    plt.bar(height=tmp_p["predicted"], x=np.arange(start=1, stop=len(tmp_p)+1, step=1), color ='maroon', width = 0.4)
    plt.show()

    print("Count: ", len(tmp_p))

    if print_values_list:
        # NOTE(review): the slice is taken from all rows (`tmp`), not only the
        # positives (`tmp_p`) - presumably to show where positives rank among
        # every prediction; confirm.
        print(tmp.iloc[range_start:range_end].to_markdown())

In [12]:
def full_histogram(y_test, y_predicted_proba, buckets_count=1000, graph_negative_threshold=0.5, print_values_list=False):
    """Plot score histograms for actual positives and high-scoring negatives.

    Scores are binned over [0, 1]. Each bin is drawn as two adjacent
    half-width bars: red for actual positives and green for actual negatives
    whose score is above ``graph_negative_threshold``.

    Args:
        y_test: True labels (1 = positive, 0 = negative).
        y_predicted_proba: Positive-class scores, either 1-D or 2-D with the
            positive class in column 1.
        buckets_count: Number of bin *edges* over [0, 1]; must be >= 2.
        graph_negative_threshold: Negatives scored at or below this value are
            left out of the green series.
        print_values_list: When True, also print both row sets as markdown.

    Returns:
        None.

    Raises:
        ValueError: If ``buckets_count`` is smaller than 2.
    """
    if buckets_count < 2:
        raise ValueError("buckets_count must be at least 2")

    scores = np.asarray(y_predicted_proba)
    if scores.ndim != 1:
        scores = scores[:, 1]

    y_test_pred = pd.DataFrame({'IsFraud': y_test, 'prediction': scores})
    y_test_pred = y_test_pred.sort_values(by=['prediction'])

    bins = np.linspace(0, 1, buckets_count)
    plt.figure(figsize=(40, 10))

    # Bar geometry follows the bin edges, so the chart stays correct for any
    # buckets_count, not only the default.
    half_width = (bins[1] - bins[0]) / 2
    left_edges = bins[:-1]

    y_neg = y_test_pred[(y_test_pred['IsFraud']==0) & (y_test_pred["prediction"]>graph_negative_threshold)]

    n_hist = np.histogram(y_neg['prediction'], bins=bins)
    plt.bar(height=n_hist[0], x=left_edges + half_width, color ='green', width = half_width, align='edge')

    y_pos = y_test_pred[y_test_pred['IsFraud']==1]

    p_hist = np.histogram(y_pos['prediction'], bins=bins)
    plt.bar(height=p_hist[0], x=left_edges, color ='red', width = half_width, align='edge')

    plt.show()

    total_negatives = len(y_test_pred[y_test_pred['IsFraud']==0])
    print("RED: Fraud Label Positive;", "Showing: ", len(y_pos))
    print("GREEN: Fraud Label Negative;", "Showing: ", len(y_neg), " out of ", total_negatives)
    print("For visual purposes ", total_negatives-len(y_neg), " real negative records with prediction <= ", graph_negative_threshold, " are excluded")

    if print_values_list:
        print("=============================")
        print("Real Positive predictions")
        print(y_pos.to_markdown())
        print("=============================")
        print("Real Negative predictions")
        print(y_neg.to_markdown())


In [13]:
def positive_near_zero_report(y_test, y_predicted_proba, near_zero_threshold=0.001):
    """Return the index labels of actual positives scored below a threshold.

    The matching scores are also printed.

    Args:
        y_test: True labels; rows equal to 1 are positives. Its index (if it
            has one) supplies the returned labels.
        y_predicted_proba: Positive-class scores, either 1-D or 2-D with the
            positive class in column 1.
        near_zero_threshold: Only scores strictly below this value are kept.

    Returns:
        list: Index labels of the matching rows, ordered by ascending score.
    """
    scores = np.asarray(y_predicted_proba)
    if scores.ndim != 1:
        scores = scores[:, 1]

    tmp_p = pd.DataFrame({'y': y_test, 'predicted': scores}).sort_values(by=['predicted'])
    tmp_p = tmp_p[(tmp_p["y"]==1) & (tmp_p["predicted"]<near_zero_threshold)]

    print(tmp_p["predicted"])

    return tmp_p.index.to_list()

In [14]:
def negative_near_one_report(y_test, y_predicted_proba, near_one_threshold=0.009):
    """Return the index labels of actual negatives scored above a threshold.

    Args:
        y_test: True labels; rows equal to 0 are negatives. Its index (if it
            has one) supplies the returned labels.
        y_predicted_proba: Positive-class scores, either 1-D or 2-D with the
            positive class in column 1.
        near_one_threshold: Only scores strictly above this value are kept.
            NOTE(review): the default of 0.009 is far from 1 for a "near one"
            report - possibly meant to be 0.999; confirm before relying on it.

    Returns:
        list: Index labels of the matching rows, ordered by ascending score.
    """
    scores = np.asarray(y_predicted_proba)
    if scores.ndim != 1:
        scores = scores[:, 1]

    tmp_p = pd.DataFrame({'y': y_test, 'predicted': scores}).sort_values(by=['predicted'])
    tmp_p = tmp_p[(tmp_p["y"]==0) & (tmp_p["predicted"]>near_one_threshold)]

    return tmp_p.index.to_list()

In [15]:
def positive_near_one_report(y_test, y_predicted_proba, near_one_threshold=0.009):
    """Return the index labels of actual positives scored above a threshold.

    Args:
        y_test: True labels; rows equal to 1 are positives. Its index (if it
            has one) supplies the returned labels.
        y_predicted_proba: Positive-class scores, either 1-D or 2-D with the
            positive class in column 1.
        near_one_threshold: Only scores strictly above this value are kept.
            NOTE(review): the default of 0.009 is far from 1 for a "near one"
            report - possibly meant to be 0.999; confirm before relying on it.

    Returns:
        list: Index labels of the matching rows, ordered by ascending score.
    """
    scores = np.asarray(y_predicted_proba)
    if scores.ndim != 1:
        scores = scores[:, 1]

    tmp_p = pd.DataFrame({'y': y_test, 'predicted': scores}).sort_values(by=['predicted'])
    tmp_p = tmp_p[(tmp_p["y"]==1) & (tmp_p["predicted"]>near_one_threshold)]

    return tmp_p.index.to_list()

In [16]:
#import association_metrics as am

def graphCorrelationCategorical(df_dt, baseField):
    """Plot correlations among the categorical/boolean columns and ``baseField``.

    Every column is integer-encoded with ``pd.factorize`` before the Pearson
    correlation is computed. The top panel is the full heatmap; the bottom
    panel and the printed markdown table rank the absolute correlations with
    ``baseField``.

    Args:
        df_dt: Source DataFrame.
        baseField: Name of the target column; it is placed first.

    Returns:
        None.
    """
    categories_df = df_dt.select_dtypes(include=["category", "bool"])
    if baseField in categories_df.columns:
        # A bool/category target is already selected; inserting it again
        # would raise ValueError, so just move it to the front.
        others = [name for name in categories_df.columns if name != baseField]
        categories_df = categories_df[[baseField] + others]
    else:
        categories_df.insert(loc=0, column=baseField, value=df_dt[baseField])
    corr_cat = categories_df.apply(lambda x : pd.factorize(x)[0]).corr(method='pearson', min_periods=1)

    plt.subplot(2, 1, 1)
    sns.heatmap(corr_cat, annot=True, xticklabels=True, yticklabels=True)

    # Flatten to a (column, row) indexed Series, ranked by |correlation|.
    corr_cat = corr_cat.unstack().abs().sort_values(ascending=False)

    plt.subplot(2, 1, 2)
    corr_cat[baseField].plot.bar(legend=False, figsize=(20, 20))

    plt.show()

    print(corr_cat[baseField].to_markdown())

In [17]:
def graphCorrelationNumerical(df_dt, baseField):
    """Plot correlations among the numeric columns and ``baseField``.

    The top panel is the full correlation heatmap; the bottom panel and the
    printed markdown table rank the absolute correlations with ``baseField``.

    Args:
        df_dt: Source DataFrame.
        baseField: Name of the target column. If it is not of a numeric dtype
            it is added as 0.0/1.0 (boolean) or as ``pd.factorize`` codes
            (anything else).

    Returns:
        None.
    """
    numerics_df = df_dt.select_dtypes(include="number")
    if baseField not in numerics_df.columns:
        # select_dtypes("number") leaves out bool/category targets, which
        # would make the corr[baseField] lookup below fail with KeyError.
        target = df_dt[baseField]
        if pd.api.types.is_bool_dtype(target):
            encoded = target.astype(float).to_numpy()
        else:
            encoded = pd.factorize(target)[0]
        numerics_df.insert(loc=0, column=baseField, value=encoded)
    corr = numerics_df.corr()

    plt.subplot(2, 1, 1)
    sns.heatmap(corr, annot=True, xticklabels=True, yticklabels=True)

    # Flatten to a (column, row) indexed Series, ranked by |correlation|.
    corr = corr.unstack().abs().sort_values(ascending=False)

    plt.subplot(2, 1, 2)
    corr[baseField].plot.bar(legend=False, figsize=(20, 20))

    plt.show()

    print(corr[baseField].to_markdown())


In [18]:
def graphCorrelationGeneral(df_dt, baseField):
    """Plot correlations among all numeric, categorical and boolean columns.

    Numeric columns keep their values, booleans become 0.0/1.0 and other
    columns are integer-encoded with ``pd.factorize``. The top panel is the
    full Pearson heatmap; the bottom panel and the printed markdown table
    rank the absolute correlations with ``baseField``.

    Args:
        df_dt: Source DataFrame.
        baseField: Name of the target column.

    Returns:
        None.
    """
    def _as_numeric(col):
        # Booleans are checked first: pandas also reports them as numeric.
        if pd.api.types.is_bool_dtype(col):
            return col.astype(float)
        if pd.api.types.is_numeric_dtype(col):
            # factorize would replace real values by order-of-appearance
            # codes, which makes Pearson meaningless for numeric data.
            return col
        return pd.Series(pd.factorize(col)[0], index=col.index)

    general_df = df_dt.select_dtypes(include=["number", "category", "bool"])
    corr_gen = general_df.apply(_as_numeric).corr(method='pearson', min_periods=1)

    plt.subplot(2, 1, 1)
    sns.heatmap(corr_gen, annot=False, xticklabels=True, yticklabels=True)

    # Flatten to a (column, row) indexed Series, ranked by |correlation|.
    corr_gen = corr_gen.unstack().abs().sort_values(ascending=False)

    plt.subplot(2, 1, 2)
    corr_gen[baseField].plot.bar(legend=False, figsize=(20, 20))

    plt.show()

    print(corr_gen[baseField].to_markdown())