In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp
import random

In [None]:
# --- PARAMETERS ---
# Template kept for reference: INPUT_FILE = f'{var1}/{var2}'
# Path of the CSV file that the notebook loads with pd.read_csv.
# A plain string is used because the path contains no placeholders to interpolate.
INPUT_FILE = 'enunciado/diabetes.csv'


In [None]:
def normalize(df, cols):
    '''
    Min-max scale the selected columns so their values fall in [0, 1].

    df: Input dataframe. It is NOT modified; the scaling is applied to a copy.
    cols: Columns to normalize

    Returns a new dataframe with the same rows and columns as `df`, where each
    column listed in `cols` has been replaced by (value - min) / (max - min).
    A column whose values are all equal has a zero range; it is set to 0.0
    instead of producing NaN from a 0/0 division.
    '''
    # Work on a copy so re-running the cell never re-normalizes the raw data.
    df_scaled = df.copy()
    for col in cols:
        # Series.min()/max() skip NaN, unlike the builtin min()/max().
        val_min = df_scaled[col].min()
        val_max = df_scaled[col].max()
        val_range = val_max - val_min
        if val_range == 0:
            # Constant column: every value equals the minimum.
            df_scaled[col] = 0.0
        else:
            df_scaled[col] = (df_scaled[col] - val_min) / val_range
    return df_scaled

In [None]:
def get_folds(k=5, df=None, col=''):
    '''
    Assign every row to one of k folds, stratified by the values of `col`.

    k: Number of folds (fold ids go from 1 to k)
    df: Input dataframe. It is not modified.
    col: Column holding the class used for stratification

    Returns a new dataframe with the rows of `df` (original index kept) plus a
    'fold' column, sorted by fold. Within each class the rows are spread over
    the folds as evenly as possible, so each fold keeps roughly the class
    proportions of the whole dataframe.

    Raises ValueError if `df` is None or `k` is smaller than 1.
    '''
    if df is None:
        raise ValueError('get_folds needs a dataframe (df is None).')
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}.')

    parts = []
    for val in df[col].unique():
        # Copy the slice so adding the 'fold' column touches neither `df`
        # nor a temporary view (avoids SettingWithCopyWarning).
        df_temp = df[df[col] == val].copy()

        # Shuffle the positions 0..n-1 with the `random` module, so
        # random.seed() controls the assignment.
        ar_fold = list(range(len(df_temp)))
        random.shuffle(ar_fold)

        # Position p goes to fold p % k + 1: fold sizes differ by at most one.
        df_temp['fold'] = [pos % k + 1 for pos in ar_fold]
        parts.append(df_temp)

    if parts:
        # A single concat keeps the column dtypes of `df`.
        df_w_folds = pd.concat(parts)
    else:
        # No rows at all: return an empty frame that still has the 'fold' column.
        df_w_folds = df.copy()
        df_w_folds['fold'] = np.array([], dtype=int)

    df_w_folds = df_w_folds.sort_values(by=['fold'])
    print(df_w_folds.head(30))
    return df_w_folds

In [None]:
def _f_beta_for_label(confusion_matrix, pos, beta):
    '''
    Precision, recall and F-beta of one class, read from a confusion matrix.

    confusion_matrix: Square array (rows: real values, cols: predicted values)
    pos: Row/column position of the class treated as positive
    beta: Weight of recall relative to precision in the F-measure

    Returns (precision, recall, f_beta). Any ratio whose denominator is zero
    is reported as 0.0 instead of NaN.
    '''
    true_pos = confusion_matrix[pos, pos]
    predicted_pos = confusion_matrix[:, pos].sum()  # VP + FP
    real_pos = confusion_matrix[pos, :].sum()       # VP + FN

    prec = true_pos / predicted_pos if predicted_pos > 0 else 0.0
    rec = true_pos / real_pos if real_pos > 0 else 0.0

    denom = (beta**2) * prec + rec
    f_beta = (1 + beta**2) * (prec * rec) / denom if denom > 0 else 0.0
    return prec, rec, f_beta


def cross_validation(k_fold, ar_k_knn, df, cols, col_real, beta, pos_label=1):
    '''
    Evaluate k-NN with stratified k-fold cross-validation and print the results.

    k_fold: Number of folds
    ar_k_knn: Iterable with the k values (number of neighbours) to evaluate
    df: Input dataframe with the feature columns and the class column
    cols: Feature columns used to compute the distance
    col_real: Column holding the real class
    beta: Beta of the F-measure (1 gives the F1 score)
    pos_label: Class value treated as positive for precision/recall (default 1)

    The folds are drawn once (via get_folds) and reused for every k in
    `ar_k_knn`. For each k it prints a per-fold table with accuracy and
    F-measure, followed by their average and standard deviation.
    Returns None.

    Raises KeyError if `pos_label` is not a value of `col_real`.
    '''
    df = get_folds(k_fold, df, col_real)

    labels = sorted(df[col_real].unique())
    classes_size = len(labels)
    # Map each class value to its row/column position in the confusion matrix.
    pos_lab_dict = {label: pos for pos, label in enumerate(labels)}
    if pos_label not in pos_lab_dict:
        raise KeyError(f'pos_label {pos_label!r} is not a value of column {col_real!r}')
    pos = pos_lab_dict[pos_label]

    for k_knn in ar_k_knn:
        print(f'K for k-NN: {k_knn}')
        # Collect one record per fold and build the dataframe once at the end.
        fold_results = []
        for fold in range(1, k_fold + 1):
            is_test = df['fold'] == fold
            df_train = df[~is_test]
            df_test = df[is_test]

            # Confusion matrix. Rows: real values, cols: predicted values.
            confusion_matrix = np.zeros((classes_size, classes_size))
            for _, row in df_test.iterrows():
                pred = knn(k=k_knn, df_train=df_train, row_test=row, cols=cols, col_real=col_real)
                confusion_matrix[pos_lab_dict[row[col_real]], pos_lab_dict[pred]] += 1

            # The diagonal holds the correct classifications.
            total = confusion_matrix.sum()
            # An empty test fold has no defined accuracy.
            acc = np.trace(confusion_matrix) / total if total > 0 else float('nan')

            # F-measure of the positive class only (not a macro average).
            _, _, f1 = _f_beta_for_label(confusion_matrix, pos, beta)
            fold_results.append({'fold': fold, 'accuracia': acc, 'f-measure': f1})

        df_out = pd.DataFrame(fold_results, columns=['fold', 'accuracia', 'f-measure'])
        print(df_out)
        avg_acc = np.average(df_out['accuracia'])
        avg_f1 = np.average(df_out['f-measure'])
        std_acc = np.std(df_out['accuracia'])
        std_f1 = np.std(df_out['f-measure'])

        print(f'Average Acc: {avg_acc}, Average f1-measure: {avg_f1}')
        print(f'Std Acc: {std_acc}, Std f1-measure: {std_f1}')

In [None]:
def knn(k=5, df_train=None, row_test=None, cols=[], col_real=''):
    '''
    Classify one row with k-nearest neighbours (Euclidean distance).

    k: Number of neighbours that vote
    df_train: Training dataframe (not modified)
    row_test: Row to classify; must be indexable by the names in `cols`
    cols: Feature columns used to compute the distance
    col_real: Column of `df_train` holding the class

    Returns the class that appears most often among the k nearest training
    rows. When several classes tie, the one whose member is closest to
    `row_test` wins, so the result does not depend on set/hash ordering.
    If `k` exceeds the number of training rows, all rows vote.

    Raises ValueError if there is no training data or `k` is smaller than 1.
    '''
    if df_train is None or len(df_train) == 0:
        raise ValueError('knn needs a non-empty training dataframe.')
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}.')

    # Accumulate squared differences in a plain array instead of copying the
    # whole training frame for every test row.
    sq_dist = np.zeros(len(df_train), dtype=float)
    for col in cols:
        diff = df_train[col].to_numpy(dtype=float) - float(row_test[col])
        sq_dist += diff ** 2

    # The square root is monotonic, so ordering by squared distance gives the
    # same neighbours. A stable sort keeps equal distances in row order.
    nearest = np.argsort(sq_dist, kind='stable')[:k]
    res_list = df_train[col_real].iloc[nearest].to_list()

    # Count votes in nearest-first order. dicts keep insertion order and max()
    # returns the first maximal key, which gives the tie-break described above.
    votes = {}
    for label in res_list:
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)

#     print(df.head(k).to_list)
#     print(row_test)
#     return df

## INIT
Load the dataset, normalize the feature columns, and run the repeated cross-validation experiment.

In [None]:
# Read dataframe
df = pd.read_csv(INPUT_FILE)
# Columns to normalize (the same columns are used as k-NN features below)
ar_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
              'DiabetesPedigreeFunction', 'Age']
df_norm = normalize(df, ar_columns)

# Fold assignment shuffles with the `random` module; seeding it once makes
# every repetition reproducible under Restart & Run All.
SEED = 42
random.seed(SEED)

r = 10                 # number of repetitions of the whole cross-validation
ar_k_knn = [3, 5, 7]   # k values (number of neighbours) to evaluate
for i in range(r):
    print(f'R: {i+1}')
    # 10 folds, class column 'Outcome', beta = 1 (F1 score)
    cross_validation(10, ar_k_knn, df_norm, ar_columns, 'Outcome', 1)