In [44]:
import pandas as pd
import numpy as np
import matplotlib as mp
import random

In [4]:
''' PARAMETERS '''
# INPUT_FILE = f'{var1}/{var2}'
INPUT_FILE = f'enunciado/diabetes.csv'


In [171]:
def normalize(df, cols):
    '''
    df: Input dataframe
    cols: Columns to normalize
    '''
    for col in cols:
        val_max = max(df[col])
        val_min = min(df[col])
        df[col] = (df[col] - val_min)/(val_max - val_min)
        # print(f'Column: {col}, min: {val_min}, max:{val_max}.')
    return df

In [172]:
def get_folds(k=5, df=None, col=''):
    
    df['fold'] = -1
    
    df_w_folds = pd.DataFrame(columns=df.columns)
    
    uniq_class_list = df[col].unique()
    
    for val in uniq_class_list:
        df_temp = df[df[col] == val]
        ar_fold = np.arange(len(df_temp))
        random.shuffle(ar_fold)
        
        # Split array in n folds 
        f = lambda x: x%k+1
        ar_fold = f(ar_fold)
        df_temp['fold'] = ar_fold
        df_w_folds = df_w_folds.append(df_temp)
    
    df_w_folds = df_w_folds.sort_values(by=['fold'])
    return df_w_folds

In [185]:
def cross_validation(k_fold, ar_k_knn, df, cols, col_real, beta):
    df = get_folds(k_fold, df, col_real)
        
    labels = df[col_real].unique()
    labels.sort()
    classes_size = len(labels)
    pos_lab_dict = dict(zip(labels, [i for i in range(classes_size)]))    
#     print(pos_lab_dict)
    for k_knn in ar_k_knn:
        print(f'K for k-NN: {k_knn}')
        df_out = pd.DataFrame(columns=['fold', 'accuracia', 'f-measure'])
        for fold in range(k_fold):
            df_train = df[df['fold'] != fold+1]
            df_test = df[df['fold'] == fold+1]
    #         print(f'df_train: {len(df_train)}, df_test: {len(df_test)}')
            ''' create the confusion matrix:
            Rows: Real Values
            Cols: Predicted Values
            '''
            confusion_matrix = np.zeros((classes_size, classes_size))

            for index, row in df_test.iterrows():
                pred = knn(k=k_knn, df_train=df_train, row_test=row, cols=cols, col_real=col_real)
                # Get position in the confusion matrix of prediction
                col_pred_pos = pos_lab_dict[pred]
                row_real_pos = pos_lab_dict[row[col_real]]
                confusion_matrix[row_real_pos][col_pred_pos] += 1

    #         print(confusion_matrix)

            # Diagonal contains the right classification    
            acc = np.trace(confusion_matrix)/np.sum(confusion_matrix) 
            prec_accum = 0
            rec_accum = 0
    #         for label in labels:
            for label in [1]:
                pos = pos_lab_dict[label]
                # VP/VP+FP
                prec_accum = confusion_matrix[pos][pos] / sum( confusion_matrix[i][pos] for i in range(classes_size)) 
                # VP/VP+FN
                rec_accum = confusion_matrix[pos][pos] / sum( confusion_matrix[pos][i] for i in range(classes_size)) 

            ''' f (macro): The average of all precision and recall: For multiclass '''
            prec = prec_accum #/ classes_size
            rec = rec_accum #/ classes_size
    #         print(f'prec: {prec}, rec: {rec}')
            f1 = (1+beta**2)*( (prec*rec) / ((beta**2)*prec + rec ) )
            df_out = df_out.append({'fold':(int)(fold+1), 'accuracia': acc, 'f-measure': f1}, ignore_index=True)

        print(df_out)
        avg_acc = np.average(df_out['accuracia'])
        avg_f1 = np.average(df_out['f-measure'])
        std_acc = np.std(df_out['accuracia'])
        std_f1 = np.std(df_out['f-measure'])

        print(f'Average Acc: {avg_acc}, Average f1-measure: {avg_f1}')
        print(f'Std Acc: {std_acc}, Std f1-measure: {std_f1}')

    #     return df_out

In [186]:
def knn(k=5, df_train=None, row_test=None, cols=[], col_real=''):
    df = df_train.copy()
    df['euc_dist'] = 0
    for col in cols:
        df['euc_dist'] += (df[col]-row_test[col])**2
    df['euc_dist'] = np.sqrt(df['euc_dist'])
    df = df.sort_values(by=['euc_dist'], ascending=True)
    
    # Get the k nearest possible values
    res_list = df.head(k)[col_real].to_list()
    # Return the most repeated element
    return max(set(res_list), key = res_list.count)

#     print(df.head(k).to_list)
#     print(row_test)
#     return df

## INIT
Call main

In [187]:
''' Read dataframe '''
df = pd.read_csv(INPUT_FILE)
# Columns to normalize 
ar_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 
              'DiabetesPedigreeFunction', 'Age']
df_norm = normalize(df, ar_columns)
r = 10
ar_k_knn = [3, 5, 7]
for i in range(r):
    print(f'R: {i+1}')
    cross_validation(10, ar_k_knn, df_norm, ar_columns, 'Outcome', 1)

R: 1
K for k-NN: 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


   fold  accuracia  f-measure
0   1.0   0.792208   0.692308
Average Acc: 0.7922077922077922, Average f1-measure: 0.6923076923076923
Std Acc: 0.0, Std f1-measure: 0.0
   fold  accuracia  f-measure
0   1.0   0.792208   0.692308
1   2.0   0.792208   0.666667
Average Acc: 0.7922077922077922, Average f1-measure: 0.6794871794871795
Std Acc: 0.0, Std f1-measure: 0.012820512820512832
   fold  accuracia  f-measure
0   1.0   0.792208   0.692308
1   2.0   0.792208   0.666667
2   3.0   0.649351   0.584615
Average Acc: 0.7445887445887447, Average f1-measure: 0.6478632478632479
Std Acc: 0.0673435029701474, Std f1-measure: 0.045931722538613315
   fold  accuracia  f-measure
0   1.0   0.792208   0.692308
1   2.0   0.792208   0.666667
2   3.0   0.649351   0.584615
3   4.0   0.688312   0.500000
Average Acc: 0.7305194805194806, Average f1-measure: 0.610897435897436
Std Acc: 0.06320754004523309, Std f1-measure: 0.07537709276961019
   fold  accuracia  f-measure
0   1.0   0.792208   0.692308
1   2.0   0.7922

   fold  accuracia  f-measure
0   1.0   0.857143   0.775510
1   2.0   0.792208   0.680000
2   3.0   0.727273   0.644068
3   4.0   0.649351   0.470588
4   5.0   0.610390   0.444444
5   6.0   0.727273   0.571429
6   7.0   0.740260   0.583333
7   8.0   0.805195   0.693878
Average Acc: 0.7386363636363636, Average f1-measure: 0.6079062670265847
Std Acc: 0.07584835315729654, Std f1-measure: 0.1057696580847704
   fold  accuracia  f-measure
0   1.0   0.857143   0.775510
1   2.0   0.792208   0.680000
2   3.0   0.727273   0.644068
3   4.0   0.649351   0.470588
4   5.0   0.610390   0.444444
5   6.0   0.727273   0.571429
6   7.0   0.740260   0.583333
7   8.0   0.805195   0.693878
8   9.0   0.802632   0.634146
Average Acc: 0.7457469431153642, Average f1-measure: 0.6108218308528991
Std Acc: 0.0742848316589217, Std f1-measure: 0.10006098222005333
   fold  accuracia  f-measure
0   1.0   0.857143   0.775510
1   2.0   0.792208   0.680000
2   3.0   0.727273   0.644068
3   4.0   0.649351   0.470588
4   5.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


   fold  accuracia  f-measure
0   1.0   0.766234   0.678571
Average Acc: 0.7662337662337663, Average f1-measure: 0.6785714285714286
Std Acc: 0.0, Std f1-measure: 0.0
   fold  accuracia  f-measure
0   1.0   0.766234   0.678571
1   2.0   0.779221   0.701754
Average Acc: 0.7727272727272727, Average f1-measure: 0.6901629072681704
Std Acc: 0.006493506493506496, Std f1-measure: 0.01159147869674182
   fold  accuracia  f-measure
0   1.0   0.766234   0.678571
1   2.0   0.779221   0.701754
2   3.0   0.792208   0.680000
Average Acc: 0.7792207792207791, Average f1-measure: 0.6867752715121136
Std Acc: 0.010603851700360082, Std f1-measure: 0.010607877777832939
   fold  accuracia  f-measure
0   1.0   0.766234   0.678571
1   2.0   0.779221   0.701754
2   3.0   0.792208   0.680000
3   4.0   0.740260   0.600000
Average Acc: 0.7694805194805194, Average f1-measure: 0.6650814536340852
Std Acc: 0.01920805124382993, Std f1-measure: 0.03868152667084479
   fold  accuracia  f-measure
0   1.0   0.766234   0.6785

   fold  accuracia  f-measure
0   1.0   0.714286   0.576923
1   2.0   0.740260   0.615385
2   3.0   0.740260   0.600000
3   4.0   0.753247   0.577778
4   5.0   0.727273   0.588235
5   6.0   0.792208   0.692308
6   7.0   0.727273   0.533333
Average Acc: 0.7421150278293135, Average f1-measure: 0.5977088271205917
Std Acc: 0.02346773773779874, Std f1-measure: 0.04524516458094177
   fold  accuracia  f-measure
0   1.0   0.714286   0.576923
1   2.0   0.740260   0.615385
2   3.0   0.740260   0.600000
3   4.0   0.753247   0.577778
4   5.0   0.727273   0.588235
5   6.0   0.792208   0.692308
6   7.0   0.727273   0.533333
7   8.0   0.675325   0.390244
Average Acc: 0.7337662337662338, Average f1-measure: 0.5717757115353959
Std Acc: 0.031141763138394292, Std f1-measure: 0.08061587741194576
   fold  accuracia  f-measure
0   1.0   0.714286   0.576923
1   2.0   0.740260   0.615385
2   3.0   0.740260   0.600000
3   4.0   0.753247   0.577778
4   5.0   0.727273   0.588235
5   6.0   0.792208   0.692308
6  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


   fold  accuracia  f-measure
0   1.0    0.74026   0.565217
Average Acc: 0.7402597402597403, Average f1-measure: 0.5652173913043478
Std Acc: 0.0, Std f1-measure: 0.0
   fold  accuracia  f-measure
0   1.0   0.740260   0.565217
1   2.0   0.727273   0.631579
Average Acc: 0.7337662337662338, Average f1-measure: 0.5983981693363845
Std Acc: 0.006493506493506496, Std f1-measure: 0.03318077803203662
   fold  accuracia  f-measure
0   1.0   0.740260   0.565217
1   2.0   0.727273   0.631579
2   3.0   0.701299   0.581818
Average Acc: 0.722943722943723, Average f1-measure: 0.5928715068303169
Std Acc: 0.016197651024995423, Std f1-measure: 0.02819687956942756
   fold  accuracia  f-measure
0   1.0   0.740260   0.565217
1   2.0   0.727273   0.631579
2   3.0   0.701299   0.581818
3   4.0   0.688312   0.500000
Average Acc: 0.7142857142857143, Average f1-measure: 0.5696536301227377
Std Acc: 0.020534270520573897, Std f1-measure: 0.04704792672918382
   fold  accuracia  f-measure
0   1.0   0.740260   0.56521

KeyboardInterrupt: 