# 2. Quality Assessment

### Get the statistical data of all classifiers

In [32]:
import ast
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [15]:
# Process all the quality files exported from GEE
def process_csv_files(directory):
    """Collect the exported confusion matrices below ``directory`` into one CSV.

    The directory tree is walked recursively. Every file whose name starts
    with ``ConfusionMatrix``, ends with ``.csv`` and contains
    ``ConfusionMatrix_<classifier>_<YYYY-MM-DD>.csv`` is read; the ``matrix``
    cell of its first data row is parsed and flattened into ``cm_<i>_<j>``
    columns. Row 0 and column 0 of the parsed matrix are dropped, so the
    ``i``/``j`` in the column names are the original matrix indices starting
    at 1. ``filename``, ``classifier`` and ``date`` are added to each record.

    The combined table is written to ``Quality_7K.csv`` inside ``directory``.

    Parameters
    ----------
    directory : str
        Root folder that is searched for the exported CSV files and that
        receives the combined output file.

    Returns
    -------
    None

    Raises
    ------
    ValueError, SyntaxError
        If a ``matrix`` cell is not a plain Python literal.

    Warns
    -----
    UserWarning
        For files with the ``ConfusionMatrix`` prefix whose name does not
        carry a classifier and a date; such files are skipped.
    """
    # One pattern captures classifier and date together, so the two parts
    # always come from the same match.
    name_pattern = re.compile(r'ConfusionMatrix_(.*?)_(\d{4}-\d{2}-\d{2})\.csv')
    results = []

    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if not (filename.endswith(".csv") and filename.startswith("ConfusionMatrix")):
                continue

            name_match = name_pattern.search(filename)
            if name_match is None:
                # Previously this case failed with AttributeError on None.group(1).
                warnings.warn(
                    f"Skipping {filename!r}: expected a name like "
                    "'ConfusionMatrix_<classifier>_<YYYY-MM-DD>.csv'"
                )
                continue
            classifier, date = name_match.groups()

            # Read CSV file
            export_df = pd.read_csv(os.path.join(root, filename))

            # Parse the matrix stored as text in the first row. literal_eval
            # accepts only Python literals, so file contents are never executed
            # (the former eval() would run arbitrary expressions).
            matrix = ast.literal_eval(export_df.iloc[0]['matrix'])

            # Flatten the matrix, skipping its first row and the first element
            # of every remaining row.
            matrix_data = {
                f'cm_{i}_{j}': value
                for i, matrix_row in enumerate(matrix[1:], start=1)
                for j, value in enumerate(matrix_row[1:], start=1)
            }

            # Append file name, classifier and date to the matrix data
            matrix_data.update({
                'filename': filename,
                'classifier': classifier,
                'date': date
            })

            results.append(matrix_data)

    # Create DataFrame from results and write to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(directory, 'Quality_7K.csv'), index=False)

# Example call of the function (r"path" is presumably a placeholder for the export directory)
directory_path = r"path"
process_csv_files(directory_path)

In [54]:
# Read the combined quality CSV in case you start here
# (r"quality_path" is presumably a placeholder for the real file path)
df = pd.read_csv(r"quality_path", delimiter=',')

In [55]:
# Inspect the column names of the loaded table
df.columns

Index(['cm_1_1', 'cm_1_2', 'cm_1_3', 'cm_1_4', 'cm_1_5', 'cm_1_6', 'cm_1_7',
       'cm_2_1', 'cm_2_2', 'cm_2_3', 'cm_2_4', 'cm_2_5', 'cm_2_6', 'cm_2_7',
       'cm_3_1', 'cm_3_2', 'cm_3_3', 'cm_3_4', 'cm_3_5', 'cm_3_6', 'cm_3_7',
       'cm_4_1', 'cm_4_2', 'cm_4_3', 'cm_4_4', 'cm_4_5', 'cm_4_6', 'cm_4_7',
       'cm_5_1', 'cm_5_2', 'cm_5_3', 'cm_5_4', 'cm_5_5', 'cm_5_6', 'cm_5_7',
       'cm_6_1', 'cm_6_2', 'cm_6_3', 'cm_6_4', 'cm_6_5', 'cm_6_6', 'cm_6_7',
       'cm_7_1', 'cm_7_2', 'cm_7_3', 'cm_7_4', 'cm_7_5', 'cm_7_6', 'cm_7_7',
       'filename', 'classifier', 'date'],
      dtype='object')

In [56]:
# Function to rebuild a confusion matrix from the flattened raw data
def get_confusion_matrix(row_number, results_df, num_classes=7):
    """Rebuild the square confusion matrix stored in one row of ``results_df``.

    Parameters
    ----------
    row_number : int
        Positional index (``iloc``) of the row to read.
    results_df : pandas.DataFrame
        Table with the flattened cells in columns named ``cm_<i>_<j>``,
        where ``i`` and ``j`` run from 1 to ``num_classes``.
    num_classes : int, optional
        Number of rows/columns of the matrix. Defaults to 7, the value that
        was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(num_classes, num_classes)`` whose element
        ``[i-1, j-1]`` holds the value of column ``cm_<i>_<j>``.

    Raises
    ------
    KeyError
        If one of the expected ``cm_<i>_<j>`` columns is missing.
    """
    row = results_df.iloc[row_number]

    class_ids = range(1, num_classes + 1)
    # Row-major order, so the reshape puts cm_<i>_<j> at [i-1, j-1].
    cells = [row[f'cm_{i}_{j}'] for i in class_ids for j in class_ids]

    return np.array(cells, dtype=int).reshape(num_classes, num_classes)

# Rebuild the confusion matrix of the row at position 1 (the second row) as an example
mat1 = get_confusion_matrix(1, df)

print(mat1)

[[343   0   4   0   0   0   4]
 [  0   6   0   0   0   0   0]
 [  1   0 335   0   8   0   3]
 [  6   0   0 317   0   0   0]
 [  0   0   0   0 310   0   1]
 [ 24  11  39  13 122  49  79]
 [  5   0  14   0   0   0 306]]


In [57]:
# Derive the overall and the per-class scores from the example matrix
true_positives = np.diag(mat1)  # diagonal: samples where row class == column class

actual = mat1.sum(axis=1)  # row totals

predicted = mat1.sum(axis=0)  # column totals

total = mat1.sum()  # all samples in the matrix

# Share of each row total that lies on the diagonal
accuracies = true_positives / actual

# Share of all samples that lies on the diagonal
accuracy = true_positives.sum() / total

print(f"Accuracy Score (Overall): {accuracy}")
print("Accuracy Score (Per Class):")
for i, acc in enumerate(accuracies):
    print(f"Class {i+1}: {acc}")

Accuracy Score (Overall): 0.833
Accuracy Score (Per Class):
Class 1: 0.9772079772079773
Class 2: 1.0
Class 3: 0.9654178674351584
Class 4: 0.9814241486068112
Class 5: 0.9967845659163987
Class 6: 0.14540059347181009
Class 7: 0.9415384615384615


### Prepare the quality data

In [58]:
# we can also use prewritten functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

In [65]:
def calculate_accuracy(row):
    """Return the overall accuracy of the 7x7 confusion matrix stored in ``row``.

    ``row`` must support ``row['cm_<i>_<j>']`` for ``i, j`` in 1..7 (e.g. a
    DataFrame row passed by ``df.apply(..., axis=1)``). The result is the sum
    of the diagonal cells divided by the sum of all cells.
    """
    size = 7
    class_ids = range(1, size + 1)

    # Assemble the matrix row by row from the flattened cm_<i>_<j> fields
    counts = np.array(
        [[row[f'cm_{r}_{c}'] for c in class_ids] for r in class_ids],
        dtype=int,
    )

    correct = np.trace(counts)
    return correct / counts.sum()

In [69]:
def calculate_kappa(row):
    """Return Cohen's kappa of the 7x7 confusion matrix stored in ``row``.

    ``row`` must support ``row['cm_<i>_<j>']`` for ``i, j`` in 1..7.
    Kappa is ``(p_o - p_e) / (1 - p_e)``, where ``p_o`` is the diagonal share
    of all samples and ``p_e`` is the sum over classes of
    (row share * column share).
    """
    size = 7
    class_ids = range(1, size + 1)

    # Assemble the matrix row by row from the flattened cm_<i>_<j> fields
    counts = np.array(
        [[row[f'cm_{r}_{c}'] for c in class_ids] for r in class_ids],
        dtype=int,
    )
    n_samples = counts.sum()

    # Observed agreement: share of samples on the diagonal
    p_observed = np.trace(counts) / n_samples

    # Agreement expected from the marginal distributions alone
    row_share = counts.sum(axis=1) / n_samples
    col_share = counts.sum(axis=0) / n_samples
    p_expected = (row_share * col_share).sum()

    return (p_observed - p_expected) / (1 - p_expected)

In [73]:
def calculate_f1(row):
    """Return the macro-averaged F1 score of the 7x7 confusion matrix in ``row``.

    ``row`` must support ``row['cm_<i>_<j>']`` for ``i, j`` in 1..7.
    For every class, precision is the diagonal cell divided by its column
    total and recall is the diagonal cell divided by its row total. A score
    whose denominator is not positive stays 0. The result is the unweighted
    mean of the seven per-class F1 scores.
    """
    size = 7
    class_ids = range(1, size + 1)

    # Assemble the matrix row by row from the flattened cm_<i>_<j> fields
    counts = np.array(
        [[row[f'cm_{r}_{c}'] for c in class_ids] for r in class_ids],
        dtype=int,
    )

    hits = np.diag(counts)            # tp per class
    column_totals = counts.sum(axis=0)  # tp + fp per class
    row_totals = counts.sum(axis=1)     # tp + fn per class

    # `where` leaves the pre-filled zeros untouched wherever the denominator
    # is not positive, which avoids any division by zero.
    precision = np.divide(hits, column_totals, out=np.zeros(size), where=column_totals > 0)
    recall = np.divide(hits, row_totals, out=np.zeros(size), where=row_totals > 0)

    pr_sum = precision + recall
    per_class_f1 = np.divide(
        2 * (precision * recall), pr_sum, out=np.zeros(size), where=pr_sum > 0
    )

    # Macro F1-score: every class counts equally
    return np.mean(per_class_f1)

In [74]:
# Add one metric column per function, computed row by row (axis=1).
# The later apply calls receive rows that already contain the newly added
# metric columns; the functions only read the cm_<i>_<j> fields.
df['accuracy'] = df.apply(calculate_accuracy, axis=1)
df['kappa'] = df.apply(calculate_kappa, axis=1)
df['fscore'] = df.apply(calculate_f1, axis=1)

In [75]:
# Show the first 20 rows of the table as plain text
print(df.head(20))

    cm_1_1  cm_1_2  cm_1_3  cm_1_4  cm_1_5  cm_1_6  cm_1_7  cm_2_1  cm_2_2  \
0      442       0      17       0       8       0      52       0       0   
1      343       0       4       0       0       0       4       0       6   
2      483       0      18       0       0       0      18       0       0   
3      474       0      19       0       0       0      26       0       0   
4      471       0      23       0       0       0      25       0       0   
5      177       0      51       0       4       0     287       0       0   
6      453       0      27       4       4       4      19       0       5   
7      327       0       8       0       0       0       1       0     146   
8      472       0      28       2       0       0       9       0       6   
9      442       0      36       4       4       0      25       0       3   
10     461       0      35       4       0       0      11       0       6   
11     339       2      46       2      66       0      56      

In [77]:
# Save the table without the index column.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
df.to_csv(r'C:\Users\MS\OneDrive\Studium\Geographie\Geo-BA\Data\Classification\Quality\Quality_7K_with_metrics.csv', index=False)

### Make a table of all the statistical parameters of the quality assessment

In [78]:
# Load the data in case you start here
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
df_qa = pd.read_csv(r'C:\Users\MS\OneDrive\Studium\Geographie\Geo-BA\Data\Classification\Quality\Quality_7K_with_metrics.csv', delimiter=',')

In [79]:
# Keep only the classifier label and the three quality metrics
df_qa = df_qa[['classifier', 'accuracy', 'kappa', 'fscore']]

In [80]:
# Show the reduced table as plain text
print(df_qa)

    classifier  accuracy     kappa    fscore
0         CART  0.855769  0.809793  0.692070
1         Cons  0.833000  0.800190  0.752974
2          GBT  0.924679  0.900694  0.707315
3          KNN  0.864316  0.820849  0.631196
4           RF  0.910791  0.882468  0.699373
..         ...       ...       ...       ...
487       Cons  0.901000  0.880937  0.756117
488        GBT  0.993341  0.991209  0.813071
489        KNN  0.960599  0.947966  0.762655
490         RF  0.991676  0.989009  0.798170
491        SVM  0.886238  0.849216  0.718471

[492 rows x 4 columns]


In [93]:
# Make a table of the data we care about.
# This uses df_qa, the frame loaded in this section, instead of `df` from the
# previous section, so the cell also works when the notebook is started here.

# One column per (metric, classifier) pair; every original row keeps its own
# index entry, so each value sits in the column of its classifier and the
# other columns of that row stay NaN.
pivot_df = df_qa.pivot_table(index=df_qa.index, columns='classifier', values=['accuracy', 'kappa', 'fscore'])

# Flatten the two-level column index into labels like "accuracy RF"
pivot_df.columns = [f'{metric} {classifier}' for metric, classifier in pivot_df.columns]

# Descriptive statistics per column (NaN cells are ignored by describe),
# transposed so that every metric/classifier pair becomes one row
quality_stats = pivot_df.describe().T

quality_stats = quality_stats.round(2)

print(quality_stats)

               count  mean   std   min   25%   50%   75%   max
accuracy CART   82.0  0.92  0.04  0.80  0.90  0.93  0.95  0.99
accuracy Cons   82.0  0.83  0.04  0.71  0.81  0.83  0.85  0.92
accuracy GBT    82.0  0.96  0.03  0.87  0.94  0.97  0.98  1.00
accuracy KNN    82.0  0.91  0.04  0.80  0.88  0.91  0.93  0.98
accuracy RF     82.0  0.95  0.03  0.86  0.93  0.96  0.97  1.00
accuracy SVM    82.0  0.86  0.08  0.58  0.80  0.86  0.92  0.98
fscore CART     82.0  0.83  0.06  0.66  0.81  0.83  0.86  0.94
fscore Cons     82.0  0.77  0.07  0.54  0.75  0.78  0.81  0.89
fscore GBT      82.0  0.88  0.06  0.70  0.84  0.90  0.93  0.98
fscore KNN      82.0  0.73  0.08  0.50  0.68  0.75  0.79  0.86
fscore RF       82.0  0.85  0.06  0.70  0.82  0.86  0.89  0.96
fscore SVM      82.0  0.77  0.09  0.44  0.71  0.79  0.84  0.93
kappa CART      82.0  0.90  0.05  0.74  0.87  0.91  0.94  0.99
kappa Cons      82.0  0.80  0.04  0.65  0.77  0.80  0.82  0.91
kappa GBT       82.0  0.95  0.04  0.84  0.92  0.96  0.9

In [94]:
# Export the summary table; the row labels are written as the first CSV column
# because index=False is not passed.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
quality_stats.to_csv(r'C:\Users\MS\OneDrive\Studium\Geographie\Geo-BA\Data\Classification\Quality\Quality_7K_statistics.csv')