In [217]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
import pandas as pd

In [218]:
# Load the pairwise comparison results; header=True is passed so the
# reader uses the file's header row for column names.
data = spark.read.csv(path='results_for_accuracy_calc.csv', header=True)

In [219]:
# Preview: bring only the first 10 rows to the driver as a pandas frame for display.
data.limit(10).toPandas()

Unnamed: 0,cpf_a,cpf_b,similarity
0,097.627.958-49,097.627.958-49,1
1,685.096.335-09,685.096.335-09,1
2,956.997.968-27,956.997.968-27,1
3,631.875.885-22,631.875.885-22,1
4,397.480.478-14,397.480.478-14,1
5,216.419.173-00,216.419.173-00,1
6,148.386.650-59,148.386.650-59,1
7,654.325.863-20,654.325.863-20,1
8,704.817.633-41,704.817.633-42,1
9,179.057.616-49,179.057.6-49,1


In [220]:
# Pairs whose similarity is at or above this cutoff are predicted as matches.
cutoff = 0.9

# Cast 'similarity' to double IN PLACE (withColumn with an existing name
# replaces that column) so that the sort and the cutoff comparison below are
# numeric. The previous version wrote the cast to a misspelled column, which
# left 'similarity' as a string and added a stray extra column.
data = data.withColumn('similarity', F.col('similarity').cast(DoubleType()))

# Sort and keep a single row per cpf_b.
# NOTE(review): dropDuplicates does not document which of the duplicate rows
# it keeps, so the preceding orderBy may not control the surviving row. If a
# specific row (e.g. the highest similarity) must be kept, a window with
# row_number() would make that explicit -- confirm the intended behaviour.
data = data.orderBy('similarity').dropDuplicates(['cpf_b'])

# Predicted label: '1' when similarity reaches the cutoff, otherwise '0'.
data = data.withColumn('match', F.when(F.col('similarity') >= cutoff, '1').otherwise('0'))

In [221]:
def inspect_pairs(cpf_a, cpf_b, match):
    """Classify one compared pair into a confusion-matrix category.

    Args:
        cpf_a: first identifier of the pair.
        cpf_b: second identifier of the pair.
        match: predicted label; only the string '1' counts as a predicted
            match, any other value is treated as a predicted non-match.

    Returns:
        "TP" when a match is predicted and the identifiers are equal,
        "FP" when a match is predicted and they differ,
        "FN" when no match is predicted but they are equal,
        "TN" when no match is predicted and they differ.
    """
    predicted_match = (match == '1')
    same_cpf = (cpf_a == cpf_b)

    if predicted_match:
        return "TP" if same_cpf else "FP"
    return "FN" if same_cpf else "TN"
# Wrap the Python classifier as a Spark UDF that returns a string column.
udf_inspect_pairs = F.udf(inspect_pairs, returnType=StringType())

In [222]:
# Label every pair with the category returned by the UDF (TP / FP / TN / FN).
data = data.withColumn(
    'perf',
    udf_inspect_pairs(F.col('cpf_a'), F.col('cpf_b'), F.col('match'))
)

## Measures
Accuracy = (TP + TN) / (FP + TP + FN + TN)

Positive predictive value (PPV) = TP / (TP + FP)

Negative predictive value (NPV) = TN / (TN + FN)

Sensitivity (True positive rate) = TP / (TP + FN)

Specificity (True negative rate) = TN / (TN + FP)

In [224]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is 0."""
    return float(numerator) / denominator if denominator else float('nan')


dic_results = {}

# Count all four categories in a single Spark job. Separate filter().count()
# calls would each recompute the full lineage (sort, dedup and Python UDF).
perf_counts = {row['perf']: row['count'] for row in data.groupBy('perf').count().collect()}

# A category with no rows is absent from the aggregation, so default to 0.
TP = perf_counts.get("TP", 0)
TN = perf_counts.get("TN", 0)
FP = perf_counts.get("FP", 0)
FN = perf_counts.get("FN", 0)

# Each measure is NaN (instead of raising ZeroDivisionError) when undefined.
dic_results['accuracy'] = _safe_ratio(TP + TN, FP + TP + FN + TN)
dic_results['ppv'] = _safe_ratio(TP, TP + FP)
dic_results['npv'] = _safe_ratio(TN, TN + FN)
dic_results['sens'] = _safe_ratio(TP, TP + FN)
dic_results['spec'] = _safe_ratio(TN, TN + FP)

In [225]:
# Collect the scalar metrics into a one-row pandas table (row label 0).
final_results = pd.DataFrame(data=dic_results, index=[0])

In [226]:
# Display the one-row metrics table.
final_results

Unnamed: 0,accuracy,npv,ppv,sens,spec
0,0.51049,0.403509,0.931034,0.284211,0.958333
