In [5]:
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
from collections import Counter

directory_path = '/drive2/kaggle/pii-dd/piidd/training/basic'
directory = Path(directory_path)

# Using rglob to find all CSV files recursively
csv_files = list(directory.rglob('*.csv'))


fp_info = set()

unq_doc_token = set()

fp_docs = set()
fp_counter = Counter()

# Printing the paths of found CSV files
for csv_file in tqdm(csv_files):
    df = pd.read_csv(csv_file, engine="python")

    if "cm" in df.columns:
        FP = (df["cm"].isin({"FP", "FNFP"})).sum()
        FN = (df["cm"].isin({"FN", "FNFP"})).sum()
        TP = (df["cm"] == "TP").sum()   
        beta = 5
        s_micro = (1 + (beta**2)) * TP / (((1 + (beta**2)) * TP) + ((beta**2) * FN) + FP)

        if s_micro > 0.9:
            fp_docs.update(df.loc[df["cm"].isin({"FP", "FNFP"}), "document"].tolist())
            temp = [(w, x,y,z) for w, x,y,z in df.loc[df["cm"].isin({"FP", "FNFP"}), ["label_pred", "token", "document", "token_text_pred"]].values] 

            unq_doc_token.update([(y, z) for (w, x,y,z) in temp])
            fp_counter.update(set([(y, z) for (w, x,y,z) in temp]))
            fp_info.update(temp)

  0%|          | 0/811 [00:00<?, ?it/s]

In [11]:
fn_info = set()

unq_doc_token = set()

fn_docs = set()
fn_counter = Counter()

# Printing the paths of found CSV files
for csv_file in tqdm(csv_files):
    df = pd.read_csv(csv_file, engine="python")

    if "cm" in df.columns:
        FP = (df["cm"].isin({"FP", "FNFP"})).sum()
        FN = (df["cm"].isin({"FN", "FNFP"})).sum()
        TP = (df["cm"] == "TP").sum()   
        beta = 5
        s_micro = (1 + (beta**2)) * TP / (((1 + (beta**2)) * TP) + ((beta**2) * FN) + FP)

        if s_micro > 0.9:
            fn_docs.update(df.loc[df["cm"].isin({"FN"}), "document"].tolist())
            temp = [(w, x,y,z) for w, x,y,z in df.loc[df["cm"].isin({"FN"}), ["label_gt", "token", "document", "token_text_gt"]].values] 

            unq_doc_token.update([(y, z) for (w, x,y,z) in temp])
            fn_counter.update(set([(y, z) for (w, x,y,z) in temp]))
            fn_info.update(temp)

  0%|          | 0/811 [00:00<?, ?it/s]

In [12]:
len(fp_docs), len(fn_docs)

(905, 294)

In [13]:
fp_counter

Counter({(10376, 'Gamal'): 113,
         (10376, 'Simona'): 110,
         (11535, '\u200b'): 107,
         (12565, '\u200b'): 102,
         (3915, '\u200b'): 102,
         (11208, 'Glen'): 100,
         (4600, 'Yomna'): 100,
         (11208, 'Styles'): 98,
         (4600, 'Isabel'): 98,
         (3427, 'Katia'): 97,
         (3427, 'Rania'): 97,
         (2915, 'Carolina'): 96,
         (8302, 'Coyle'): 96,
         (4717, '221910309005'): 94,
         (4717, '221910309006'): 94,
         (11376, 'Javier'): 94,
         (4278, '\u200b'): 93,
         (6611, '\u200b'): 93,
         (9854, 'vpi@mn.nl'): 93,
         (10693, 'Jeremy'): 92,
         (5001, 'Rya'): 88,
         (10220, 'Maria'): 87,
         (7993, 'Ortega'): 87,
         (19280, '30407059'): 87,
         (8344, 'Janet'): 86,
         (8344, 'Galletti'): 86,
         (10251, '\u200b'): 86,
         (8302, 'Sibanda'): 86,
         (12803, 'Nickie'): 86,
         (12120, 'Madrid'): 85,
         (3894, '\u200b'): 85,
         

In [14]:
fn_counter

Counter({(3241, 'Rodriguez'): 119,
         (6117, 'Sharma'): 106,
         (6117, 'Hari'): 104,
         (9911, 'Mpanza'): 104,
         (9911, 'Cobus'): 103,
         (7779, 'Leroy'): 101,
         (10517, 'ras21'): 97,
         (12077, 'Daniel'): 95,
         (9854, '\n'): 95,
         (4351, 'Nazri'): 92,
         (7779, 'Sullivan'): 91,
         (12585, 'Art'): 88,
         (12585, 'Street'): 88,
         (4351, 'Marta'): 88,
         (6148, 'Christian'): 88,
         (9674, 'Princess'): 87,
         (4351, 'Sin'): 87,
         (12077, 'Kate'): 86,
         (12330, 'Miriam'): 86,
         (4351, 'Ivan'): 85,
         (3885, 'Irina'): 84,
         (4161, 'Ashley'): 83,
         (8798, 'Roberta'): 82,
         (12512, 'Jordi'): 82,
         (9324, 'Menna'): 81,
         (8747, 'Nandan'): 81,
         (11208, 'Suarez'): 81,
         (20984, 'V69230'): 78,
         (3885, 'Luca'): 75,
         (9920, 'Muhammad'): 75,
         (9286, 'Prakash'): 75,
         (4922, 'Tam'): 74,
        

In [15]:
fp_df = pd.DataFrame(fp_info, columns=["label_pred", "token", "document", "token_text_pred"])
fp_df

Unnamed: 0,label_pred,token,document,token_text_pred
0,I-NAME_STUDENT,352,20449,Riley
1,I-NAME_STUDENT,566,15749,Rosas
2,B-NAME_STUDENT,614,12267,BP
3,I-USERNAME,255,18270,​
4,B-NAME_STUDENT,805,11376,Fabio
...,...,...,...,...
3703,B-NAME_STUDENT,0,21538,Alex
3704,B-URL_PERSONAL,261,9117,https://otter.ai
3705,I-STREET_ADDRESS,8,121,","
3706,B-NAME_STUDENT,921,21316,Mike
