### evaluation
- compare the extracted entities to a hand-coded dataset of 200 rows

In [1]:
import pandas as pd
import os
import sys

# Put the directory above the current working directory on the import path;
# presumably this is where the `utils` package imported below lives (depends on
# the notebook being started from its own folder).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)

from utils import db_interaction

In [2]:
# Load the cleaned extraction output, keeping only the columns used in the evaluation
df = pd.read_csv("../datasets/cleaned_extraced_data.csv").loc[
    :, ['custom_id', 'name', 'url', 'abbreviation']
]

In [3]:
# Load the hand-coded reference sheet ('training_solved').
path_training_data = "../datasets/inputs/cleaned_network_data.xlsx"
df_training = (
    pd.read_excel(path_training_data, sheet_name='training_solved')
    .rename(columns={'Identification code': 'id'})
    # Deduplicate BEFORE numbering: once the per-id running counter exists, two
    # otherwise identical rows differ in `nr`, so drop_duplicates() would never
    # remove anything. This also mirrors how the extracted data is prepared.
    .drop_duplicates()
    # 1-based position of each row within its id; compare_data() pairs reference
    # and extracted rows on (id, nr).
    .assign(nr=lambda frame: frame.groupby('id').cumcount() + 1)
)

# Distinct ids covered by the reference set, used to filter the extracted data
df_training_list = df_training['id'].drop_duplicates().to_list()


In [4]:
# Keep only extracted rows whose id is in the reference set, drop repeated
# (custom_id, name) pairs, then number the remaining rows within each custom_id.
df_evaluation = (
    df.loc[df['custom_id'].isin(df_training_list)]
    .drop_duplicates(subset=['custom_id', 'name'])
    .assign(nr=lambda frame: frame.groupby('custom_id').cumcount() + 1)
)



In [5]:
# Keep non-null cells as they are; null cells get `None` as the replacement value
df_evaluation = df_evaluation.where(df_evaluation.notna(), None)
df_training = df_training.where(df_training.notna(), None)

In [6]:
# Spot-check: show the extracted rows for a single id.
# Only the last expression of a cell is rendered, so this cell contains just the
# one lookup whose result is meant to be displayed.
df_evaluation.loc[df_evaluation['custom_id'] == "011816043546-53"]

Unnamed: 0,custom_id,name,url,abbreviation,nr
214,011816043546-53,International Federation of Freight Forwarders...,www.fiata.com,FIATA,1
215,011816043546-53,"European Association for Forwarding, Transport...",www.clecat.org,CLECAT,2


In [8]:
from fuzzywuzzy import fuzz
import pandas as pd

def _lower_if_str(value):
    """Return ``value`` lowercased when it is a ``str``; otherwise return it unchanged."""
    return value.lower() if isinstance(value, str) else value


def preprocess_df(df):
    """Return a frame in which every string cell of the text columns is lowercased.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns. In object/string-typed columns each ``str``
        value is lowercased; missing values and non-string values (e.g. numbers in
        a mixed column) are passed through untouched. Other columns are returned
        as they are.
    """
    def _normalise(col):
        dtype = col.dtype
        # Cover both plain object columns and dedicated string dtypes. Lowercasing
        # value by value (instead of the `.str` accessor) keeps non-string cells
        # intact rather than turning them into NaN or raising.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            return col.map(_lower_if_str)
        return col

    return df.apply(_normalise)

# (result label, column in the extracted frame, column in the reference frame)
_COMPARED_FIELDS = (
    ('Name', 'name', 'Network Name'),
    ('URL', 'url', 'URL'),
    ('Abbreviation', 'abbreviation', 'Abbreviation'),
)

# Fixed column layout, so that an empty comparison still yields every column
_RESULT_COLUMNS = ['id'] + [
    template.format(label)
    for label, _, _ in _COMPARED_FIELDS
    for template in ('Extracted {}', 'Test {}', '{} Match')
]


def _values_match(extracted_value, reference_value, threshold):
    """Return True when both values are missing or their fuzzy ratio reaches ``threshold``."""
    extracted_missing = pd.isna(extracted_value)
    reference_missing = pd.isna(reference_value)
    if extracted_missing or reference_missing:
        # A value present on only one side is a mismatch; missing on both sides agrees.
        return bool(extracted_missing and reference_missing)
    # str() is a no-op for strings and guards against non-string cells.
    return bool(fuzz.ratio(str(extracted_value), str(reference_value)) >= threshold)


def compare_data(extracted_df, test_data, threshold=50):
    """Compare extracted rows with reference rows field by field.

    Both frames are first passed through ``preprocess_df``. Each row of
    ``extracted_df`` is paired with the first row of ``test_data`` whose ``id``
    equals the row's ``custom_id`` and whose ``nr`` equals the row's ``nr``.

    Parameters
    ----------
    extracted_df : pandas.DataFrame
        Needs the columns ``custom_id``, ``nr``, ``name``, ``url``, ``abbreviation``.
    test_data : pandas.DataFrame
        Needs the columns ``id``, ``nr``, ``Network Name``, ``URL``, ``Abbreviation``.
    threshold : int, default 50
        Minimum ``fuzz.ratio`` score for two present values to count as a match.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``extracted_df`` with the columns ``id`` and, for each
        of Name / URL / Abbreviation, ``Extracted <field>``, ``Test <field>`` and
        ``<field> Match``. Rows without a reference counterpart get ``None`` test
        values and ``False`` for every match flag. The columns are present even
        when ``extracted_df`` has no rows.
    """
    extracted_df = preprocess_df(extracted_df)
    test_data = preprocess_df(test_data)

    results = []
    for _, row in extracted_df.iterrows():
        candidates = test_data[
            (test_data['id'] == row['custom_id']) & (test_data['nr'] == row['nr'])
        ]
        test_row = None if candidates.empty else candidates.iloc[0]

        record = {'id': row['custom_id']}
        for label, extracted_col, reference_col in _COMPARED_FIELDS:
            extracted_value = row[extracted_col]
            record[f'Extracted {label}'] = extracted_value
            if test_row is None:
                # No reference row for this (id, nr): nothing to compare against
                record[f'Test {label}'] = None
                record[f'{label} Match'] = False
            else:
                reference_value = test_row[reference_col]
                record[f'Test {label}'] = reference_value
                record[f'{label} Match'] = _values_match(
                    extracted_value, reference_value, threshold
                )
        results.append(record)

    return pd.DataFrame(results, columns=_RESULT_COLUMNS)



In [9]:
# Score every extracted row against its counterpart in the reference data
comparison_results = compare_data(extracted_df=df_evaluation, test_data=df_training)

In [10]:
# For each match flag, print how many rows are True/False and their share in percent
for column in ('Name Match', 'URL Match', 'Abbreviation Match'):
    counts = comparison_results[column].value_counts()
    percentages = counts.div(counts.sum()).mul(100)
    print(f"\n{column} Stats:")
    print(
        pd.DataFrame({
            'Count': counts,
            'Percentage': percentages.round(2),
        })
    )


Name Match Stats:
            Count  Percentage
Name Match                   
True          145       77.54
False          42       22.46

URL Match Stats:
           Count  Percentage
URL Match                   
True         183       97.86
False          4        2.14

Abbreviation Match Stats:
                    Count  Percentage
Abbreviation Match                   
True                  146       78.07
False                  41       21.93


In [11]:
# When the extracted name is exactly the reference abbreviation, count both the
# name and the abbreviation as matched.
# Both sides must be present: a missing name next to a missing abbreviation is
# not an exact match and must not flip the flags to True.
# NOTE(review): the abbreviation override is keyed on 'Extracted Name', not on
# 'Extracted Abbreviation' — confirm this leniency is intended.
name_is_abbreviation = (
    comparison_results['Extracted Name'].notna()
    & comparison_results['Test Abbreviation'].notna()
    & (comparison_results['Extracted Name'] == comparison_results['Test Abbreviation'])
)

comparison_results['Name Match'] = (
    comparison_results['Name Match'].astype(bool) | name_is_abbreviation
)
comparison_results['Abbreviation Match'] = (
    comparison_results['Abbreviation Match'].astype(bool) | name_is_abbreviation
)

In [12]:
# Initialize an empty list to store the statistics
stats_data = []

# Calculate counts and percentages for each match type
for column in ['Name Match', 'URL Match', 'Abbreviation Match']:
    counts = comparison_results[column].value_counts()
    percentages = (counts / counts.sum()) * 100
    
    # Append the stats to the list as dictionaries
    for value, count in counts.items():
        stats_data.append({
            'Match Type': column,
            'Value': value,  # True or False
            'Count': count,
            'Percentage': round(percentages[value], 2)
        })

# Convert the list of dictionaries to a DataFrame
match_stats_df = pd.DataFrame(stats_data)

# Display the DataFrame
print(match_stats_df)
match_stats_df.to_clipboard()

           Match Type  Value  Count  Percentage
0          Name Match   True    169       90.37
1          Name Match  False     18        9.63
2           URL Match   True    183       97.86
3           URL Match  False      4        2.14
4  Abbreviation Match   True    165       88.24
5  Abbreviation Match  False     22       11.76


### remaining issues:

In [13]:
# Rows whose 'Name Match' flag is False
comparison_results.loc[comparison_results['Name Match'].eq(False)]

Unnamed: 0,id,Extracted Name,Test Name,Name Match,Extracted URL,Test URL,URL Match,Extracted Abbreviation,Test Abbreviation,Abbreviation Match
20,266271142555-68,the u.s. taiwan business council,,False,,,True,,sia,False
35,689094544352-49,deutscher naturschutzring,,False,https://www.dnr.de/der-dnr/mitglieder/?l=404#s...,https://www.dnr.de/der-dnr/mitglieder/?l=404#s...,True,,dnr,False
37,689094544352-49,european environmental bureau,,False,https://eeb.org/membership/our-members/,https://eeb.org/membership/our-members/,True,eeb,eeb,True
53,359925647207-49,platformie good food good farming,koalicja żywa ziemia,False,,,True,,,True
56,56047191389-84,center for european policy,amcham eu,False,,,True,ceps,,False
57,56047191389-84,european policy center,britcham,False,,,True,epc,,False
58,56047191389-84,american european community association,,False,,,True,aeca,epaca,True
61,983289442547-36,efpia,eu-republic of korea civil society forum,False,https://www.efpia.eu/,,False,,,True
84,962348443419-15,asd,,False,,,True,asd,confindustria,False
127,7829357255-35,české podnikatelské reprezentace při eu,hospodářská komora české republiky,False,www.cebre.cz,,False,cebre,,False


### Evaluation Conclusion:
- The results of the extraction LLM can still be improved in several ways. One issue concerns the correct extraction of, and distinction between, organisation names and abbreviations.