## Cleaning entity extractions from Transparency Register

#### Checking size, uniqueness etc.

In [1]:

import pandas as pd
import os
import sys

parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)

from utils import db_interaction

In [2]:
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import declarative_base, sessionmaker
import logging

# Initialize the base class for declarative class definitions
Base = declarative_base()

# Set up the engine and session
engine = create_engine('sqlite:///../db/organisations.db')
Base.metadata.create_all(engine)  # Create the table if it does not exist

Session = sessionmaker(bind=engine)
session = Session()

### Examining Names & Abbreviations

In [3]:
import pandas as pd
from utils import db_interaction

# Query all records from the OrganisationModel table
results_all_entities = session.query(db_interaction.OrganisationModel).all()

# Convert the query result into a list of dictionaries
data = [
    {
        'id': entity.id,
        'custom_id': entity.custom_id,
        'name': entity.name,
        'url': entity.url,
        'abbreviation': entity.abbreviation
    }
    for entity in results_all_entities
]

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)


### Comparing with older and newer Transparency registers

In [14]:
print(f'there exists information for {len(df.custom_id.unique())} unique organisations from the transparency register ')
print(f'there are {len(df.name.unique())} unique names in the database')
print(f'there are {len(df.url.unique())} unique urls in the database')
print(f'there are {len(df.abbreviation.unique())} unique abbreviations in the database')

there exists information for 9744 unique organisations from the transparency register 
there are 23225 unique names in the database
there are 10159 unique urls in the database
there are 7131 unique abbreviations in the database


### Cleaning
**steps**
1. decide on cases where name and abbreviation is the same
- if multiple words in upper = name , else abbreviation ?
2. exclude entires which are the querried organisations
3. create identifier using fingerprint library ?
4. consolidate entires (how) ?

**exclude extracted entires which are the same as the name of the querried organisation**

In [49]:
path_full_data ="../datasets/inputs/organisations_in_transparency_register_jan_23.xlsx"

df_full = pd.read_excel(path_full_data,sheet_name='LIST_REGISTRED_ORGANISATION' )

df_full.rename(columns={'Is member of: List of associations, (con)federations, networks or other bodies of which the organisation is a member':'member_of',
                               'Identification code':'id' },
                        inplace=True)

In [79]:
df['id'] = df['id'].astype(str)
df_full['id'] = df_full['id'].astype(str)

# Perform the merge to get the 320 matching entries
matching_entries = pd.merge(df, df_full[['id', 'Name']], how='inner', left_on=['custom_id', 'name'], right_on=['id', 'Name'])

# Exclude these entries from df
df_excluded = df[~df[['custom_id', 'name']].apply(tuple, axis=1).isin(matching_entries[['custom_id', 'name']].apply(tuple, axis=1))]
df_excluded

Unnamed: 0,id,custom_id,name,url,abbreviation
0,1,875248845569-64,COMITÉ CONSULTIVO CC SUR,https://cc-sud.eu/index.php/es/,CC SUR
1,2,875248845569-64,ASOCIACIÓN DE ORGANIZACIONES DE PRODUCTORES DE...,,OPES Cantábrico
2,3,875248845569-64,CONSEJO CONSULTIVO PARA LAS ESPECIES PELÁGICAS,,PELAC
3,4,805341845171-02,DR4EU,https://dr4eu.org/,
4,5,513518246200-77,International Investment Funds Association,https://iifa.ca/page/members_poland,
...,...,...,...,...,...
39057,39058,855530342287-21,France Digitale,,
39058,39059,855530342287-21,HUB FRANCE IA,,
39059,39060,855530342287-21,Comité Stratégique de Filière des Industries d...,,France
39060,39061,855530342287-21,Sustainable Digital Infrastructure Alliance,,SDIA


**decide on abbreviation & name and convert small abbreviation to upper**

In [82]:

# Filter rows where name and abbreviation are the same and uppercase
#filtered_df_upper = df_excluded[(df_excluded['name'] == df_excluded['abbreviation']) & (df_excluded['abbreviation'].str.isupper())]

df_excluded.loc[(df_excluded['name'] == df_excluded['abbreviation']) & (df_excluded['abbreviation'].str.islower()), 'abbreviation'] = df_excluded['abbreviation'].str.upper()


In [88]:
# Define the function to classify the entries
# Define the function to classify the entries
def classify_name_abbreviation(row):
    # If either 'name' or 'abbreviation' is None, return None
    if row['name'] is None or row['abbreviation'] is None:
        return None
    
    # Check if abbreviation is specifically "BUSINESSEUROPE"
    if row['abbreviation'] == 'BUSINESSEUROPE':
        return 'name'
    
    # Check if name and abbreviation are the same
    if row['name'] == row['abbreviation']:
        # If there are spaces, classify as "name"
        if ' ' in row['abbreviation']:
            return 'name'
        else:
            return 'abbreviation'
    
    # If name and abbreviation are not the same, return None
    return None

# Apply the function to create the new 'classification' column
df_excluded['classification'] = df_excluded.apply(classify_name_abbreviation, axis=1)

# Display the DataFrame with the new classification column
df_excluded



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_excluded['classification'] = df_excluded.apply(classify_name_abbreviation, axis=1)


Unnamed: 0,id,custom_id,name,url,abbreviation,classification
0,1,875248845569-64,COMITÉ CONSULTIVO CC SUR,https://cc-sud.eu/index.php/es/,CC SUR,
1,2,875248845569-64,ASOCIACIÓN DE ORGANIZACIONES DE PRODUCTORES DE...,,OPES Cantábrico,
2,3,875248845569-64,CONSEJO CONSULTIVO PARA LAS ESPECIES PELÁGICAS,,PELAC,
3,4,805341845171-02,DR4EU,https://dr4eu.org/,,
4,5,513518246200-77,International Investment Funds Association,https://iifa.ca/page/members_poland,,
...,...,...,...,...,...,...
39057,39058,855530342287-21,France Digitale,,,
39058,39059,855530342287-21,HUB FRANCE IA,,,
39059,39060,855530342287-21,Comité Stratégique de Filière des Industries d...,,France,
39060,39061,855530342287-21,Sustainable Digital Infrastructure Alliance,,SDIA,


**generate fingerprints**

In [89]:
import fingerprints

def fp_func(text):
    fp = fingerprints.generate(text)
    return fp

In [92]:

df_excluded['fp_name'] = df_excluded.name.apply(fp_func)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_excluded['fp_name'] = df_excluded.name.apply(fp_func)


In [93]:
len(df_excluded['fp_name'].unique())

21458