In [None]:
#Installing libraries if necessary
!pip install thefuzz nameparser spacy
!python -m spacy download pt_core_news_sm

Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting nameparser
  Downloading nameparser-1.1.3-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading nameparser-1.1.3-py2.py3-none-any.whl (24 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nameparser, rapidfuzz, thefuzz
Successfully installed nameparser-1.1.3 rapidfuzz-3.10.1 thefuzz-0.22.1
Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━

In [None]:
#Library for data manipulation and analysis
import pandas as pd

#Standard library for working with regular expressions
import re

#Importing the fuzz and process functions - this library is used to compare and match strings based on similarity
from thefuzz import fuzz, process

#Imports the HumanName class from the nameparser library
from nameparser import HumanName

#Natural language processing (NLP) library
import spacy

In [None]:
#Load the spaCy Portuguese language model (installed by the setup cell above)
#NOTE(review): `nlp` is not referenced by any other cell visible in this notebook - confirm it is still needed
nlp = spacy.load("pt_core_news_sm")

In [None]:
#Mounting Google Drive to access files
#(google.colab is only importable inside a Google Colab runtime)
from google.colab import drive
#Makes the Drive contents available under /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Loading test data: the collector names spreadsheet stored in the mounted Drive folder
df = pd.read_excel(
    "/content/drive/MyDrive/Colab Notebooks/collector_names.xls"
)

In [None]:
#Confirming the dimensions of the dataset as (rows, columns)
print(df.shape)

(184, 2)


In [None]:
#Previewing the first 20 rows of the dataset
df.head(20)

Unnamed: 0,number,collector
0,1,A.Costa. G. Martinelli. Leitman. E.C. Dalcin. ...
1,2,André M. de Carvalho & G. Martinelli
2,3,g mARTINELLI
3,4,G.martinelli
4,5,G. martinelli
5,6,GMartinelli
6,7,G Martinelli
7,8,G .Martinelli
8,9,G . Martinelli
9,10,G Martinelli


In [None]:
#Function to create a mapping dictionary based on the desired format
def create_mapping_dictionary(
    dataframe, column, keyword, target_name, threshold=80
):
    """
    Creates a mapping dictionary to standardize collector names containing a specific keyword.

    Every unique, non-null string in ``dataframe[column]`` is reduced to its
    first collector, i.e. the text before the first " e ", " & " or ";"
    separator. When the keyword occurs (case-insensitively) in that first
    collector, the full original string is mapped to ``target_name``.

      :param dataframe: DataFrame containing the collector names.
      :param column: Name of the column containing the collector names.
      :param keyword: Keyword to identify the collector to be standardized (e.g., "Forzza").
          An empty or whitespace-only keyword yields an empty mapping, because
          it would otherwise match every name.
      :param target_name: Desired standardized name (e.g., "R.C. Forzza").
      :param threshold: Currently unused. Matching here is a plain substring
          test, not a fuzzy comparison; the parameter is kept so existing
          callers keep working.
      :return: Mapping dictionary ``{original collector string: target_name}``.
    """
    needle = keyword.lower()
    if not needle.strip():
        # "" is a substring of every string, so it would map every name
        return {}

    # " e " / " & " need surrounding whitespace (so an "e" inside a word is
    # never a separator); ";" separates with or without surrounding whitespace.
    separator = re.compile(r'\s+(?:e|&)\s+|\s*;\s*')

    mapping_dict = {}
    for name in dataframe[column].dropna().unique():
        # dropna() removes missing values only; numbers or other non-text
        # cells would make the regex split raise TypeError, so skip them.
        if not isinstance(name, str):
            continue

        # Extract the first collector
        first_collector = separator.split(name, maxsplit=1)[0].strip().lower()

        # Check if the keyword is in the first collector
        if needle in first_collector:
            mapping_dict[name] = target_name
    return mapping_dict

In [None]:
#Function to standardize collector names
def standardize_collector_name(name, mapping, threshold = 80):
    """
    Standardizes the collector name using a mapping dictionary and fuzzy matching.

    Resolution order:
      1. exact lookup of the lower-cased first collector in ``mapping``;
      2. exact lookup of the lower-cased full string in ``mapping``;
      3. fuzzy match (token sort ratio) of the first collector against the
         mapping keys, accepted when the score reaches ``threshold``;
      4. fallback: "<first> <last>" as parsed by nameparser's HumanName.

      :param name: Original collector name. Non-string values (e.g. NaN/None
          from missing cells) are returned unchanged.
      :param mapping: Mapping dictionary for standardization, keyed by
          lower-cased collector strings.
      :param threshold: Minimum similarity to consider matches (default = 80).
      :return: Standardized collector name.
    """
    # Missing cells arrive as NaN/None when this is applied over a column;
    # the regex split below would raise TypeError on them.
    if not isinstance(name, str):
        return name

    # Extract the first collector: " e " / " & " need surrounding whitespace,
    # ";" separates with or without it.
    first_collector = re.split(
        r'\s+(?:e|&)\s+|\s*;\s*', name, maxsplit=1
    )[0].strip()
    first_collector_lower = first_collector.lower()

    # Direct check in the mapping
    if first_collector_lower in mapping:
        return mapping[first_collector_lower]

    # The mapping may be keyed by the complete collector string rather than by
    # the first collector alone, so try the full name as well.
    full_name_lower = name.lower()
    if full_name_lower in mapping:
        return mapping[full_name_lower]

    # If not found, use fuzzy matching. extractOne returns None when there is
    # nothing to compare against, so guard before unpacking.
    if mapping:
        match = process.extractOne(
            first_collector_lower,
            list(mapping.keys()),
            scorer=fuzz.token_sort_ratio
        )
        if match is not None:
            best_match, score = match
            if score >= threshold:
                return mapping[best_match]

    # Fallback with nameparser
    # NOTE(review): comma-separated collector lists are not split above, so
    # HumanName may read them as "Last, First" - confirm whether commas
    # should also be treated as collector separators.
    parsed_name = HumanName(first_collector)
    return f"{parsed_name.first} {parsed_name.last}".strip()

In [None]:
#Example usage - specify the keyword to search for and the desired standardized name
#Alternative example (swap with the two assignments below to standardize another collector):
#keyword = "Forzza"
#target_name = "R.C. Forzza"
#keyword is matched case-insensitively against the first collector of each record
keyword = "Martinelli"
#target_name is the spelling every matching record is mapped to
target_name = "G. Martinelli"

In [None]:
#Create the mapping dictionary: each original collector string whose first
#collector contains the keyword is mapped to target_name
mapping_dict = create_mapping_dictionary(df, 'collector', keyword, target_name)

In [None]:
#Lower-case the keys of the mapping dictionary for case-insensitive lookup
#(despite the variable name nothing is inverted: keys are still the original
#collector strings, values are still the standardized name)
inverted_mapping = {var.lower(): standard for var, standard in mapping_dict.items()}

In [None]:
#Standardize every collector entry and store the result in a new column;
#the lower-cased mapping is forwarded as the function's second argument
df['standardized_name'] = df['collector'].apply(
    standardize_collector_name, args=(inverted_mapping,)
)

In [None]:
# Display the results: original collector strings next to their standardized_name
df

Unnamed: 0,number,collector,standardized_name
0,1,A.Costa. G. Martinelli. Leitman. E.C. Dalcin. ...,G. Martinelli
1,2,André M. de Carvalho & G. Martinelli,André de Carvalho
2,3,g mARTINELLI,G. Martinelli
3,4,G.martinelli,G. Martinelli
4,5,G. martinelli,G. Martinelli
...,...,...,...
179,180,R. Mello-Silva & R. C. Forzza,R. Mello-Silva
180,181,R. Mello-Silva & R.C. Forzza,R. Mello-Silva
181,182,R. Mello-Silva,R. Mello-Silva
182,183,"R. Mello-Silva, R. C. Forzza, L. G. Temponi,",R. R. Mello-Silva


In [None]:
#Save the DataFrame to a CSV file in the mounted Drive folder, without the row index
df.to_csv("/content/drive/MyDrive/Colab Notebooks/padronizado.csv", index=False)