In [1]:
import pandas as pd
import re
from Levenshtein import distance as lev_distance

In [2]:
def normalize_string(s):
    """Normalize a value so it can be compared with fuzzy string matching.

    Strings are lowercased and stripped of every character that is not a
    letter or a digit (whitespace, punctuation and underscores are removed).
    Any non-string input (NaN, None, numbers, ...) is treated as missing
    data and yields an empty string.

    Parameters:
    - s: Value to normalize; usually a cell taken from a DataFrame.

    Returns:
    - str: The normalized text, or '' when `s` is not a string.
    """
    if not isinstance(s, str):
        return ''  # Missing / non-text data compares as "nothing"
    # The underscore is listed explicitly: it counts as a word character,
    # so the \W class on its own would leave it in place.
    return re.sub(r'[\W_]+', '', s.lower())

### String Normalization Utility

This script sets up basic tools for data cleaning:
- **pandas** for handling datasets
- **re** for regular expression-based text cleaning
- **Levenshtein distance** for fuzzy string comparison

It includes a helper function `normalize_string()` that:
- Lowercases text
- Removes non-alphanumeric characters
- Handles missing values gracefully


In [3]:
def levenshtein_to_percentage(str1, str2):
    """
    Convert Levenshtein distance to a percentage similarity score.
    100% means an exact match, 0% means completely different.

    The edit distance is scaled by the length of the longer string:
    (1 - distance / max_len) * 100.

    Parameters:
    - str1, str2: The two strings to compare.

    Returns:
    - float: Similarity between 0 and 100. Two empty strings are treated as
      identical and score 100.0.
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Both strings are empty: they are identical, and dividing by the
        # longer length would raise ZeroDivisionError.
        return 100.0
    lev_dist = lev_distance(str1, str2)
    return (1 - lev_dist / max_len) * 100

### Levenshtein Distance to Percentage Similarity

This helper function `levenshtein_to_percentage()` compares two strings and returns a **similarity score** expressed as a percentage:
- **100%** → identical strings
- **0%** → completely different strings

It calculates the number of edits needed to transform one string into the other (using Levenshtein distance) and then scales that difference relative to the length of the longer string.

Example:
- `"PainScore"` vs. `"Pain_Score"` ➔ 90% similar
- `"Pain"` vs. `"Pain"` ➔ 100% similar
- `"Pain"` vs. `"Gain"` ➔ 75% similar


In [4]:
def compare_encodings(study_file, encoding_column='encodings', field_label_column='field_label', cde_file='./KnowledgeBase/Compiled_CORE_CDEs list_English_one sheet_as of 2025-01-28.xlsx', study_sheet='Sheet1'):
    """
    Compare study data dictionary encodings and field labels with HEAL CDE encodings using Levenshtein distance.

    For every study row that has both an encoding and a field label, the text
    "<field label> | <encoding>" is normalized and scored against the same
    combination ("<question text> | <PV description>") of every HEAL CDE row.
    The highest-scoring CDE is recorded as the best match, and the next two
    distinct CDE variable names are recorded as potential matches 2 and 3.

    Parameters:
    - study_file: Path to the study data dictionary file (.xlsx or .csv).
    - encoding_column: Name of the column containing encodings in the study file.
    - field_label_column: Name of the column containing field labels (questions) in the study file.
    - cde_file: Path to the HEAL CDE knowledge base file (.xlsx). It must have a
      sheet named 'ALL' with the columns 'PV Description',
      'Additional Notes (Question Text)', 'Variable Name' and 'CRF Name'.
    - study_sheet: Name of the sheet in the study file to process (default is 'Sheet1').
      Only used when the study file is an Excel workbook.

    Returns:
    - DataFrame: Original study data plus a 'Normalized Combined' helper column
      and the 'Best Match ...' / 'Potential Match 2 - ...' / 'Potential Match 3 - ...'
      result columns (CDE Name, Score, CRF Name for each).

    Raises:
    - KeyError: If encoding_column or field_label_column is not in the study file.

    Side effects:
    - Writes the result to "<study_file without extension>_vlmd_cdesearch.xlsx"
      and prints the output path.
    """

    # Load study data; Excel workbooks are detected case-insensitively,
    # everything else is read as CSV.
    if study_file.lower().endswith(('.xlsx', '.xlsm', '.xls')):
        study_df = pd.read_excel(study_file, sheet_name=study_sheet)
    else:
        study_df = pd.read_csv(study_file)

    # Fail early with a clear message instead of deep inside the row loop
    missing_columns = [
        col for col in (encoding_column, field_label_column)
        if col not in study_df.columns
    ]
    if missing_columns:
        raise KeyError(f"Column(s) not found in study file: {missing_columns}")

    # Initialize the result columns (best match + two alternatives)
    for prefix in ('Best Match', 'Potential Match 2 -', 'Potential Match 3 -'):
        for suffix in ('CDE Name', 'Score', 'CRF Name'):
            study_df[f'{prefix} {suffix}'] = None

    def _combine(encoding, label):
        # Rows where either part is blank/NaN get '' and are skipped later
        if pd.isna(encoding) or pd.isna(label):
            return ''
        # Same order as the CDE side (question text first, then encoding);
        # the f-string also tolerates non-string cells such as numbers.
        return normalize_string(f"{label} | {encoding}")

    study_df['Normalized Combined'] = [
        _combine(encoding, label)
        for encoding, label in zip(study_df[encoding_column], study_df[field_label_column])
    ]

    # Load HEAL CDE encodings and field labels; rows missing either part are dropped
    cde_df = pd.read_excel(cde_file, sheet_name='ALL')
    cde_df = cde_df.dropna(subset=['PV Description', 'Additional Notes (Question Text)'])

    # Concatenate question text and encoding for HEAL CDEs, then normalize
    cde_combined = (
        cde_df['Additional Notes (Question Text)'].astype(str)
        + " | "
        + cde_df['PV Description'].astype(str)
    ).map(normalize_string)

    # Materialize the candidates once so the CDE frame is not re-iterated
    # for every study row.
    cde_candidates = list(zip(cde_combined, cde_df['Variable Name'], cde_df['CRF Name']))

    # Compare each study combined text with every CDE combined text
    for idx, study_text in study_df['Normalized Combined'].items():
        if study_text == '':
            continue  # Encoding or field label was blank: nothing to match

        scored = [
            (cde_name, levenshtein_to_percentage(study_text, cde_text), crf_name)
            for cde_text, cde_name, crf_name in cde_candidates
        ]
        # Stable sort: among equal scores the earlier CDE row stays first
        scored.sort(key=lambda match: match[1], reverse=True)

        # Keep the highest-scoring entry for each CDE name; three are enough
        ranked = []
        seen_names = set()
        for cde_name, score, crf_name in scored:
            if cde_name in seen_names:
                continue
            seen_names.add(cde_name)
            ranked.append((cde_name, score, crf_name))
            if len(ranked) == 3:
                break

        # Best match is only reported when there is some similarity at all
        if ranked and ranked[0][1] > 0:
            best_name, best_score, best_crf_name = ranked[0]
            study_df.at[idx, 'Best Match CDE Name'] = best_name
            study_df.at[idx, 'Best Match Score'] = best_score
            study_df.at[idx, 'Best Match CRF Name'] = best_crf_name

        # Alternatives start at the runner-up: ranked[0] is the best match,
        # so ranks 2 and 3 come from ranked[1:3].
        for rank, (match_name, match_score, crf_name) in enumerate(ranked[1:3], start=2):
            study_df.at[idx, f'Potential Match {rank} - CDE Name'] = match_name
            study_df.at[idx, f'Potential Match {rank} - Score'] = match_score
            study_df.at[idx, f'Potential Match {rank} - CRF Name'] = crf_name

    # Save the results
    output_file = f"{study_file.rsplit('.', 1)[0]}_vlmd_cdesearch.xlsx"
    study_df.to_excel(output_file, index=False)
    print(f"Comparison complete. Results saved to {output_file}.")
    return study_df


### Compare Study Encodings with HEAL CDEs

This function `compare_encodings()` automates the comparison between a study's variable list and the HEAL Common Data Elements (CDEs).

It:
- Cleans and normalizes encoding + question text.
- Uses Levenshtein distance to find the most similar HEAL CDE for each study variable.
- Captures the best match and two alternative potential matches.
- Saves results to a new Excel file with the best CDE recommendations.

In [None]:
if __name__ == "__main__":
    # Study data dictionary to process (lives in the 'in' folder)
    study_file = './in/SAMPLE_sprint_2020-12-16.xlsx'

    # Worksheet inside the study file that holds the data dictionary
    study_sheet = 'SheetA'

    # Study column holding the encodings (compared against the CDE PV Description)
    encoding_column = 'Choices, Calculations, OR Slider Labels'

    # Study column holding the field labels / question text
    field_label_column = 'Field Label'

    # Run the comparison; the function saves the results and prints the output path
    compare_encodings(
        study_file,
        study_sheet=study_sheet,
        encoding_column=encoding_column,
        field_label_column=field_label_column,
    )

Comparison complete. Results saved to ./in/SAMPLE_sprint_2020-12-16_vlmd_cdesearch.xlsx.


### Example Usage

The code cell above shows an example of how to use the `compare_encodings()` function:

- **study_file**: Path to the Excel or CSV file containing the study's data dictionary.
- **study_sheet**: The name of the sheet inside the Excel file (only used if input is .xlsx).
- **encoding_column**: The column in the study file where the permissible values (encodings) are located.
- **field_label_column**: The column in the study file where the field labels (question text) are located.

Running the cell writes a new Excel file next to the input file, named after it with the suffix `_vlmd_cdesearch.xlsx`, listing the best CDE matches and potential alternatives.