In [18]:
import pandas as pd  # For handling data in DataFrames
import os  # For working with file paths
from difflib import get_close_matches  # For approximate string matching
from fuzzywuzzy import fuzz  # For fuzzy string matching
from fuzzywuzzy import process
import Levenshtein  # Optimized Levenshtein distance calculations (improves fuzzywuzzy performance)
import configparser  # For reading configurations from a .config file (future-proofing!)

In [19]:
# Load the data dictionaries
def load_excel(file_path, sheet_name=0):
    """Read one worksheet of an Excel workbook via ``pd.read_excel``.

    Args:
        file_path: Location of the workbook; forwarded unchanged to pandas.
        sheet_name: Sheet selector forwarded to pandas (defaults to ``0``).

    Returns:
        The object produced by ``pd.read_excel`` for the requested sheet.
    """
    worksheet = pd.read_excel(file_path, sheet_name=sheet_name)
    return worksheet

# Load and standardize columns
def _lowercase_text(column):
    """Lower-case every non-missing entry of *column*, coercing it to text first.

    Missing entries are returned untouched. Coercing before lower-casing keeps
    numeric entries (which ``.str.lower()`` alone would turn into NaN) and avoids
    the ``AttributeError`` that the ``.str`` accessor raises on non-text columns.
    """
    lowered = column.astype(str).str.lower()
    return lowered.where(column.notna(), column)


def load_and_prepare_data(cde_file, study_file, cde_sheet=0, study_sheet=0):
    """Load the CDE and study workbooks and standardize their column names.

    Args:
        cde_file: Path of the workbook holding the CDE list.
        study_file: Path of the workbook holding the study data dictionary.
        cde_sheet: Sheet of ``cde_file`` to read (defaults to ``0``).
        study_sheet: Sheet of ``study_file`` to read (defaults to ``0``).

    Returns:
        Tuple ``(cde_data, study_data)`` of DataFrames whose known columns are
        renamed to snake_case and whose ``variable_name`` / ``crf_name`` columns
        are lower-cased.

    Raises:
        KeyError: If either frame lacks ``variable_name`` or ``crf_name`` after
            renaming.
    """
    # Source header -> standardized name, for the CDE list.
    cde_cols = {
        "CRF Name": "crf_name",
        "CDE Name": "cde_name",
        "Variable Name": "variable_name",
        "Definition": "definition",
        "Permissible Values": "permissible_values",
        "PV Description": "pv_description",
        "Data Type": "data_type"
    }

    # Source header -> standardized name, for the study data dictionary.
    study_cols = {
        "Original CRF Name": "crf_name",
        "Variable Name": "variable_name",
        "Extracted CRF Name": "normalized_crf_name", # does not change; column name derived from part 1 of script
        "Matched HEAL Core CRF": "core_heal_match_by_ai", # does not change; column name derived from part 1 of script
        "Definition": "definition",
        "Permissible Values": "permissible_values",
        "PV Description": "pv_description",
        "Data Type": "data_type"
    }

    # Load each sheet and rename without mutating the freshly loaded frame in place.
    cde_data = load_excel(cde_file, sheet_name=cde_sheet).rename(columns=cde_cols)
    study_data = load_excel(study_file, sheet_name=study_sheet).rename(columns=study_cols)

    # Fail early, with a message naming the file, when a key column is absent.
    key_columns = ["variable_name", "crf_name"]
    for label, frame in (("CDE", cde_data), ("study", study_data)):
        missing = [col for col in key_columns if col not in frame.columns]
        if missing:
            raise KeyError(
                f"{label} file is missing required column(s) after renaming: {missing}"
            )

    # Lower-case the key columns so later comparisons are case-insensitive.
    for col in key_columns:
        cde_data[col] = _lowercase_text(cde_data[col])
        study_data[col] = _lowercase_text(study_data[col])

    print("Renamed CDE Data Columns:", cde_data.columns)
    print("Renamed Study Data Columns:", study_data.columns)

    return cde_data, study_data

In [20]:
def _crf_similarity(study_crf, cde_crf):
    """Return the ``fuzz.ratio`` score of two CRF names, or 0 when either is not text.

    A missing CRF name (NaN/None) cannot be lower-cased, so it is scored as
    "no similarity" instead of raising ``AttributeError``.
    """
    if not isinstance(study_crf, str) or not isinstance(cde_crf, str):
        return 0
    return fuzz.ratio(study_crf.lower(), cde_crf.lower())


def find_exact_and_crf_matches(cde_data, study_data, crf_threshold=70, var_cutoff=0.8):
    """Classify each study variable by how well it matches the CDE list.

    Args:
        cde_data: DataFrame with ``variable_name`` and ``crf_name`` columns.
        study_data: DataFrame with ``variable_name`` and ``crf_name`` columns.
        crf_threshold: Minimum CRF-name similarity score that counts as a
            "close" CRF match. Defaults to 70.
        var_cutoff: ``cutoff`` passed to ``get_close_matches`` when looking for a
            similar variable name. Defaults to 0.8.

    Returns:
        A list with one label per study row, in row order. Each label is one of
        ``"Exact Match"``, ``"Wrong CRF (via Fuzzy Match)"``, ``"Wrong CRF"``,
        ``"Wrong CRF (via Close Match and Fuzzy CRF)"`` or ``"Not in CDE"``.
    """
    results = []

    # Loop-invariant: candidate names for close matching. Non-text entries are
    # dropped because difflib cannot compare them.
    cde_variable_names = [
        name for name in cde_data["variable_name"] if isinstance(name, str)
    ]

    for _, study_row in study_data.iterrows():
        study_var = study_row["variable_name"]
        study_crf = study_row["crf_name"]

        # CDE rows that share the study variable's name.
        same_variable = cde_data[cde_data["variable_name"] == study_var]

        # Step 1: exact match on both variable name and CRF name.
        if (same_variable["crf_name"] == study_crf).any():
            results.append("Exact Match")
            continue

        # Step 2: variable name matches; grade the CRF of the first such CDE row.
        if not same_variable.empty:
            cde_crf = same_variable.iloc[0]["crf_name"]
            if _crf_similarity(study_crf, cde_crf) >= crf_threshold:
                results.append("Wrong CRF (via Fuzzy Match)")
            else:
                results.append("Wrong CRF")
            continue

        # Step 3: no exact variable name; look for a similar one.
        close_matches = (
            get_close_matches(study_var, cde_variable_names, n=1, cutoff=var_cutoff)
            if isinstance(study_var, str)
            else []
        )
        if not close_matches:
            # Step 4: nothing similar at all.
            results.append("Not in CDE")
            continue

        close_match_var = close_matches[0]
        close_match_crf = cde_data[cde_data["variable_name"] == close_match_var]["crf_name"].iloc[0]
        if _crf_similarity(study_crf, close_match_crf) >= crf_threshold:
            results.append("Wrong CRF (via Close Match and Fuzzy CRF)")
        else:
            # A similar variable exists, but on a dissimilar CRF.
            results.append("Not in CDE")

    return results

## **Function: `find_exact_and_crf_matches()`**
This function compares **study variables** against the HEAL CDE list to determine the **degree of matching** based on **variable name and CRF name**.

### **🔍 How It Works**
- **Checks for exact matches first** (both `variable_name` and `crf_name` must match).  
- **If no exact match is found**, it looks for **partial matches** based on `variable_name`.  
- **If a variable name is close but not exact**, it uses **fuzzy matching** to compare `crf_name`.  
- **If no match is found at all**, the variable is labeled as **"Not in CDE"**.

---

### **📌 Matching Logic & Classification**
1️⃣ **Exact Match** → Both `variable_name` and `crf_name` match exactly.  
   - ✅ Filters `cde_data` for rows where both values match.
   - ✅ If a match is found, it's classified as `"Exact Match"`, and the function moves to the next variable.

2️⃣ **Partial Match** → The `variable_name` matches, but the `crf_name` does not.  
   - 🔄 Uses **fuzzy matching (`fuzz.ratio()`)** to compare `crf_name`.  
   - 🔄 If the **similarity score (`crf_score`) ≥ 70**, it's classified as **"Wrong CRF (via Fuzzy Match)"**.  
   - ❌ Otherwise, it's labeled as **"Wrong CRF"** (meaning the CRF is completely different).  

3️⃣ **Fuzzy Variable Name Match** → The `variable_name` does not match exactly but is close.  
   - 🔍 Uses **`get_close_matches()`** to find the closest `variable_name` from `cde_data`.  
   - 🔄 If a close match exists, it **checks `crf_name` with fuzzy logic**.  
     - If **`crf_score` ≥ 70**, it's labeled **"Wrong CRF (via Close Match and Fuzzy CRF)"**.  
     - ❌ Otherwise, it's labeled **"Not in CDE"** (meaning it’s similar but doesn’t belong to the expected CRF).  

4️⃣ **No Match** → No exact or sufficiently close `variable_name` is found in the CDE knowledge base.  

---

### **📝 Possible Responses & What They Mean**
| **Response** | **Meaning** |
|-------------|------------|
| **`"Exact Match"`** | ✅ Both `variable_name` and `crf_name` match exactly. |
| **`"Wrong CRF"`** | ❌ `variable_name` matches, but `crf_name` does not. |
| **`"Wrong CRF (via Fuzzy Match)"`** | 🔄 `variable_name` matches, and `crf_name` is similar but not identical (`≥70%` similarity). |
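| **`"Wrong CRF (via Close Match and Fuzzy CRF)"`** | 🔄 No exact `variable_name` match exists, but the closest `variable_name` belongs to a CRF whose name is similar to the study's CRF name (`≥70%` similarity). |
| **`"Not in CDE"`** | ❌ No exact or close `variable_name` was found, or the closest `variable_name` belongs to a CRF whose name is not similar (`<70%` similarity). |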

---

### **💡 Why This Matters**
- Ensures **both variable name and CRF alignment** before confirming a match.  
- Identifies **possible misclassified CRF names**, which can cause inconsistencies.  
- Helps **track variable discrepancies** that may need further manual validation.  

---


In [21]:
def find_close_matches(cde_data, study_data, exact_results, crf_threshold=70, var_cutoff=0.8):
    """Describe the closest CDE variable for every study row that is not an exact match.

    Args:
        cde_data: DataFrame with ``variable_name`` and ``crf_name`` columns.
        study_data: DataFrame with ``variable_name`` and ``crf_name`` columns.
        exact_results: One label per study row, in row order, as produced by
            ``find_exact_and_crf_matches``.
        crf_threshold: Minimum CRF-name similarity score reported as a fuzzy
            CRF match. Defaults to 70.
        var_cutoff: ``cutoff`` passed to ``get_close_matches``. Defaults to 0.8.

    Returns:
        A list with one entry per study row: ``None`` for rows already labelled
        ``"Exact Match"``, a ``"Close Match: ..."`` string when a similar
        variable name exists, otherwise ``"No Close Match Found"``.

    Raises:
        IndexError: If ``exact_results`` has fewer entries than ``study_data``
            has rows.
    """
    close_match_results = []

    # Pair rows with their labels by position, not by DataFrame index label, so
    # a filtered or re-indexed study frame still lines up with the list.
    exact_labels = list(exact_results)

    # Loop-invariant: candidate names. Non-text entries are dropped because
    # difflib cannot compare them.
    cde_variable_names = [
        name for name in cde_data["variable_name"] if isinstance(name, str)
    ]

    for position, (_, study_row) in enumerate(study_data.iterrows()):
        study_var = study_row["variable_name"]
        study_crf = study_row["crf_name"]

        # Skip if already an exact match
        if exact_labels[position] == "Exact Match":
            close_match_results.append(None)
            continue

        # Find the closest variable name, if any.
        close_matches = (
            get_close_matches(study_var, cde_variable_names, n=1, cutoff=var_cutoff)
            if isinstance(study_var, str)
            else []
        )
        if not close_matches:
            close_match_results.append("No Close Match Found")
            continue

        close_match_var = close_matches[0]
        close_match_crf = cde_data[cde_data["variable_name"] == close_match_var]["crf_name"].iloc[0]

        # A missing CRF name on either side cannot be lower-cased; score it as 0.
        if isinstance(study_crf, str) and isinstance(close_match_crf, str):
            crf_score = fuzz.ratio(study_crf.lower(), close_match_crf.lower())
        else:
            crf_score = 0

        if crf_score >= crf_threshold:
            close_match_results.append(f"Close Match: {close_match_var} (Fuzzy CRF Match)")
        else:
            close_match_results.append(f"Close Match: {close_match_var} (CRF Mismatch)")

    return close_match_results


## **Function: `find_close_matches()`**
This function builds on the previous function by focusing on **approximate (fuzzy) matches** for variable names and CRF names.  
It refines the matching process by **skipping already confirmed exact matches** and adding **more nuanced categories** for near matches.

### **🔍 How It Works**
- **If an exact match was already found**, it is **skipped**.
- **Uses fuzzy matching** to determine close variable name matches.
- **Checks CRF name similarity** (fuzzy score threshold = 70).
- **Returns a ranked match classification** for each variable in the study dataset.

---

### **📌 Ranking System for Matches**
1️⃣ **Exact Match** (Handled by the previous function, `find_exact_and_crf_matches()`).  
2️⃣ **Close Match with Fuzzy CRF Match** (if CRF similarity **≥ 70**).  
3️⃣ **Close Match with CRF Mismatch** (if CRF similarity **< 70**).  
4️⃣ **No Close Match Found** (if no variable match is found).  

---

### **📝 Possible Responses & What They Mean**
| **Response** | **Meaning** |
|-------------|------------|
| **`None`** | ✅ This study variable was already an exact match, so it was skipped. |
| **`"Close Match: {close_match_var} (Fuzzy CRF Match)"`** | 🔄 A close variable name was found, and the CRF name is a fuzzy match. |
| **`"Close Match: {close_match_var} (CRF Mismatch)"`** | ❌ A close variable name was found, but the CRF name does not match. |
| **`"No Close Match Found"`** | ❌ No sufficiently close variable names were found. |

---

### **💡 Why This Matters**
- Helps **identify variables that are close but not exact**, guiding researchers on potential discrepancies.  
- Highlights **CRF mismatches**, which might indicate **misclassification or inconsistent naming conventions**.  
- Provides a **systematic way to categorize near-matches** for further review.  

---

In [22]:
def find_data_type_mismatches(cde_data, study_data):
    """Compare each study variable's data type with the CDE list's data type.

    For every study row the first CDE row with the same ``variable_name`` is
    consulted. Both types are compared as stripped, lower-cased strings.

    Returns:
        A list with one entry per study row: ``"Match"``, a
        ``"Data Type Mismatch: ..."`` message, or ``"No Comparison Possible"``
        when the variable name does not occur in ``cde_data``.
    """

    def _as_type_label(raw_value):
        # Same normalization for both sides: text form, trimmed, lower-case.
        return str(raw_value).strip().lower()

    def _classify(study_row):
        variable = study_row["variable_name"]
        found_type = _as_type_label(study_row.get("data_type", ""))

        candidates = cde_data[cde_data["variable_name"] == variable]
        if candidates.empty:
            # Variable name is not present in the CDE list.
            return "No Comparison Possible"

        wanted_type = _as_type_label(candidates.iloc[0]["data_type"])
        if found_type == wanted_type:
            return "Match"
        return f"Data Type Mismatch: Expected '{wanted_type}', Found '{found_type}'"

    return [_classify(study_row) for _, study_row in study_data.iterrows()]

## **Function: `find_data_type_mismatches()`**
This function compares the **data type** of each study variable against the HEAL CDE list to identify mismatches, ensuring data consistency.

### **📌 Possible Responses**
| **Response** | **Meaning** |
|-------------|------------|
| **`"Match"`** | ✅ The study variable **has the same data type** as in the HEAL CDE list. |
| **`"Data Type Mismatch: Expected '<expected_type>', Found '<study_type>'"`** | ⚠️ The study variable **has a different data type** than what's in the HEAL CDE list. |
| **`"No Comparison Possible"`** | ❓ The study variable **does not exist in the CDE list**, so a data type comparison cannot be performed. |

### **📝 Example Interpretations**
- **`"Match"`** → Everything is good! No data type issues. ✅  
- **`"Data Type Mismatch: Expected 'radio', Found 'text'"`** → The study team **used 'text' instead of 'radio'**, which could lead to inconsistencies. ⚠️  
- **`"No Comparison Possible"`** → The variable **was not found in the HEAL CDE list**, so no data type check was possible.  

---


In [23]:
def normalize_text(text):
    """Return *text* lower-cased with surrounding and repeated whitespace collapsed.

    Anything that is not a ``str`` (for example NaN or None) yields ``""``.
    """
    if isinstance(text, str):
        # split() with no arguments drops leading/trailing whitespace and
        # treats any run of whitespace as a single separator.
        words = text.lower().split()
        return " ".join(words)
    return ""

def _normalized_cde_column(cde_data, column):
    """Return *column* of *cde_data* as a list of ``normalize_text`` results.

    Yields one ``""`` per row when the column is absent.
    """
    if column not in cde_data.columns:
        return [""] * len(cde_data)
    return [normalize_text(value) for value in cde_data[column]]


def find_pv_and_encoding_mismatches(cde_data, study_data, score_threshold=70):
    """Compare permissible values (PV) and PV descriptions of study variables with the CDE list.

    Args:
        cde_data: DataFrame with a ``variable_name`` column and, optionally,
            ``permissible_values`` / ``pv_description`` columns.
        study_data: DataFrame with the same columns for the study variables.
        score_threshold: Minimum average ``fuzz.ratio`` score for a CDE row to
            be reported as a close PV/encoding match. Defaults to 70.

    Returns:
        A list with one entry per study row:

        * ``"No Comparison Possible"`` - the variable name is not in the CDE list;
        * ``"Both PV/Encoding missing"`` - the study row has neither value;
        * ``"PV Mismatch: <bool>, Encoding Mismatch: <bool>"`` - at least one CDE
          row with the same variable name differs after normalization;
        * ``"Close PV/Encoding Match: <variable> (Score: <score>)"`` - best
          scoring CDE row at or above ``score_threshold``;
        * ``None`` - no CDE row reached ``score_threshold``.
    """
    mismatch_results = []

    # Loop-invariant: normalize the CDE side once instead of once per study row.
    cde_vars = cde_data["variable_name"].tolist()
    cde_pvs = _normalized_cde_column(cde_data, "permissible_values")
    cde_descs = _normalized_cde_column(cde_data, "pv_description")

    for _, study_row in study_data.iterrows():
        study_var = study_row["variable_name"]
        study_pv = normalize_text(study_row.get("permissible_values", ""))
        study_desc = normalize_text(study_row.get("pv_description", ""))

        # Step 1: CDE rows carrying exactly the same variable name.
        is_same_variable = (cde_data["variable_name"] == study_var).tolist()
        if not any(is_same_variable):
            # No match found for variable_name, so no PV/Encoding comparison is possible
            mismatch_results.append("No Comparison Possible")
            continue

        # Step 2: nothing to compare when the study supplies neither value.
        if not study_pv and not study_desc:
            mismatch_results.append("Both PV/Encoding missing")
            continue

        # Step 3: a mismatch is flagged when ANY same-named CDE row differs.
        same_named = [
            (cde_pv, cde_desc)
            for hit, cde_pv, cde_desc in zip(is_same_variable, cde_pvs, cde_descs)
            if hit
        ]
        pv_mismatch = any(cde_pv != study_pv for cde_pv, _ in same_named)
        encoding_mismatch = any(cde_desc != study_desc for _, cde_desc in same_named)

        if pv_mismatch or encoding_mismatch:
            mismatch_results.append(
                f"PV Mismatch: {pv_mismatch}, Encoding Mismatch: {encoding_mismatch}"
            )
            continue

        # Step 4: fuzzy scan of the whole CDE list for the best PV/description pair.
        # Step 2 guarantees the study row has at least one non-empty value here,
        # so no "all values empty" guard is needed inside the loop.
        # NOTE(review): this step is only reached when every same-named CDE row
        # equals the study values, so the winner is presumably the first CDE row
        # in file order with identical values, which may be a different
        # variable. Confirm this is the intended report.
        best_match = None
        best_score = 0

        for cde_var, cde_pv, cde_desc in zip(cde_vars, cde_pvs, cde_descs):
            pv_score = fuzz.ratio(study_pv, cde_pv)
            desc_score = fuzz.ratio(study_desc, cde_desc)
            avg_score = (pv_score + desc_score) / 2

            # Strict ">" keeps the earliest row among equally scored candidates.
            if avg_score > best_score and avg_score >= score_threshold:
                best_match = cde_var
                best_score = avg_score

        if best_match:
            mismatch_results.append(
                f"Close PV/Encoding Match: {best_match} (Score: {best_score:.1f})"
            )
        else:
            mismatch_results.append(None)

    return mismatch_results

## **Function: `find_pv_and_encoding_mismatches()`**
This function checks for **permissible value (PV) mismatches** and **encoding description mismatches** between study variables and the HEAL CDE list.

### **🔍 How It Works**
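- **PV Mismatch** → Occurs when the permissible values **do not match exactly** after lowercasing and collapsing whitespace.  
- **Encoding Mismatch** → Occurs when the PV descriptions **do not match exactly** after the same normalization, so any difference in wording counts as a mismatch.  
- **Uses fuzzy similarity** only when the exact comparison finds no mismatch, to report the CDE entry whose PVs and descriptions score highest (average score ≥ 70).  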
- **If no variable match exists**, PV/Encoding cannot be compared.  

---

### **📝 Possible Responses & What They Mean**
| **Response** | **Meaning** |
|-------------|------------|
| **`"No Comparison Possible"`** | ❌ No variable name match, so PV/Encoding cannot be checked. |
| **`"Both PV/Encoding missing"`** | ⚠️ The study did not provide PVs or encoding descriptions, so no comparison is possible. |
| **`"PV Mismatch: True, Encoding Mismatch: False"`** | ❌ Permissible values (PVs) do not match, but encoding descriptions are the same. |
| **`"PV Mismatch: False, Encoding Mismatch: True"`** | ❌ PVs match, but encoding descriptions are different. |
| **`"PV Mismatch: True, Encoding Mismatch: True"`** | 🚨 ❌ Both PVs and encoding descriptions do not match. |
| **`"Close PV/Encoding Match: {best_match} (Score: {score})"`** | 🔄 A close match was found (based on fuzzy similarity). |
| **`None`** | ❌ No meaningful PV/Encoding match was found. |

---

### **💡 Why This Matters**
- Helps identify **potentially incorrect PVs or inconsistent encoding descriptions** that need standardization.  
- Ensures **study variables align with HEAL CDE expectations**, improving interoperability.  
- Highlights **variables where no comparison is possible**, prompting further investigation.  

---


In [24]:
def find_best_close_match(study_data, cde_data, score_cutoff=70):
    """Find the CDE name of the best fuzzy-matching variable for each unmatched study variable.

    Args:
        study_data: DataFrame with a ``variable_name`` column.
        cde_data: DataFrame with ``variable_name`` and ``cde_name`` columns.
        score_cutoff: ``score_cutoff`` passed to ``process.extractOne``.
            Defaults to 70.

    Returns:
        A list with one entry per study variable: the ``cde_name`` of the first
        CDE row carrying the best-matching variable name, or ``None`` when the
        study variable already occurs verbatim in the CDE list, when the CDE
        list is empty, or when no candidate reaches ``score_cutoff``.
    """
    # Loop-invariant: build the candidate list and a membership set once.
    cde_variable_names = cde_data["variable_name"].tolist()
    known_variables = set(cde_variable_names)

    best_matches = []
    for study_var in study_data["variable_name"]:
        # Skip variables that already match verbatim, and skip the fuzzy lookup
        # entirely when there are no candidates to compare against.
        if study_var in known_variables or not cde_variable_names:
            best_matches.append(None)
            continue

        # Find the best match using fuzzy matching
        match_result = process.extractOne(
            study_var, cde_variable_names, score_cutoff=score_cutoff
        )
        if match_result is None:
            best_matches.append(None)  # No candidate reached the cutoff
            continue

        # The first element of the result is the matched choice.
        matched_variable = match_result[0]
        cde_names = cde_data.loc[
            cde_data["variable_name"] == matched_variable, "cde_name"
        ].values
        best_matches.append(cde_names[0] if len(cde_names) > 0 else None)

    return best_matches

In [25]:
# Save report function
def save_report(report, output_file):
    """Write *report* to *output_file* as CSV (no index column) and print its absolute path."""
    report.to_csv(output_file, index=False)
    print(f"Comparison report saved to: {os.path.abspath(output_file)}")

In [26]:
# File paths and sheet names
# The project folder is defined once. Set the CDE_DETECTIVE_DIR environment
# variable to run the notebook on another machine; the default reproduces the
# original location.
project_dir = os.environ.get(
    "CDE_DETECTIVE_DIR",
    r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp",
)
cde_file_path = os.path.join(
    project_dir,
    "KnowledgeBase",
    "Compiled_CORE_CDEs list_English_one sheet_as of 2025-01-28.xlsx",
)
study_file_path = os.path.join(project_dir, "ValidatedCDEuse", "DD_HDP01011_Matthias.xlsx")
cde_sheet_name = "ALL"
study_sheet_name = "EnhancedDD"

# Output file path
output_file_path = os.path.join(
    project_dir, "ValidatedCDEuse", "DD_HDP01011_Matthias_variablevalidation.csv"
)

# Load data
cde_data, study_data = load_and_prepare_data(
    cde_file_path,
    study_file_path,
    cde_sheet_name,
    study_sheet_name
)

# Ensure all `variable_name` values are strings
# NOTE(review): astype(str) turns a missing variable name into the literal
# text "nan", so blank names in both files compare equal. Confirm that is
# acceptable or drop such rows first.
cde_data["variable_name"] = cde_data["variable_name"].astype(str)
study_data["variable_name"] = study_data["variable_name"].astype(str)

# Generate report columns
exact_results = find_exact_and_crf_matches(cde_data, study_data)
close_match_results = find_close_matches(cde_data, study_data, exact_results)
mismatch_results = find_pv_and_encoding_mismatches(cde_data, study_data)
data_type_mismatch_results = find_data_type_mismatches(cde_data, study_data)
best_close_matches = find_best_close_match(study_data, cde_data)

# Debugging prints to check list lengths
print("Exact Match Results Length:", len(exact_results))
print("Close Match Results Length:", len(close_match_results))
print("Mismatch Results Length:", len(mismatch_results))
print("Best Close Matches Length:", len(best_close_matches))  # New debugging check
print("Study Data Length:", len(study_data))
print("CDE Data Length:", len(cde_data))

# CDE name of the first CDE row for each (variable_name, crf_name) pair. An
# "Exact Match" means both keys matched, so the name is looked up by both keys
# rather than by variable name alone. The same variable on another CRF may
# carry a different CDE name.
exact_cde_names = {}
for cde_var, cde_crf, cde_name in zip(
    cde_data["variable_name"], cde_data["crf_name"], cde_data["cde_name"]
):
    exact_cde_names.setdefault((cde_var, cde_crf), cde_name)

# Create the report DataFrame with enhanced checks
report = pd.DataFrame({
    "Study Variable": study_data["variable_name"],
    "Study Form Name": study_data["crf_name"],
    "Extracted CRF Name": study_data["normalized_crf_name"],
    "Matched HEAL Core CRF": study_data["core_heal_match_by_ai"],
    "Exact/CRF Match Result": exact_results,

    # None unless the row is an exact match with a known CDE name.
    "Exact Match CDE Name": [
        exact_cde_names.get((study_var, study_crf)) if result == "Exact Match" else None
        for study_var, study_crf, result in zip(
            study_data["variable_name"], study_data["crf_name"], exact_results
        )
    ],

    "Close Match Result": close_match_results,

    # This key used to appear twice in the dict literal. Only the later value
    # (best_close_matches) ever reached the DataFrame, so the dead first entry
    # is removed. The column keeps its position.
    "Best Close Match CDE Name": best_close_matches,

    "Data Type Result": data_type_mismatch_results,

    "PV/Encoding Result": mismatch_results,

    # Parse the number out of "... (Score: 87.5)"; the slice drops the ")".
    "PV Similarity Score": [
        float(res.split("Score: ")[1][:-1]) if res and "Score:" in res and len(res.split("Score: ")) > 1 else None
        for res in mismatch_results
    ],
})

# Function to classify match types
def determine_final_match(row):
    """Collapse one report row into a single match category.

    Args:
        row: Mapping-like report row providing the keys
            ``"Exact/CRF Match Result"``, ``"Best Close Match CDE Name"``,
            ``"PV/Encoding Result"`` and ``"Data Type Result"``.

    Returns:
        ``"Exact"``, ``"Close"``, ``"Mismatch"`` or ``"No Match"``. The checks
        run in that order and the first that applies wins.
    """
    if row["Exact/CRF Match Result"] == "Exact Match":
        return "Exact"
    # pd.notna treats both None and NaN as "no close match". pandas may store
    # a missing entry as NaN, which an "is not None" test would wrongly accept.
    if pd.notna(row["Best Close Match CDE Name"]):
        return "Close"
    if "Mismatch" in str(row["PV/Encoding Result"]) or "Mismatch" in str(row["Data Type Result"]):
        return "Mismatch"
    return "No Match"  # Ensures unmatched variables are correctly labeled

# Classify every report row and store the label as "Final Match Type"
final_match_types = report.apply(determine_final_match, axis=1)
report["Final Match Type"] = final_match_types

# Write the finished report to disk
save_report(report, output_file_path)


Renamed CDE Data Columns: Index(['Study Population Focus', 'Domain', 'crf_name', 'CRF Question #',
       'cde_name', 'variable_name', 'definition', 'Short Description',
       'Additional Notes (Question Text)', 'permissible_values',
       'pv_description', 'data_type', 'Disease Specific Instructions',
       'Disease Specific References', 'Population', 'Classification',
       'External Id CDISC', 'CDISC Permissible Values', 'CDISC Data Type',
       'CDISC Notes', 'Additional Information',
       'Map to CDISC variable name if different', 'Map to CDISC format',
       'Notes', 'Unnamed: 24'],
      dtype='object')
Renamed Study Data Columns: Index(['CDE Name', 'normalized_crf_name', 'core_heal_match_by_ai',
       'Match Confidence', 'Manual Validation', 'variable_name', 'definition',
       'Short Description', 'Question Text', 'permissible_values',
       'pv_description', 'data_type', 'Disease Specific Instructions',
       'Disease Specific References', 'Population', 'Classific