In [1]:
import pandas as pd
import re
import os

In [2]:
# Update of multiple hcs file

import pandas as pd
import os

# Ask user if they have multiple HCS files
multiple_files = input("Do you have multiple HCS files to process? (yes/no): ").lower().strip()

all_hcs_data = []

def process_file(file_path):
    """Process a single file (CSV, XLS, or HTML) and return a DataFrame."""
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_path)
        elif file_path.endswith('.html'):
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            df_list = pd.read_html(html_content)
            df = df_list[0]
            # Set first row as column names for HTML files
            df.columns = df.iloc[0]
            df = df[1:].reset_index(drop=True)
        else:
            raise ValueError("Unsupported file format. Use CSV, XLS/XLSX, or HTML.")
        return df
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {str(e)}")
        return None

if multiple_files == 'yes':
    # Ask for multiple file paths
    hcs_paths = []
    while True:
        path = input("Please enter the path to an HCS file (html/xls/csv) (or 'done' to finish): ")
        if path.lower() == 'done':
            break
        if os.path.exists(path):
            hcs_paths.append(path)
        else:
            print("Error: The file does not exist. Please check the path and try again.")

    if not hcs_paths:
        print("Error: No valid files provided. Exiting.")
    else:
        # Process each HCS file
        for hcs_path in hcs_paths:
            hcs = process_file(hcs_path)
            if hcs is not None:
                all_hcs_data.append(hcs)
                print(f"✅ Successfully processed: {hcs_path}")

        # Combine all dataframes
        if all_hcs_data:
            combined_hcs = pd.concat(all_hcs_data, ignore_index=True)
            print(f"\n✅ Combined {len(all_hcs_data)} HCS files")
            print("First few rows of combined data:")
            print(combined_hcs.head())
            # Option to save
            save = input("Would you like to save the combined data to a CSV file? (yes/no): ").lower().strip()
            if save == 'yes':
                output_path = input("Enter the output file path (e.g., output.csv): ")
                combined_hcs.to_csv(output_path, index=False)
                print(f"✅ Saved to {output_path}")
        else:
            print("No data was successfully processed.")

else:
    # Single file processing
    hcs_path = input("Please enter the path to your HCS file (html/xls/csv): ")
    if not os.path.exists(hcs_path):
        print("Error: The file does not exist. Please check the path and try again.")
    else:
        hcs = process_file(hcs_path)
        if hcs is not None:
            print(f"\n✅ Successfully processed: {hcs_path}")
            print("First few rows of data:")
            print(hcs.head())
            print(f"Total rows: {len(hcs)}")
            # Option to save
            save = input("Would you like to save the data to a CSV file? (yes/no): ").lower().strip()
            if save == 'yes':
                output_path = input("Enter the output file path (e.g., output.csv): ")
                hcs.to_csv(output_path, index=False)
                print(f"✅ Saved to {output_path}")

Do you have multiple HCS files to process? (yes/no): yes
Please enter the path to an HCS file (html/xls/csv) (or 'done' to finish): today2.xlsx
Please enter the path to an HCS file (html/xls/csv) (or 'done' to finish): done
✅ Successfully processed: today2.xlsx

✅ Combined 1 HCS files
First few rows of combined data:
   ISSTYPE       CARD_NUMBER             CRDH_NAME         ATM_ACCT  \
0  REPLACE  4753960000746392           IJAJ HUSSEN  414701406417018   
1  REPLACE  4753960000742094  TEJLAXMI RAJBHANDARI  210200053905015   
2  REPLACE  4753960000742102  PURNA BAHADUR KHADGI  210201108720016   
3  REPLACE  4753960000742193    SUKRA P. RANJITKAR  210001312381019   
4  REPLACE  4753960000742219        BISHNU B SINGH  726105035139015   

     ISS_DATE  EXPIR_DATE  CARD_ID  
0  2025-03-24  2029-03-24  1308014  
1  2025-03-24  2029-03-24  1306670  
2  2025-03-24  2029-03-24  1306671  
3  2025-03-24  2029-03-24  1306678  
4  2025-03-24  2029-03-24  1306680  
Would you like to save the combi

In [2]:
import os
import re
import pandas as pd

def normalize_name(name):
    """Normalize a name by converting to uppercase, removing extra spaces, and splitting into components."""
    if pd.isna(name) or name == "NaN":
        return {"first": "", "middle": "", "surname": ""}
    # Remove extra spaces and convert to uppercase
    name = " ".join(name.split()).upper()
    parts = name.split()
    
    if len(parts) == 1:
        return {"first": parts[0], "middle": "", "surname": ""}
    elif len(parts) == 2:
        return {"first": parts[0], "middle": "", "surname": parts[1]}
    else:
        return {"first": parts[0], "middle": " ".join(parts[1:-1]), "surname": parts[-1]}

def process_hcs_file(file_path):
    """Process an HCS file (HTML, CSV, or XLS/XLSX) and return a DataFrame."""
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_path)
        elif file_path.endswith('.html'):
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            df_list = pd.read_html(html_content)
            df = df_list[0]
            df.columns = df.iloc[0]
            df = df[1:].reset_index(drop=True)
        else:
            raise ValueError("Unsupported file format. Use CSV, XLS/XLSX, or HTML.")
        return df
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {str(e)}")
        return None

# --- Process Extracted Names ---
multiple_txt_files = input("Do you have multiple text files for name extraction? (yes/no): ").lower().strip()
all_extracted_names = []

if multiple_txt_files == 'yes':
    txt_paths = []
    while True:
        path = input("Please enter the path to a text file (or 'done' to finish): ")
        if path.lower() == 'done':
            break
        if os.path.exists(path):
            txt_paths.append(path)
        else:
            print("Error: File does not exist. Please check the path and try again.")

    if not txt_paths:
        print("Error: No valid text files provided. Exiting.")
    else:
        for txt_path in txt_paths:
            try:
                with open(txt_path, 'r', encoding='utf-8') as txt_file:
                    for line in txt_file:
                        matches = re.findall(r'NPR\s{0,4}([A-Z][A-Z.\s]+)', line)
                        if matches:
                            for match in matches:
                                all_extracted_names.append(match.strip())
                        else:
                            all_extracted_names.append("NaN")
                print(f"✅ Successfully processed: {txt_path}")
            except Exception as e:
                print(f"⚠️ Error processing {txt_path}: {str(e)}")
else:
    txt_path = input("Please enter the path to your text file: ")
    if not os.path.exists(txt_path):
        print("Error: File does not exist. Please check the path and try again.")
    else:
        try:
            with open(txt_path, 'r', encoding='utf-8') as txt_file:
                for line in txt_file:
                    matches = re.findall(r'NPR\s{0,4}([A-Z][A-Z.\s]+)', line)
                    if matches:
                        for match in matches:
                            all_extracted_names.append(match.strip())
                    else:
                        all_extracted_names.append("NaN")
            print(f"✅ Successfully processed: {txt_path}")
        except Exception as e:
            print(f"⚠️ Error processing {txt_path}: {str(e)}")

if not all_extracted_names:
    print("Error: No names extracted. Exiting.")
    exit()

extracted_df = pd.DataFrame({'EXTRACTED_NAME': all_extracted_names})

# --- Process HCS Files ---
multiple_hcs_files = input("Do you have multiple HCS files to process? (yes/no): ").lower().strip()
all_hcs_data = []

if multiple_hcs_files == 'yes':
    hcs_paths = []
    while True:
        path = input("Please enter the path to an HCS file (html/xls/csv) (or 'done' to finish): ")
        if path.lower() == 'done':
            break
        if os.path.exists(path):
            hcs_paths.append(path)
        else:
            print("Error: File does not exist. Please check the path and try again.")

    if not hcs_paths:
        print("Error: No valid HCS files provided. Exiting.")
    else:
        for hcs_path in hcs_paths:
            hcs = process_hcs_file(hcs_path)
            if hcs is not None:
                all_hcs_data.append(hcs)
                print(f"✅ Successfully processed: {hcs_path}")
        if all_hcs_data:
            hcs_df = pd.concat(all_hcs_data, ignore_index=True)
        else:
            print("Error: No HCS data successfully processed. Exiting.")
            exit()
else:
    hcs_path = input("Please enter the path to your HCS file (html/xls/csv): ")
    if not os.path.exists(hcs_path):
        print("Error: File does not exist. Please check the path and try again.")
    else:
        hcs_df = process_hcs_file(hcs_path)
        if hcs_df is None:
            print("Error: HCS file processing failed. Exiting.")
            exit()
        print(f"✅ Successfully processed: {hcs_path}")

# --- Name Matching ---
hcs_name_col = 'CRDH_NAME'  # Replace with actual column name if different

# Normalize names
hcs_df['norm'] = hcs_df[hcs_name_col].apply(normalize_name)
extracted_df['norm'] = extracted_df['EXTRACTED_NAME'].apply(normalize_name)

# Create matching keys
hcs_df['match_key'] = hcs_df['norm'].apply(lambda x: f"{x['first']}|{x['middle']}|{x['surname']}")
extracted_df['match_key'] = extracted_df['norm'].apply(lambda x: f"{x['first']}|{x['middle']}|{x['surname']}")

# Use extracted_df as base (smaller set)
matched_df = extracted_df.merge(
    hcs_df,
    how='left',
    on='match_key',
    suffixes=('_extracted', '_hcs')
)

# Add extracted name to matched rows
matched_df['MATCHED_EXTRACTED_NAME'] = matched_df['EXTRACTED_NAME']

# Clean up: keep only necessary columns and remove unmatched rows
final_df = matched_df.dropna(subset=[hcs_name_col])  # Drop rows where HCS data wasn't matched
final_df = final_df.drop(columns=['norm_extracted', 'norm_hcs', 'match_key'])

# Results
print(f"\nOriginal HCS rows: {len(hcs_df)}")
print(f"Extracted names: {len(extracted_df)}")
print(f"Final matched rows: {len(final_df)}")
print("\nFirst few rows of final DataFrame:")
print(final_df.head())

# Optionally save to CSV
save = input("Would you like to save the results to a CSV file? (yes/no): ").lower().strip()
if save == 'yes':
    output_path = input("Enter the output file path (e.g., matched_output.csv): ")
    final_df.to_csv(output_path, index=False)
    print(f"✅ Results saved to {output_path}")
else:
    print("Results not saved.")

Do you have multiple text files for name extraction? (yes/no):  yes
Please enter the path to a text file (or 'done' to finish):  today.txt
Please enter the path to a text file (or 'done' to finish):  today1.txt
Please enter the path to a text file (or 'done' to finish):  today2.txt
Please enter the path to a text file (or 'done' to finish):  today3.txt
Please enter the path to a text file (or 'done' to finish):  done


✅ Successfully processed: today.txt
✅ Successfully processed: today1.txt
✅ Successfully processed: today2.txt
✅ Successfully processed: today3.txt


Do you have multiple HCS files to process? (yes/no):  no
Please enter the path to your HCS file (html/xls/csv):  hcs.xlsx


✅ Successfully processed: hcs.xlsx


KeyError: 'E_NAME'