In [8]:
import os
import pandas as pd
import re

# Hard-coded path to the EMIS Medical Dictionary file
EMIS_DICTIONARY_PATH = r"C:\Users\mcken\OneDrive\My PC Folder\Desktop\Work\CPRD\Codelists\CL Generator\Codelist-Generation-main\Codelist-Generation-main\data\EMISMedicalDictionary_2022.txt"
# Default directory for saved results: the parent directory of the "Demo_loc"
# path below (os.path.dirname strips the final path component).
SAVE_PATH = os.path.dirname(r"C:\Users\mcken\OneDrive\My PC Folder\Desktop\Work\CPRD\Codelists\CL Generator\Codelist-Generation-main\Codelist-Generation-main\data\Demo_loc")
# Alias under the name DEFAULT_SAVE_PATH so code that looks up the default save
# directory by either name resolves after a kernel restart.
DEFAULT_SAVE_PATH = SAVE_PATH


In [9]:
def load_dataframe(file_path):
    """Load a delimited text file and ask the user which column to search.

    The separator is a comma when the file extension is ``.csv`` (case-insensitive)
    and a tab otherwise. Malformed lines are skipped by pandas
    (``on_bad_lines='skip'``), so the returned frame may contain fewer rows
    than the file.

    After loading, the column names are printed with their positional index and
    the user is prompted until a valid index (0 .. number of columns - 1) is
    entered.

    Args:
        file_path: Path of the file to read.

    Returns:
        ``(df, selected_col)`` where ``df`` is the loaded DataFrame and
        ``selected_col`` is the chosen column label, or ``(None, None)`` when
        the file cannot be loaded, has no columns, or the prompt cannot be read.
    """
    try:
        # Determine the separator based on file extension
        _, file_extension = os.path.splitext(file_path)
        sep = ',' if file_extension.lower() == '.csv' else '\t'
        df = pd.read_csv(file_path, sep=sep, on_bad_lines='skip')  # Using 'on_bad_lines' instead of 'error_bad_lines'
    except Exception as e:
        # Any load failure is reported and signalled to the caller via (None, None).
        print(f"An error occurred: {e}")
        return None, None

    column_count = len(df.columns)
    if column_count == 0:
        print("An error occurred: the file contains no columns.")
        return None, None

    print("Columns in the file:")
    for i, col in enumerate(df.columns):
        print(f"{i}. {col}")

    # Keep asking instead of discarding the loaded data on a typo.
    while True:
        try:
            answer = input("Enter the index number of the column to use: ")
        except Exception as e:
            # The prompt itself failed (e.g. no stdin available); keep the
            # (None, None) failure contract rather than raising.
            print(f"An error occurred: {e}")
            return None, None

        try:
            col_idx = int(answer.strip())
        except ValueError:
            print(f"'{answer}' is not a whole number. Please try again.")
            continue

        # Negative values are rejected: the menu only lists 0 .. column_count - 1.
        if 0 <= col_idx < column_count:
            return df, df.columns[col_idx]
        print(f"Index must be between 0 and {column_count - 1}. Please try again.")


In [10]:
def get_search_terms():
    """Interactively collect the search terms.

    The user chooses between typing a comma-separated list (option 1) and
    giving the path of a text file with one term per line (option 2). The
    question is repeated until one of the two options is entered.

    Each term is stripped of surrounding whitespace and lower-cased. Empty
    entries (trailing commas, ``a, , b``, blank lines in the file) are dropped
    so they cannot end up as empty search terms.

    Returns:
        list[str]: The cleaned, non-empty terms in the order they were given.

    Raises:
        OSError: If the terms file cannot be opened (option 2).
    """
    while True:
        choice = input("Would you like to (1) enter terms manually or (2) load terms from a file? Enter 1 or 2: ").strip()
        if choice == '1':
            raw_terms = input("Enter terms separated by commas: ").split(',')
            break
        if choice == '2':
            # Remove any quotation marks (e.g. from "Copy as path") and stray spaces
            file_path = input("Enter the path to the terms file: ").replace('"', '').strip()
            with open(file_path, 'r') as file:
                raw_terms = file.read().splitlines()
            break
        print("Invalid input. Please enter 1 or 2.")

    cleaned_terms = (term.strip().lower() for term in raw_terms)
    return [term for term in cleaned_terms if term]


In [11]:
def find_matches(df, col, terms):
    """Return the rows of ``df`` whose ``col`` value contains any search term.

    A term matches when it occurs in the lower-cased cell text delimited by
    regex word boundaries (``\\b``), so ``hyper`` does not match
    ``hypertension``. Terms are used as given (they are not lower-cased here).
    Empty terms are ignored and duplicates are collapsed.

    Missing cells (NaN/None) never match; other non-string cells are converted
    with ``str()`` before matching.

    Args:
        df: DataFrame to search. It is not modified.
        col: Label of the column to search.
        terms: Iterable of search terms.

    Returns:
        A new DataFrame holding the matching rows (original index preserved)
        with an extra ``'Matched Terms'`` column: the matched terms joined by
        ``', '`` in the order they appear in ``terms``.
    """
    # dict.fromkeys de-duplicates while keeping a stable, reproducible order.
    unique_terms = [term for term in dict.fromkeys(terms) if term]
    # Compile each pattern once instead of once per row.
    compiled_terms = [
        (term, re.compile(r'\b' + re.escape(term) + r'\b'))
        for term in unique_terms
    ]

    def _matched_terms(value):
        if pd.isna(value):
            return ''
        text = str(value).lower()
        return ', '.join(term for term, regex in compiled_terms if regex.search(text))

    matched_terms = df[col].apply(_matched_terms)
    # assign() builds a new frame, leaving the caller's DataFrame untouched.
    annotated_df = df.assign(**{'Matched Terms': matched_terms})
    return annotated_df[matched_terms != '']


In [12]:
def apply_exclusion_terms(df, col):
    """Prompt for exclusion terms and drop rows whose ``col`` contains any of them.

    The user enters a comma-separated list. Each term is stripped and
    lower-cased, and empty entries are discarded; if nothing remains, ``df`` is
    returned unchanged. Matching is a case-insensitive substring test (no word
    boundaries) against the lower-cased column text. Missing cells are never
    excluded.

    Args:
        df: DataFrame to filter. It is not modified.
        col: Label of the text column to test.

    Returns:
        ``df`` itself when no exclusion terms were given, otherwise a filtered
        DataFrame. The remaining and removed row counts are printed in the
        latter case.
    """
    answer = input("Enter exclusion terms separated by commas (leave empty if none): ")
    exclusion_terms = [term.strip().lower() for term in answer.split(',')]
    # An empty alternative in the regex would match every row, so drop blanks.
    exclusion_terms = [term for term in exclusion_terms if term]
    if not exclusion_terms:
        return df

    exclusion_pattern = '|'.join(re.escape(term) for term in exclusion_terms)
    original_count = len(df)
    # na=False keeps rows whose cell is missing instead of raising.
    is_excluded = df[col].str.lower().str.contains(exclusion_pattern, regex=True, na=False)
    df = df[~is_excluded]
    removed_count = original_count - len(df)
    print(f"Rows after exclusion: {len(df)}")
    print(f"Rows removed: {removed_count}")
    return df


In [13]:
def main():
    """Run the interactive codelist search from start to finish.

    Steps:
      1. Collect search terms (``get_search_terms``).
      2. Load the EMIS dictionary at ``EMIS_DICTIONARY_PATH`` and let the user
         pick the column to search (``load_dataframe``); stop if loading fails.
      3. Find matching rows (``find_matches``) and print how many were found.
      4. If there are matches: print the first rows, apply optional exclusion
         terms, keep a fixed set of output columns, then prompt for a save
         directory (default ``SAVE_PATH``) and filename and write a CSV.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        KeyError: If the loaded file lacks any of the fixed output columns.
    """
    terms = get_search_terms()

    df1, col1 = load_dataframe(EMIS_DICTIONARY_PATH)

    if df1 is None:
        return

    matched_df = find_matches(df1, col1, terms)
    print(f"Initial matches found: {len(matched_df)}")
    if matched_df.empty:
        print("No matches found.")
        return

    # Display a snapshot of initial matches and ask for exclusion terms
    print(matched_df.head())  # Optionally display a few rows to help decide on exclusions
    matched_df = apply_exclusion_terms(matched_df, col1)

    # Filter to keep only specified columns
    columns_to_keep = ['Med_Code_Id', 'Observations', 'Term', 'SNOMED_CT_Concept_ID', 'Matched Terms']
    matched_df = matched_df[columns_to_keep]

    # Ask user for save directory; SAVE_PATH is the default save directory
    # constant defined alongside EMIS_DICTIONARY_PATH.
    save_directory = input(f"Enter the directory to save the results (default: {SAVE_PATH}): ").strip()
    if save_directory == '':
        save_directory = SAVE_PATH

    if not os.path.exists(save_directory):
        print(f"The directory {save_directory} does not exist. Creating the directory.")
        # exist_ok avoids a failure if the directory appears between the check and the call.
        os.makedirs(save_directory, exist_ok=True)

    default_filename = os.path.basename(EMIS_DICTIONARY_PATH).split('.')[0] + '_matched_results.csv'
    filename = input(f"Enter the filename for the saved results (default: {default_filename}): ").strip()
    if filename == '':
        filename = default_filename
    elif not filename.lower().endswith('.csv'):
        # Case-insensitive check so "results.CSV" is not turned into "results.CSV.csv".
        filename += '.csv'  # Append .csv if not present
    output_file_path = os.path.join(save_directory, filename)

    try:
        matched_df.to_csv(output_file_path, index=False)
        print(f"Matched data saved to {output_file_path}")
    except PermissionError:
        print(f"Permission denied when trying to write to {output_file_path}. Please check your permissions.")


In [14]:
# Entry-point guard: start the interactive workflow only when this code runs as
# the top-level namespace, not when it is imported as a module.
if __name__ == "__main__":
    main()


Would you like to (1) enter terms manually or (2) load terms from a file? Enter 1 or 2: 1
Enter terms separated by commas: cancer, hyper
Columns in the file:
0. Unnamed: 0
1. Med_Code_Id
2. Observations
3. OriginalReadCode
4. CleansedReadCode
5. Term
6. SNOMED_CT_Concept_ID
7. SnomedCTDescriptionId
8. Release
9. EmisCodeCategoryId
Enter the index number of the column to use: 5
Initial matches found: 538
       Unnamed: 0 Med_Code_Id  Observations OriginalReadCode CleansedReadCode  \
6121         6121  a216211016         30000             1J0I          1J0I.00   
8878         8878  a253467018         70000             1J00          1J00.00   
23870       23870  a289554017           200          B58y2-1          B58y211   
24750       24750  a292126018           100            Byu51          Byu5100   
45100       45100  a397821018         20000            142-1          142..11   

                                 Term SNOMED_CT_Concept_ID  \
6121          Suspected breast cancer       