In [None]:
# Cell 1: Setup and Imports
import pandas as pd
import multiprocessing as mp
import medspacy
from medspacy.ner import TargetRule
import time
import os
# Plotting (used by the benchmark comparison chart)
import matplotlib.pyplot as plt


In [None]:

# Synthetic corpus: four short clinical sentences repeated 10,000 times
# (40,000 rows), held in a one-column DataFrame named 'text'.
data = 10000 * [
    'Patient shows symptoms of flu and other complications.',
    'Diagnosis indicates pneumonia.',
    'Follow-up required for diabetes management.',
    'Patient is recovering well, but needs to keep taking beta blockers for hypertension.',
]
df = pd.DataFrame(data, columns=['text'])

# Cell 2: Define Target Rules and NLP Pipeline
# One TargetRule per (literal, category) pair; create_nlp() registers this list
# with the pipeline's target matcher.
target_rules = [
    TargetRule(literal=term, category=label)
    for term, label in (
        ("flu", "PROBLEM"),
        ("pneumonia", "PROBLEM"),
        ("hemicolectomy", "TREATMENT"),
        ("beta blockers", "TREATMENT"),
        ("hypertension", "PROBLEM"),
        ("diabetes", "PROBLEM"),
    )
]


def create_nlp():
    """Build a medspaCy pipeline with the notebook's target rules registered.

    Loads the default pipeline via ``medspacy.load()``, fetches its
    ``medspacy_target_matcher`` component, sets that component's
    ``result_type`` to ``'group'`` and adds the module-level ``target_rules``.

    Returns:
        The configured pipeline object returned by ``medspacy.load()``.
    """
    pipeline = medspacy.load()
    matcher = pipeline.get_pipe("medspacy_target_matcher")
    # NOTE(review): 'group' presumably stores matches as a span group rather
    # than in doc.ents -- confirm against the medspaCy TargetMatcher docs.
    matcher.result_type = 'group'
    matcher.add(target_rules)
    return pipeline


# Cell 3: Process Text Function
# Pipelines already built, keyed by process ID. create_nlp() loads a full
# pipeline and registers the rules; repeating that for every text redoes
# identical work, so each process builds its pipeline once and reuses it.
# Keying by PID means a forked worker builds its own pipeline instead of
# reusing an object inherited from the parent process.
_NLP_CACHE = {}


def _get_nlp():
    """Return the current process's pipeline, creating it on first use."""
    pid = os.getpid()
    if pid not in _NLP_CACHE:
        _NLP_CACHE[pid] = create_nlp()
    return _NLP_CACHE[pid]


def process_text(text):
    """Run the NLP pipeline on a single text and return the result.

    Prints the current process ID together with the text, then applies the
    pipeline cached for this process (built by ``create_nlp()`` the first time
    the process calls this function).

    Args:
        text: The text to process.

    Returns:
        Whatever the pipeline returns when called with ``text``.
    """
    print(f"Process ID: {os.getpid()} | Processing text: {text}")
    return _get_nlp()(text)


# Cell 4: Single Process Execution
def process_dataframe_single_process(df):
    """Apply ``process_text`` to every entry of ``df['text']`` in this process.

    Args:
        df: Object whose ``'text'`` item is an iterable of texts.

    Returns:
        list: One ``process_text`` result per text, in input order.
    """
    processed = []
    for entry in df['text']:
        processed.append(process_text(entry))
    return processed


# Cell 5: Multiprocessing Execution
def process_dataframe_multiprocess(df, num_processes):
    """Apply ``process_text`` to every entry of ``df['text']`` using a worker pool.

    Args:
        df: Object whose ``'text'`` item is an iterable of texts.
        num_processes: Number of worker processes passed to ``mp.Pool``.

    Returns:
        list: The list produced by ``Pool.map`` over the texts.
    """
    texts = df['text']
    worker_pool = mp.Pool(num_processes)
    # Leaving the ``with`` block shuts the pool down, including on the return path.
    with worker_pool:
        return worker_pool.map(process_text, texts)


In [None]:


# Time the single-process baseline. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals; time.time() follows
# the system wall clock, which can be adjusted while the benchmark is running.
start_time = time.perf_counter()
single_process_results = process_dataframe_single_process(df)
single_process_duration = time.perf_counter() - start_time
print(f"Single process time: {single_process_duration:.2f} seconds")


In [None]:


# Pool sizes to benchmark, and the (num_processes, seconds) pair recorded for each.
num_processes_list = [2, 4, 8]
multiprocess_durations = []

for num_processes in num_processes_list:
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be distorted by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    # Rebound on every iteration: only the last pool size's results are kept.
    multiprocess_results = process_dataframe_multiprocess(df, num_processes)
    duration = time.perf_counter() - start_time
    multiprocess_durations.append((num_processes, duration))
    print(f"Multiprocess ({num_processes} processes) time: {duration:.2f} seconds")

In [None]:

# Assemble the x/y series for the chart; the single-process run is plotted
# as a pool size of 1 ahead of the multiprocessing measurements.
process_counts = [1, *num_processes_list]
durations = [single_process_duration]
durations.extend(elapsed for _, elapsed in multiprocess_durations)

# Line chart of elapsed time against number of processes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(process_counts, durations, marker='o')
ax.set_xlabel('Number of Processes')
ax.set_ylabel('Time (seconds)')
ax.set_title('Single Process vs Multiprocessing Execution Time')
ax.grid(True)
plt.show()