[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/medspacy/medspacy/blob/simple_scale/notebooks/benchmark_scaling/benchmark_multithread.ipynb)


In [None]:
!pip install medspacy

In [None]:

import pandas as pd
import threading
from queue import Queue
import medspacy
from medspacy.ner import TargetRule
import time


In [None]:

# Benchmark corpus: four short clinical sentences repeated 1000 times
# (4000 rows in total), stored in a single 'text' column.
data = 1000 * [
    'Patient shows symptoms of flu and other complications.',
    'Diagnosis indicates pneumonia.',
    'Follow-up required for diabetes management.',
    'Patient is recovering well, but needs to keep taking beta blockers for hypertension.',
]
df = pd.DataFrame(data, columns=['text'])


# Build a medspaCy pipeline whose target matcher knows the benchmark concepts
def setup_nlp():
    """Return a freshly loaded medspaCy pipeline with the benchmark target rules.

    Every call loads a new pipeline through ``medspacy.load()`` and registers
    six literal ``TargetRule`` entries on its ``medspacy_target_matcher``
    component, so each caller gets an independent pipeline object.
    """
    # (literal, category) pairs, listed in registration order.
    concepts = (
        ("flu", "PROBLEM"),
        ("pneumonia", "PROBLEM"),
        ("hemicolectomy", "TREATMENT"),
        ("beta blockers", "TREATMENT"),
        ("hypertension", "PROBLEM"),
        ("diabetes", "PROBLEM"),
    )

    pipeline = medspacy.load()
    matcher = pipeline.get_pipe("medspacy_target_matcher")
    matcher.add(
        [TargetRule(literal=term, category=label) for term, label in concepts]
    )
    return pipeline


# Shared pipeline for the single-instance benchmark: built once when this cell
# runs and reused for every text by process_dataframe_single, so that benchmark
# does not construct a pipeline per call.
single_nlp = setup_nlp()


# Baseline: run every text through the one shared pipeline, sequentially
def process_dataframe_single(df):
    """Process each entry of ``df['text']`` with the shared ``single_nlp`` pipeline.

    Returns a list holding one pipeline result per text, in column order.
    """
    return [single_nlp(entry) for entry in df['text']]


# Per-text unit of work executed by the worker threads
def process_text_multithread(text, output_queue, nlp_instance):
    """Apply ``nlp_instance`` to ``text`` and put the result on ``output_queue``.

    Nothing is returned; the result is only delivered through the queue.
    """
    output_queue.put(nlp_instance(text))


# Function to process DataFrame with multithreading
def process_dataframe_multithread(df, num_threads):
    """Process ``df['text']`` with ``num_threads`` threads, one pipeline per thread.

    The texts are dealt out round-robin: thread ``i`` handles positions
    ``i, i + num_threads, i + 2 * num_threads, ...``. Each thread uses its own
    pipeline built by ``setup_nlp()``, so no pipeline object is shared
    between threads.

    Args:
        df: Object whose ``'text'`` entry yields the texts to process
            (the notebook passes a pandas DataFrame).
        num_threads: Number of worker threads to start; must be at least 1.

    Returns:
        A list with one pipeline result per text, in the same order as
        ``df['text']``.

    Raises:
        ValueError: If ``num_threads`` is smaller than 1.
        Exception: The first error raised inside a worker thread, re-raised
            once all threads have finished.
    """
    # Without this check a non-positive count would start no threads and
    # silently return an empty list, dropping every input text.
    if num_threads < 1:
        raise ValueError(f"num_threads must be at least 1, got {num_threads!r}")

    texts = list(df['text'])
    # Pre-sized so each worker can store its result at the text's own
    # position; draining a shared queue would return results in completion
    # order instead of input order.
    results = [None] * len(texts)
    errors = []
    nlp_pool = [setup_nlp() for _ in range(num_threads)]

    def worker(offset, nlp):
        # Each worker touches a disjoint set of indices, so no lock is needed.
        try:
            for position in range(offset, len(texts), num_threads):
                results[position] = nlp(texts[position])
        except Exception as exc:
            # Collected here and re-raised in the calling thread after join,
            # so a failing worker cannot produce a silently partial result.
            errors.append(exc)

    threads = [
        threading.Thread(target=worker, args=(offset, nlp))
        for offset, nlp in enumerate(nlp_pool)
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    if errors:
        raise errors[0]

    return results

In [None]:
# Benchmark single-instance processing on the sample DataFrame `df`.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
single_results = process_dataframe_single(df)
single_duration = time.perf_counter() - start_time
print(f"Single instance processing time: {single_duration:.2f} seconds")



In [None]:
# Benchmark multithreaded processing on the sample DataFrame `df`.
# The measured time includes building one pipeline per thread, because
# process_dataframe_multithread creates its pipelines internally.
num_threads = 3  # You can change the number of threads to test as needed
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
multi_results = process_dataframe_multithread(df, num_threads)
multi_duration = time.perf_counter() - start_time
print(f"Multithreaded processing time with {num_threads} threads: {multi_duration:.2f} seconds")