# **Medical Transcriptions**

### **Dataset Preprocessing**

In [1]:
from google.colab import files
import pandas as pd
import io

df = pd.read_csv('/content/mtsamples.csv')
print(df)

      Unnamed: 0                                        description  \
0              0   A 23-year-old white female presents with comp...   
1              1           Consult for laparoscopic gastric bypass.   
2              2           Consult for laparoscopic gastric bypass.   
3              3                             2-D M-Mode. Doppler.     
4              4                                 2-D Echocardiogram   
...          ...                                                ...   
4994        4994   Patient having severe sinusitis about two to ...   
4995        4995   This is a 14-month-old baby boy Caucasian who...   
4996        4996   A female for a complete physical and follow u...   
4997        4997   Mother states he has been wheezing and coughing.   
4998        4998   Acute allergic reaction, etiology uncertain, ...   

                medical_specialty                                sample_name  \
0            Allergy / Immunology                         Allergic 

### **Age Extraction**

In [2]:
%pip install transformers



In [3]:
import torch
import re
from transformers import AutoTokenizer, AutoModel

access_token = "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"

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

# Define a pattern for age
age_pattern = r'(\d{1,3})[ -]year[ -]old|\bage[ -]?(\d{1,3})|\baged[ -]?(\d{1,3})\b'

# Function to extract age from text
def extract_age(text):
    matches = re.finditer(age_pattern, text, re.IGNORECASE)
    for match in matches:
        # This will return the first captured group that is not None
        age = next((group for group in match.groups() if group is not None), None)
        if age:
            return int(age)
    return None

# Ensure both columns are treated as strings (to handle any NaN or None values)
df['information'] = df['description'].astype(str) + " " + df['transcription'].astype(str)

# Apply the extract_age function
df['age'] = df['information'].apply(extract_age)

# Let's try to display the results
print(df[['description', 'transcription', 'age']].head())
# df.to_csv('/content/age.csv', index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

                                         description  \
0   A 23-year-old white female presents with comp...   
1           Consult for laparoscopic gastric bypass.   
2           Consult for laparoscopic gastric bypass.   
3                             2-D M-Mode. Doppler.     
4                                 2-D Echocardiogram   

                                       transcription   age  
0  SUBJECTIVE:,  This 23-year-old white female pr...  23.0  
1  PAST MEDICAL HISTORY:, He has difficulty climb...   NaN  
2  HISTORY OF PRESENT ILLNESS: , I have seen ABC ...   NaN  
3  2-D M-MODE: , ,1.  Left atrial enlargement wit...   NaN  
4  1.  The left ventricular cavity size and wall ...   NaN  


### **Treatment Extraction**

In [7]:
import torch
from transformers import AutoTokenizer, AutoModel

access_token = "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"

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

def extract_treatment_sentences(text):
    # Placeholder for keywords related to treatments
    treatment_keywords = ['treatment', 'therapy', 'surgery', 'procedure', 'medication', 'radiation']

    # Split the text into sentences
    sentences = text.split('. ')

    # Filter sentences that contain any of the treatment keywords
    treatment_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in treatment_keywords)]

    treatment_info = ' '.join(treatment_sentences)

    return treatment_info

# Apply the function to extract treatment-related sentences
df['treatment_info'] = df['transcription'].apply(lambda x: extract_treatment_sentences(x) if pd.notnull(x) else None)

# Check the results
print(df[['transcription', 'treatment_info']].head())
df.to_csv('/content/new_mtsamples.csv', index=False)

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   
1  PAST MEDICAL HISTORY:, He has difficulty climb...   
2  HISTORY OF PRESENT ILLNESS: , I have seen ABC ...   
3  2-D M-MODE: , ,1.  Left atrial enlargement wit...   
4  1.  The left ventricular cavity size and wall ...   

                                      treatment_info  
0   She does have asthma but doest not require da...  
1   He has gastroesophageal reflux disease.,PAST ...  
2   He is a diabetic, on medication  He has hemor...  
3                                                     
4                                                     
