In [13]:
import pandas as pd
import nltk
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [14]:
# Load datasets
notes = pd.read_csv("NOTEEVENTS.csv", low_memory=False)  # Load clinical notes
diag = pd.read_csv("DIAGNOSES_ICD.csv", low_memory=False)  # Load diagnosis codes
iDiag = pd.read_csv("D_ICD_DIAGNOSES.csv", low_memory=False)  # Load ICD descriptions

# Combine the SHORT_TITLE and LONG_TITLE columns
iDiag['CombinedTitles'] = iDiag[['SHORT_TITLE', 'LONG_TITLE']].fillna('').agg(' '.join, axis=1)  # Merge title columns

In [15]:
vehicleRegex = (
    r'\b(?:mva|mba|vehicle|bus|pedestrian|passenger|ute|ped|bike|dirtbike|motorbike|pushbike|scooter|truck|'
    r'bicycle|motorcycle|driver|driving|rtc|rta|\d*km[a-zA-Z/]*|skateboard|surfing|surf|horse|collision|'
    r'crossing|buggy|ebike|jetski|vs car|car vs|car accident|moving car|traffic light|traffic lights|'
    r'hit by car|hit by a car|car hit|airbag|airbags|T boned)\b'
)

iDiagFiltered = iDiag[iDiag['CombinedTitles'].str.contains(vehicleRegex, case=False, na=False)]  # Filter titles

In [16]:
# Text preprocessing function
def preprocessText(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-letters
    tokens = word_tokenize(text)  # Tokenize text
    stopWords = set(stopwords.words('english'))  # Load stopwords
    tokens = [word for word in tokens if word not in stopWords]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()  # Initialize lemmatizer
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize tokens
    return ' '.join(tokens)  # Return processed text

iDiagFiltered = iDiagFiltered.copy()  # Make a copy to avoid modifying the original DataFrame
iDiagFiltered['ProcessedText'] = iDiagFiltered['CombinedTitles'].apply(preprocessText)  # Preprocess titles

In [17]:
# Create a document-term matrix
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)  # Initialize vectorizer
dtm = vectorizer.fit_transform(iDiagFiltered['ProcessedText'])  # Create document-term matrix

# Fit LDA model
lda = LatentDirichletAllocation(n_components=5, random_state=42)  # Initialize LDA
lda.fit(dtm)  # Fit LDA model

# Display the top words per topic
def displayTopics(model, featureNames, noTopWords):
    topicWords = {}
    for topicIdx, topic in enumerate(model.components_):  # Iterate topics
        topWords = " ".join([featureNames[i] for i in topic.argsort()[:-noTopWords - 1:-1]])  # Get top words
        topicWords[topicIdx] = topWords  # Store topic words
        print(f"\nTopic {topicIdx}:")
        print(topWords)  # Print topic words
        print("\n" + "-"*80 + "\n")
    return topicWords

noTopWords = 1000
topics = displayTopics(lda, vectorizer.get_feature_names_out(), noTopWords)  # Display topic words


Topic 0:
vehicle rider animal accident injuring animaldrawn occupant veh cycle pedal mvoth ped drawn anim person motor cyc accanim cyclist car streetcar unspecified specified nec no accpers accocc st vehpers accped objrider objped stndng crashing crash injury objdriv objpsgr suicide selfinflicted cycl psg involving rid collmcyc collpers collision driver collanim mvanim nosanim ridden accst pedestrian accpedest cont traff collst collmotcycl collped noncollision coll collpedest traffic nature motorcycle boarding alighting nontraffic colldriver collpasngr mv ntraf stnd stationary highway mov moving mvtrain oth passenger object loss stn due another train motordriven reentrant snow motorcyclist without undetermined accidentally inflicted purposely whether acc control offroad poisonexhaust poisoning exhaust gas nosped hit trainped transport mvpers fire explosion burning necped boardalightpsgr antecedent derailment boardalightdriv accidental accmcycl mvmcyc fall derailpers mvpedest mvmocycl 

In [18]:
# Get the topic distribution for each document
topicDistribution = lda.transform(dtm)  # Get topic distribution

# Identify documents NOT associated with Topic 2
nonTopic2Docs = iDiagFiltered[topicDistribution.argmax(axis=1) != 2]  # Filter out irrelevant topics

# Re-create the document-term matrix for the remaining documents
dtmRelevant = vectorizer.fit_transform(nonTopic2Docs['ProcessedText'])  # Recreate document-term matrix

# Re-run the LDA model on the filtered documents
ldaRelevant = LatentDirichletAllocation(n_components=4, random_state=42)  # Initialize new LDA
ldaRelevant.fit(dtmRelevant)  # Fit new LDA model

# Display the top words per topic in the filtered dataset
topicsRelevant = displayTopics(ldaRelevant, vectorizer.get_feature_names_out(), noTopWords)  # Display topics


Topic 0:
vehicle motor traffic accident injuring mv collision involving motorcycle unspecified passenger nature acc reentrant noncollision alighting boarding pedestrian nontraffic occupant coll driver traff streetcar car ntraf motorcyclist another mvoth veh psgr ntraff no person collpers colldriver collpasngr collst collpedest collis collanim nosdriver nospasngr collmcyc boardalightdriv boardalightpsgr nosst nosmcycl necst collmotcycl nospedest necmocycl necpedest boardalightpers psg accmcycl objpasngr injury train collped nospers nospassenger mvtrain necpers highway brdalitpers pedespers nec objpers specified objpedest nosanim accpers necped cycl objst rid nosped rider oth accdriv objmocycl cyc accped cyclist accpsgr animaldrawn pedal animal accanim accst objped nospedestrian cont offroad motordriven snow drawn without loss control due rr railway mvpers transport necpassenger st ridden accpedest ped accocc object objper anim fall hit stn stnd vehpers road trainped rd cycle operation 

In [19]:
# Extract ICD9 codes for the remaining relevant topics
icd9Relevant = nonTopic2Docs['ICD9_CODE'].unique()  # Extract relevant ICD-9 codes

# Print the extracted ICD-9 codes
print("\nUnique ICD-9 Codes for Relevant Topics:")
print(icd9Relevant)

# Merge DIAGNOSES_ICD with icd9Relevant to get relevant patient and admission IDs
relevantPatients = diag[diag['ICD9_CODE'].isin(icd9Relevant)][['SUBJECT_ID', 'HADM_ID']]  # Filter patients

# Drop duplicates to ensure we have unique patient and admission IDs
relevantPatients = relevantPatients.drop_duplicates()  # Remove duplicates

# Merge NOTEEVENTS with relevantPatients to extract matching observations
relevantNotes = pd.merge(notes, relevantPatients, on=['SUBJECT_ID', 'HADM_ID'], how='inner')  # Merge notes

# Display the relevant observations
print(f"Number of matching observations: {len(relevantNotes)}")
print(relevantNotes.head())  # Show relevant notes


Unique ICD-9 Codes for Relevant Topics:
['E8213' 'E8214' 'E8215' 'E8216' 'E8217' 'E8218' 'E8219' 'E8220' 'E8221'
 'E8222' 'E8223' 'E8224' 'E8225' 'E8226' 'E8227' 'E8228' 'E8229' 'E8230'
 'E8231' 'E8232' 'E8233' 'E8234' 'E8235' 'E8236' 'E8237' 'E8238' 'E8239'
 'E8240' 'E8241' 'E8242' 'E8243' 'E8244' 'E8245' 'E8259' 'E8260' 'E8261'
 'E8262' 'E8263' 'E8264' 'E8268' 'E8269' 'E8270' 'E8272' 'E8273' 'E8274'
 'E8278' 'E8279' 'E8280' 'E8284' 'E8290' 'E8294' 'E8298' 'E8299' 'E8252'
 'E8253' 'E8254' 'E8255' 'E8256' 'E8257' 'E8258' 'E8850' 'E8852' 'E8860'
 'E8869' 'E8000' 'E8001' 'E8002' 'E8003' 'E8008' 'E8009' 'E8010' 'E8011'
 'E8117' 'E8118' 'E8119' 'E8120' 'E8121' 'E8122' 'E8123' 'E9290' 'E9585'
 'E8041' 'E8042' 'E8043' 'E8051' 'E8052' 'E8053' 'V6511' 'E8323' 'E8124'
 'E8012' 'E8013' 'E8018' 'E8019' 'E8020' 'E8021' 'E8022' 'E8023' 'E8028'
 'E8029' 'E8031' 'E8032' 'E8033' 'E8125' 'E8126' 'E8127' 'E8128' 'E8129'
 'E8130' 'E8131' 'E8132' 'E8133' 'E8134' 'E8135' 'E8136' 'E8137' 'E8138'
 'E8139' '

In [20]:
# Step 1: Extract ICD-9 Codes for Relevant Topics
icd9Relevant = nonTopic2Docs['ICD9_CODE'].unique()

# Step 2: Filter Relevant Patients and Notes
relevantPatients = diag[diag['ICD9_CODE'].isin(icd9Relevant)][['SUBJECT_ID', 'HADM_ID']]
relevantPatients = relevantPatients.drop_duplicates()

relevantNotes = pd.merge(notes, relevantPatients, on=['SUBJECT_ID', 'HADM_ID'], how='inner')

# Step 3: Define the regex pattern for the relevant topic (kinetic accidents)
vehicle_regex = r'\b(?:mva|mba|vehicle|bus|pedestrian|passenger|ute|ped|bike|dirtbike|motorbike|pushbike|scooter|truck|bicycle|motorcycle|driver|driving|rtc|rta|\d*km[a-zA-Z/]*|skateboard|surfing|surf|horse|collision|crossing|buggy|ebike|jetski|vs car|car vs|car accident|moving car|traffic light|traffic lights|hit by car|hit by a car|car hit|airbag|airbags|T boned)\b'

# Step 4: Extract Relevant Lines or Return None
def extract_relevant_lines(text, regex_pattern):
    lines = text.split('\n')
    matching_lines = [line for line in lines if re.search(regex_pattern, line, re.IGNORECASE)]
    return matching_lines if matching_lines else None

matched_texts_relevant_lines = relevantNotes['TEXT'].apply(lambda text: extract_relevant_lines(text, vehicle_regex))

# Step 5: Combine Relevant Lines with HADM_ID
matched_texts_combined = pd.DataFrame({
    'HADM_ID': relevantNotes['HADM_ID'],
    'Relevant_Lines': matched_texts_relevant_lines
})

# Display the extracted information
print(matched_texts_combined.head())

    HADM_ID                                     Relevant_Lines
0  134640.0  [Pedestrian struck by motor vehicle, 60 year o...
1  127159.0  [involved in a motor vehicle accident. She was...
2  188655.0  [post motor vehicle accident on [**2132-7-9**]...
3  188655.0  [male who was a restrained passenger in a 110 ...
4  188655.0                                               None


In [21]:
# Filter for rows where Relevant_Lines is not null
nnl = matched_texts_combined[matched_texts_combined['Relevant_Lines'].notnull()]

# Extract the HADM_ID and the non-null Relevant_Lines
nnl = nnl[['HADM_ID', 'Relevant_Lines']]

# Display the extracted information
print(nnl.head())

len(nnl)

nnl['label'] = np.ones(len(nnl))
nnl.head()

    HADM_ID                                     Relevant_Lines
0  134640.0  [Pedestrian struck by motor vehicle, 60 year o...
1  127159.0  [involved in a motor vehicle accident. She was...
2  188655.0  [post motor vehicle accident on [**2132-7-9**]...
3  188655.0  [male who was a restrained passenger in a 110 ...
5  191263.0  [This is a 47 year old female who was in a mot...


Unnamed: 0,HADM_ID,Relevant_Lines,label
0,134640.0,"[Pedestrian struck by motor vehicle, 60 year o...",1.0
1,127159.0,[involved in a motor vehicle accident. She was...,1.0
2,188655.0,[post motor vehicle accident on [**2132-7-9**]...,1.0
3,188655.0,[male who was a restrained passenger in a 110 ...,1.0
5,191263.0,[This is a 47 year old female who was in a mot...,1.0


In [25]:
import pandas as pd

# Assuming 'diag' contains the ICD9_CODE information and 'notes' is your NOTEEVENTS DataFrame
# Merge notes with diag to ensure ICD9_CODE is in notes
notes_with_icd9 = pd.merge(notes, diag[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']], on=['SUBJECT_ID', 'HADM_ID'], how='left')

# Step 1: Filter NOTEEVENTS to exclude rows where ICD9_CODE matches icd9Relevant
non_matching_notes = notes_with_icd9[~notes_with_icd9['ICD9_CODE'].isin(icd9Relevant)]

# Step 2: Randomly sample len(nnl) rows from the non-matching notes
random_sample = non_matching_notes['TEXT'].sample(n=len(nnl), random_state=42)

# Step 3: Display the sampled rows
print(random_sample.head())

print(len(random_sample))

9299169     Pneumonia, bacterial, community acquired (CAP)...
10461640    Chief Complaint:\n   24 Hour Events:\n   Comfo...
23852980    Neonatology - NNP Physical Exam\n\nPlease see ...
3693201     Demographics\n   Day of intubation:\n   Day of...
3381758     Heart failure, right, isolated (Cor Pulmonale)...
Name: TEXT, dtype: object
22786


In [29]:
# Step 1: Merge 'notes' with 'diag' to ensure 'ICD9_CODE' is included
notes_with_icd9 = pd.merge(notes, diag[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']], on=['SUBJECT_ID', 'HADM_ID'], how='left')

# Step 2: Filter 'NOTEEVENTS' to exclude rows where 'ICD9_CODE' matches 'icd9Relevant'
non_matching_notes = notes_with_icd9[~notes_with_icd9['ICD9_CODE'].isin(icd9Relevant)]

# Step 3: Randomly sample 'len(nnl)' rows from the non-matching notes
random_sample = non_matching_notes[['HADM_ID', 'TEXT']].sample(n=len(nnl), random_state=42)

# Step 4: Assign label 0 to the sampled data and rename 'TEXT' to 'Relevant_Lines'
random_sample_df = pd.DataFrame({
    'HADM_ID': random_sample['HADM_ID'],
    'Relevant_Lines': random_sample['TEXT'],
    'label': 0
})

# Step 5: Ensure 'nnl' has label 1
nnl['label'] = 1

# Step 6: Concatenate 'nnl' with the sampled data
combined_df = pd.concat([nnl, random_sample_df], ignore_index=True)

# Step 7: Display the first few rows of the combined DataFrame
print(combined_df.head())

    HADM_ID                                     Relevant_Lines  label
0  134640.0  [Pedestrian struck by motor vehicle, 60 year o...      1
1  127159.0  [involved in a motor vehicle accident. She was...      1
2  188655.0  [post motor vehicle accident on [**2132-7-9**]...      1
3  188655.0  [male who was a restrained passenger in a 110 ...      1
4  191263.0  [This is a 47 year old female who was in a mot...      1


In [30]:
print(combined_df['label'].value_counts())

label
1    22786
0    22786
Name: count, dtype: int64


In [31]:
# Save the combined DataFrame to a CSV file
combined_df.to_csv('train.csv', index=False)

# Confirm the save by printing a message
print("combined_df has been saved to 'train.csv'")

combined_df has been saved to 'train.csv'
