# Create a CSV file with CheXpert medical annotations

In [None]:
!pip install transformers -q
!pip install sentencepiece -q

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sklearn


In [None]:
# Attach Google Drive to the Colab runtime, forcing a fresh mount if one already exists.
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Read the MIMIC-CXR table of CheXpert medical annotations from the project's GitHub repository.
chexpert_df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/orsho/Data-to-Text-Generation-Of-Radiologist-Reports/main/Data/mimic-cxr-2.0.0-chexpert.csv"
)
# Preview the first rows.
chexpert_df.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


In [None]:
# Read the MIMIC-CXR study list (subject id, study id, report path) from the project's GitHub repository.
study_list_df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/orsho/Data-to-Text-Generation-Of-Radiologist-Reports/main/Data/cxr-study-list.csv"
)
# Preview the first rows.
study_list_df.head()

Unnamed: 0,subject_id,study_id,path
0,10000032,50414267,files/p10/p10000032/s50414267.txt
1,10000032,53189527,files/p10/p10000032/s53189527.txt
2,10000032,53911762,files/p10/p10000032/s53911762.txt
3,10000032,56699142,files/p10/p10000032/s56699142.txt
4,10000764,57375967,files/p10/p10000764/s57375967.txt


In [None]:
#merge the two datasets so that every annotated study also carries its report path
merge_keys = ['subject_id', 'study_id']
chex_and_paths = pd.merge(chexpert_df, study_list_df, on=merge_keys)

#the annotation columns are all columns of the label table except the merge keys;
#selecting them by name keeps the id and path columns out of the relabelling below
label_columns = [name for name in chexpert_df.columns if name not in merge_keys]

#numeric labels (1,0,-1) and the string labels that replace them
label_words = {1.0: 'positive', 0.0: 'negative', -1.0: 'uncertain'}

#combine the medical observation with the label, e.g. 1.0 under "Edema" -> "Edema positive";
#missing values (and any value that is not a known label) are passed through unchanged
for column_name in label_columns:
    chex_and_paths[column_name] = chex_and_paths[column_name].map(
        lambda code, column_name=column_name: (
            column_name + ' ' + label_words[code] if code in label_words else code
        )
    )

#merge all medical annotations of a study into one cell, skipping the missing ones
chex_and_paths['medical tags'] = chex_and_paths[label_columns].apply(
    lambda row: ' ,'.join(row.dropna().astype(str)),
    axis=1
)

#drop the id and per-observation columns; the report path and the new tags column remain
chex_and_paths = chex_and_paths.drop(columns=merge_keys + label_columns)

#create impression and findings columns (filled in once the reports are parsed)
chex_and_paths["report_impression"] = np.nan
chex_and_paths["report_findings"] = np.nan
chex_and_paths.head(10)


227827


Unnamed: 0,path,medical tags,report_impression,report_findings
0,files/p10/p10000032/s50414267.txt,No Finding positive,,
1,files/p10/p10000032/s53189527.txt,No Finding positive,,
2,files/p10/p10000032/s53911762.txt,No Finding positive,,
3,files/p10/p10000032/s56699142.txt,No Finding positive,,
4,files/p10/p10000764/s57375967.txt,"Consolidation positive ,Pneumonia uncertain",,
5,files/p10/p10000898/s50771383.txt,No Finding positive,,
6,files/p10/p10000898/s54205396.txt,No Finding positive,,
7,files/p10/p10000935/s50578979.txt,"Edema uncertain ,Lung Opacity uncertain ,Pleur...",,
8,files/p10/p10000935/s51178377.txt,"Lung Opacity positive ,Pneumonia uncertain",,
9,files/p10/p10000935/s55697293.txt,No Finding positive,,


In [None]:
#extract the top n words in a df column
def frequent_words(n, df, column):
    """Return the ``n`` most frequent words found in ``df[column]``.

    A word is any match of the regular expression ``\\w+``; matching is
    case-sensitive. Cells that do not hold a string (for example NaN) are
    skipped instead of raising.

    Parameters:
        n: maximum number of words to return; a non-positive value yields
            an empty list.
        df: table containing the text column.
        column: label of the column whose text is counted.

    Returns:
        A list of at most ``n`` words ordered from most to least frequent.
        Words with equal counts keep the order in which they were first seen.
    """
    #count words in the given column
    count_freq = {}
    for text in df[column]:
        #cells without text contribute no words
        if not isinstance(text, str):
            continue
        for word in re.findall(r'\w+', text):
            count_freq[word] = count_freq.get(word, 0) + 1

    #sort from most frequent to least; sorted() is stable, so ties keep first-seen order
    ranked = sorted(count_freq.items(), key=lambda item: item[1], reverse=True)

    #extract only the first n words
    return [word for word, _ in ranked[:max(n, 0)]]
        
#calculate the word count of every non-empty text in a column
def word_count(dataset, column):
    """Return the number of whitespace-separated words of each text in ``dataset[column]``.

    Texts that contain no words (empty or whitespace only) are left out of
    the result, and so are entries that are not strings (for example NaN),
    which would otherwise raise on ``.split()``.

    Parameters:
        dataset: any object for which ``dataset[column]`` yields the texts
            when iterated.
        column: key or label of the text column.

    Returns:
        A list of positive integers, one per non-empty text, in input order.
    """
    len_vector = []
    for text in dataset[column]:
        #skip missing / non-text entries
        if not isinstance(text, str):
            continue
        #split once and reuse the count
        n_words = len(text.split())
        if n_words > 0:
            len_vector.append(n_words)

    return len_vector


#section headers that mark where the impression / findings text starts
impression_found = "IMPRESSION"
findings_found = "FINDINGS"

#folder on the mounted Google Drive that the 'path' column is relative to
#(point this at a local copy of the reports to run outside Colab)
reports_root = "/content/drive/My Drive/Final project - Zebra/Data/"


def _split_report_sections(lines):
    """Collect the impression and the findings text from the lines of one report.

    The impression starts at the first line containing ``impression_found``
    and runs to the end of the report. The findings start at a line
    containing ``findings_found`` and stop at a line containing
    ``impression_found``; without such a line they run to the end.

    Parameters:
        lines: iterable of text lines (an open text file works).

    Returns:
        A tuple ``(impression, findings)`` of the concatenated lines; either
        part is an empty string when its header never appears.
    """
    impression_parts = []
    findings_parts = []
    in_impression = False
    in_findings = False

    for line in lines:
        has_impression_header = impression_found in line

        #once the impression header is seen, keep every line from there on
        if has_impression_header:
            in_impression = True
        if in_impression:
            impression_parts.append(line)

        #findings are switched on by their header and off by the impression header;
        #the impression header wins when both occur on the same line
        if findings_found in line:
            in_findings = True
        if has_impression_header:
            in_findings = False
        if in_findings:
            findings_parts.append(line)

    return "".join(impression_parts), "".join(findings_parts)


#iterate through all reports
impressions = []
findings = []
for path_report in chex_and_paths['path']:
    #the context manager closes each file, so handles are not leaked across the loop
    with open(reports_root + path_report, "r") as report:
        impression_str, findings_str = _split_report_sections(report)

    #clean results: drop the newline characters so each section sits on one line
    impressions.append(impression_str.replace('\n', ''))
    findings.append(findings_str.replace('\n', ''))

#assign whole columns at once instead of writing strings cell by cell into float columns
chex_and_paths['report_impression'] = impressions
chex_and_paths['report_findings'] = findings

chex_and_paths.head(15)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
files/p19/p19783525/s53741200.txt
files/p19/p19783525/s56694368.txt
files/p19/p19783525/s59998558.txt
files/p19/p19783770/s51681446.txt
files/p19/p19783776/s51717211.txt
files/p19/p19783776/s53875203.txt
files/p19/p19783776/s58204328.txt
files/p19/p19783776/s59245663.txt
files/p19/p19783886/s51664334.txt
files/p19/p19783898/s50431055.txt
files/p19/p19783898/s55655942.txt
files/p19/p19784186/s57069539.txt
files/p19/p19784470/s58142734.txt
files/p19/p19784489/s50475500.txt
files/p19/p19784489/s52741590.txt
files/p19/p19784489/s59362018.txt
files/p19/p19784864/s59522794.txt
files/p19/p19784979/s52721889.txt
files/p19/p19785654/s57719522.txt
files/p19/p19785672/s50124129.txt
files/p19/p19785672/s51310810.txt
files/p19/p19785672/s51979988.txt
files/p19/p19785672/s52627121.txt
files/p19/p19785672/s53315539.txt
files/p19/p19785672/s55410940.txt
files/p19/p19785715/s52070987.txt
files/p19/p19785715/s52189953.txt
files/p19/p197857

Unnamed: 0,path,medical tags,report_impression,report_findings
0,files/p10/p10000032/s50414267.txt,No Finding positive,IMPRESSION: No acute cardiopulmonary process.,"FINDINGS: There is no focal consolidation, ..."
1,files/p10/p10000032/s53189527.txt,No Finding positive,IMPRESSION: No acute cardiopulmonary abnorm...,"FINDINGS: The cardiac, mediastinal and hila..."
2,files/p10/p10000032/s53911762.txt,No Finding positive,IMPRESSION: No acute intrathoracic process.,FINDINGS: Single frontal view of the chest ...
3,files/p10/p10000032/s56699142.txt,No Finding positive,IMPRESSION: No acute cardiopulmonary process.,FINDINGS: The lungs are clear of focal cons...
4,files/p10/p10000764/s57375967.txt,"Consolidation positive ,Pneumonia uncertain",IMPRESSION: Focal consolidation at the left...,FINDINGS: PA and lateral views of the chest...
5,files/p10/p10000898/s50771383.txt,No Finding positive,IMPRESSION: No acute intrathoracic process.,FINDINGS: PA and lateral views of the chest...
6,files/p10/p10000898/s54205396.txt,No Finding positive,IMPRESSION: No evidence of acute cardiopulm...,FINDINGS: As compared to the prior examinat...
7,files/p10/p10000935/s50578979.txt,"Edema uncertain ,Lung Opacity uncertain ,Pleur...",IMPRESSION: 1. Low lung volumes and mild pul...,FINDINGS: Lung volumes remain low. There ar...
8,files/p10/p10000935/s51178377.txt,"Lung Opacity positive ,Pneumonia uncertain",IMPRESSION: Increasing left lung opacificati...,FINDINGS: The lung volumes are low. The car...
9,files/p10/p10000935/s55697293.txt,No Finding positive,IMPRESSION: Stable chest radiograph.,FINDINGS: Heart size is normal. Mediastinal...


## Download as CSV

In [None]:
# Write the assembled table to a CSV file (without the row index) and hand it to the browser for download.
from google.colab import drive
from google.colab import files

chex_and_paths.to_csv(path_or_buf='chexpert labels_and_reports.csv', index=False)
files.download(filename="chexpert labels_and_reports.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>