In [186]:
import re
import os
import glob
import shutil
import time
import subprocess

import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Notebook display settings: no cap on the number of columns or rows shown,
# and up to 255 characters per cell before truncation.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.max_colwidth', 255),
):
    pd.set_option(option_name, option_value)

#### Download Patient Dataset

In [3]:
# Load the patient metadata table; the download and conversion loops below iterate over it.
patient_df = pd.read_csv('./out/metadata/patients.csv')

In [4]:
# Keep only the two columns the downloader needs: the patient code and its Drive link.
patient_df = patient_df.loc[:, ['Patient', 'Link Drive']]

In [None]:
# Download each patient's Drive folder with the helper script.
# The script's exit status is checked so that failed downloads are reported
# instead of silently leaving an empty or missing folder behind.
for index, row in patient_df.iterrows():
    completed = subprocess.run(
        ["./stethosoul_downloader.sh", row['Patient'], row['Link Drive']],
        capture_output=False,
    )
    if completed.returncode != 0:
        print(f"Download failed for {row['Patient']} (exit code {completed.returncode})")

#### Download Control Dataset

In [5]:
# Load the control metadata table; the download and conversion loops below iterate over it.
control_df = pd.read_csv('./out/metadata/controls.csv')

In [6]:
# Keep only the two columns the downloader needs: the control's ID and its Drive link.
control_df = control_df.loc[:, ['Patient ID', 'Link']]

In [446]:
!cat ./stethosoul_downloader.sh

#!/bin/bash
#
# Download one participant's Google Drive folder with gdown.
# Usage: stethosoul_downloader.sh <patientCode> <driveUrl>

# Check if exactly two parameters are provided
if [ "$#" -ne 2 ]; then
  echo "Please provide <patientCode> <driveUrl>"
  exit 1
fi

patientCode="$1"

driveUrl="$2"

# NOTE(review): the destination is a hard-coded absolute path under .../controls/,
# yet the notebook also calls this script from the patient download loop.
# Confirm the folder is switched (or parameterized) before downloading patients.
targetFolder="/Users/naufalihsan/Downloads/stethosoul_clean/controls/$patientCode"

# Quote both expansions: an unquoted URL containing '?' or '*' is subject to
# glob expansion, and a path containing spaces would be split into several arguments.
gdown "$driveUrl" -O "$targetFolder" --folder


In [None]:
# Download each control's Drive folder with the helper script.
# The script's exit status is checked so that failed downloads are reported
# instead of silently leaving an empty or missing folder behind.
for index, row in control_df.iterrows():
    completed = subprocess.run(
        ["./stethosoul_downloader.sh", row['Patient ID'], row['Link']],
        capture_output=False,
    )
    if completed.returncode != 0:
        print(f"Download failed for {row['Patient ID']} (exit code {completed.returncode})")

#### Utility Functions

In [20]:
class Patient:
    """Path helper for one patient, identified by its patient code."""

    def __init__(self, patient_code):
        # The code is interpolated into both the input folder and the output file name.
        self.patient_code = patient_code

    def get_folder(self):
        """Return the relative folder path ``./patients/<patient_code>``."""
        code = self.patient_code
        return f'./patients/{code}'

    def get_out_file(self):
        """Return the relative JSON path ``./out/patients/<patient_code>.json``."""
        code = self.patient_code
        return f'./out/patients/{code}.json'

In [21]:
class Control:
    """Path helper for one control, identified by its code."""

    def __init__(self, patient_code):
        # The code is interpolated into both the input folder and the output file name.
        self.patient_code = patient_code

    def get_folder(self):
        """Return the relative folder path ``./controls/<patient_code>``."""
        code = self.patient_code
        return f'./controls/{code}'

    def get_out_file(self):
        """Return the relative JSON path ``./out/controls/<patient_code>.json``."""
        code = self.patient_code
        return f'./out/controls/{code}.json'

In [339]:
def normalize_text(text):
    """Return ``text`` with every newline character removed."""
    segments = text.split('\n')
    return ''.join(segments)

In [9]:
def get_topic(text):
    """Build a topic tag ``T<n>`` from the third dash-separated field of ``text``.

    Raises IndexError when ``text`` has fewer than three dash-separated fields
    and ValueError when the third field is not an integer.
    """
    fields = text.split('-')
    topic_number = int(fields[2])
    return f'T{topic_number}'

In [21]:
def get_label(text):
    """Parse the integer that prefixes a label line.

    The line is split on '_' when it contains an underscore, otherwise on '-',
    and the first field is converted with ``int``.

    Returns the parsed integer, or 0 when the input cannot be parsed
    (non-numeric prefix, empty string, or a non-string value such as None).
    """
    try:
        delim = '_' if '_' in text else '-'
        return int(text.split(delim)[0])
    except (TypeError, ValueError, AttributeError):
        # Best-effort parse: only data errors fall back to 0, so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return 0

In [178]:
def is_label(text):
    """Match ``text`` against the label-line pattern.

    A label line is one or more digits, then '-' or '_', then only ASCII
    letters or whitespace up to the end of the string.

    Returns the ``re.Match`` object when the pattern matches, otherwise None.
    """
    # Raw string: '\d' and '\s' in a normal string literal are invalid escape
    # sequences and trigger a warning on current Python versions.
    return re.match(r'^\d+[-_][A-Za-z\s]*$', text)

In [175]:
def format_notula(patient):
    """Parse every ``*.txt`` file in the patient's folder and write one JSON file.

    For each file the non-blank lines are read and stripped of newlines; the
    first line supplies the topic (``get_topic``), the second the label
    (``get_label``) and the lines from the fourth onward are joined into the text.
    The encodings are tried in order and the first one that parses wins.

    Args:
        patient: object providing ``patient_code``, ``get_folder()`` and
            ``get_out_file()``.

    Side effects:
        Writes the collected records as JSON to ``patient.get_out_file()`` and
        prints a message for every file that no encoding could parse.
    """
    data = []

    encodings = ['utf-8', 'utf-16', 'utf-16-le']

    for fn in sorted(glob.glob(f'{patient.get_folder()}/*.txt')):
        for encoding in encodings:
            try:
                # errors='ignore' drops undecodable bytes, so an unsuitable encoding
                # usually shows up as a failure while parsing the header lines
                # below rather than as a decode error.
                with open(fn, encoding=encoding, errors='ignore') as f:
                    lines = f.readlines()

                modified_list = [
                    normalize_text(line) for line in lines if line and line.strip()
                ]

                record = {
                    'patient': patient.patient_code,
                    'topic': get_topic(modified_list[0]),
                    'label': get_label(modified_list[1]),
                    # NOTE(review): the guard checks for more than 2 lines but the
                    # slice starts at index 3; confirm whether skipping the third
                    # line is intended.
                    'text': ''.join(modified_list[3:]) if len(modified_list) > 2 else None
                }
            except Exception:
                # This encoding did not produce a parseable file; try the next one.
                continue

            data.append(record)
            break
        else:
            # Previously such files were dropped without any trace.
            print(f'format_notula: skipped {fn} (not parseable with {encodings})')

    df = pd.DataFrame.from_dict(data)
    df.to_json(patient.get_out_file())

In [396]:
def format_paragraph(patient):
    """Unfinished converter for the 'paragraph' layout.

    Each ``*.txt`` file in the patient's folder is read as utf-8 (undecodable
    bytes ignored) and its non-blank lines are normalized, but nothing is
    recorded yet, so an empty DataFrame is written to ``patient.get_out_file()``.
    """
    encodings = ['utf-8', 'utf-16', 'utf-16-le']
    records = []

    for path in sorted(glob.glob(f'{patient.get_folder()}/*.txt')):
        with open(path, encoding=encodings[0], errors='ignore') as handle:
            raw_lines = handle.readlines()

        non_blank = [line for line in raw_lines if line and line.strip()]
        modified_list = [normalize_text(line) for line in non_blank]

        # TODO: Optimize read paragraphs

    pd.DataFrame.from_dict(records).to_json(patient.get_out_file())

In [23]:
def convert_to_json(patient, format=None):
    """Convert one participant's text files to JSON using the chosen layout.

    Args:
        patient: object passed through to the formatter.
        format: ``None`` or ``'notula'`` selects ``format_notula``;
            ``'paragraph'`` selects ``format_paragraph``.

    Errors are printed rather than raised so that a batch loop keeps running.
    An unrecognized ``format`` is now reported the same way instead of being
    silently ignored.
    """
    try:
        if not format or format == 'notula':
            format_notula(patient)
        elif format == 'paragraph':
            format_paragraph(patient)
        else:
            raise ValueError(
                f"Unsupported format {format!r}; expected 'notula' or 'paragraph'"
            )
    except Exception as e:
        print(e)

#### Convert Patients Data to JSON

In [454]:
# Reference metadata for patients; the conversion loop below compares parsed labels
# against its columns whose names match ^T\d+.
# NOTE: the name `meta` is rebound to the controls metadata later in the notebook.
meta = pd.read_json('./out/metadata/patients.json')

In [513]:
# Patient codes skipped by the automatic conversion loop (via `exclusion_lists`).
# NOTE(review): presumably these are the patients handled through the manual CSV
# in the "Insert Mislabeled Data" section; confirm.
exclude_mislabeled = {
    'ED24', 
    'AK23', 
    'AQ19', 
    'YG78', 
    'TQ36', 
    'PK56', 
    'QX63', 
    'LC01', 
    'HD95',
}

In [514]:
# Patient codes skipped by the automatic conversion loop (via `exclusion_lists`).
# The trailing comment on each entry records the encoding noted for that patient's files.
# NOTE(review): presumably these files use the paragraph layout that
# format_paragraph does not handle yet; confirm.
exclude_paragraph = {
    'KY48', # utf-8
    'ZN39', # utf-8
    'DC60', # utf-16-le
    'AK93', # utf-16-le
    'WK67', # utf-16-le
    'PM27', # utf-16-le
    'MB63', # utf-16-le
    'IT99', # utf-16-le
    'PJ95', # utf-16-le
    'WJ82', # utf-16-le
    'NH78', # utf-8
    'GR42', # utf-16-le
    'MC64', # utf-8
    'ER54', # utf-8
    'QM18', # utf-16-le
    'KM01', # utf-16-le
    'CG88', # utf-16-le
    'DE79', # utf-16-le
    'DB90', # utf-16-le
    'IB97', # utf-16-le
    'UR50', # utf-16-le
    'FC59', # utf-8
    'MU30', # utf-16-le
    'QR25', # utf-16-le
    'FN52', # utf-8
}

In [515]:
# Patient codes skipped by the automatic conversion loop (via `exclusion_lists`).
# The trailing comment on each entry records the encoding noted for that patient's files.
# NOTE(review): the reason implied by "split" is not visible here; document it.
exclude_split = {
    'FS73', # utf-8
    'RR54', # utf-8
    'TS31', # utf-8
    'DE60', # utf-8
}

In [511]:
# All patient exclusion sets; a patient found in any of them is skipped by the conversion loop.
exclusion_lists = [exclude_paragraph, exclude_mislabeled, exclude_split]

In [None]:
# Convert every non-excluded patient to JSON, then cross-check the parsed labels
# against the T<n> columns of the patient metadata.
for index, row in patient_df.iterrows():
    patient = Patient(row['Patient'])

    if any(patient.patient_code in exclusion_list for exclusion_list in exclusion_lists):
        continue

    convert_to_json(patient)

    try:
        out = pd.read_json(patient.get_out_file())
        out_labels = out['label'].tolist()
    except Exception as exc:
        # Keep the original cause attached instead of raising a bare Exception.
        raise RuntimeError(
            f'Could not read converted labels for {patient.patient_code}'
        ) from exc

    # metadata checker
    query = meta.query(f'Patient == "{patient.patient_code}"').filter(regex=r'^T\d+', axis=1)
    if query.empty:
        # Avoid an opaque IndexError from iloc[0] when nothing matches.
        print(f'{patient.patient_code}', 'no metadata labels found')
        continue
    csv_labels = query.iloc[0].tolist()

    if out_labels != csv_labels:
        diff_indices = [f'T{i+1}' for i, (x, y) in enumerate(zip(out_labels, csv_labels)) if x != y]
        print(f'{patient.patient_code}', diff_indices)

        # zip() stops at the shorter list, so a pure length mismatch would
        # otherwise show up as an empty diff.
        if len(out_labels) != len(csv_labels):
            print(
                f'{patient.patient_code}',
                f'label count mismatch: {len(out_labels)} parsed vs {len(csv_labels)} in metadata',
            )

#### Insert Mislabeled Data

In [14]:
# Manually prepared records; the loop below writes them out per patient using the `patient` column.
manual_df = pd.read_csv('./out/manual/D1_KNK_Manual [ML].csv')

In [18]:
# Distinct patient codes present in the manual file, in order of first appearance.
patient_codes = manual_df['patient'].unique().tolist()

In [23]:
# Display the codes that will be written from the manual file.
patient_codes

['ED24', 'AK23', 'AQ19', 'YG78', 'TQ36', 'PK56', 'QX63', 'LC01', 'HD95']

In [22]:
# Write the manually prepared rows of each patient to that patient's JSON output file.
for patient_code in patient_codes:
    patient = Patient(patient_code)

    # Use a dedicated name: rebinding the global `patient_df` here would clobber
    # the metadata frame that the download and conversion loops iterate over,
    # breaking any re-run of those cells.
    manual_patient_df = manual_df.query(f'patient == "{patient_code}"')
    manual_patient_df.to_json(patient.get_out_file())

#### Convert Controls Data to JSON

In [421]:
# Reference metadata for controls, used by the checker loop below.
# NOTE: this rebinds `meta`, which held the patient metadata earlier in the notebook.
meta = pd.read_csv('./out/metadata/controls.csv')

In [449]:
# Control codes whose labels are not in the expected place; the trailing comment
# on each entry gives the reason.
# NOTE: this rebinds `exclude_mislabeled`, which held the patient exclusions earlier
# in the notebook (and is still referenced by `exclusion_lists`' original set object).
exclude_mislabeled = {
    'JR64', # label on filename
    'GC25', # label on filename
    'ED91', # label on filename
    
    'SW53', # label using pattern topic_label
    'TO27', # label using pattern topic_label
    'AG18', # label using pattern topic_label
    'NF75', # label using pattern topic_label
    'XM81', # label using pattern topic_label
    'PV79', # label using pattern topic_label
    'GB10', # label using pattern topic_label
    'VV19', # label using pattern topic_label
    'TK91', # label using pattern topic_label
    'CV83', # label using pattern topic_label
    'SU94', # label using pattern topic_label
    'KY13', # label using pattern topic_label
    'NC09', # label using pattern topic_label
    'OY45', # label using pattern topic_label
    'GI86', # label using pattern topic_label
}

In [451]:
# Convert every non-excluded control to JSON, then cross-check the parsed labels
# against the T<n> columns of the control metadata.
for index, row in control_df.iterrows():
    patient = Control(row['Patient ID'])

    # Skip the controls listed in `exclude_mislabeled`, the exclusion set defined
    # for controls; no `exclude_error` set is defined anywhere in this notebook.
    if patient.patient_code in exclude_mislabeled:
        continue

    convert_to_json(patient)

    try:
        out = pd.read_json(patient.get_out_file())
        out_labels = out['label'].tolist()
    except Exception as exc:
        # Keep the original cause attached instead of raising a bare Exception.
        raise RuntimeError(
            f'Could not read converted labels for {patient.patient_code}'
        ) from exc

    # metadata checker
    query = meta.query(f'`Patient ID` == "{patient.patient_code}"').filter(regex=r'^T\d+', axis=1)
    if query.empty:
        # Avoid an opaque IndexError from iloc[0] when nothing matches.
        print(f'{patient.patient_code}', 'no metadata labels found')
        continue
    csv_labels = query.iloc[0].tolist()

    if out_labels != csv_labels:
        diff_indices = [f'T{i+1}' for i, (x, y) in enumerate(zip(out_labels, csv_labels)) if x != y]
        print(f'{patient.patient_code}', diff_indices)

        # zip() stops at the shorter list, so a pure length mismatch would
        # otherwise show up as an empty diff.
        if len(out_labels) != len(csv_labels):
            print(
                f'{patient.patient_code}',
                f'label count mismatch: {len(out_labels)} parsed vs {len(csv_labels)} in metadata',
            )

#### Combine Patients Data

In [195]:
# Combine all per-patient JSON files into one frame with a single concat
# (concatenating inside the loop re-copies the accumulated frame on every step).
# Files are read in reverse-sorted order to keep the row order of the previous
# implementation, which prepended each newly read file.
patient_frames = [
    pd.read_json(fn)
    for fn in sorted(glob.glob('./out/patients/*.json'), reverse=True)
]

# pd.concat raises on an empty list, so fall back to an empty frame when no files exist.
combined_df = (
    pd.concat(patient_frames, ignore_index=True) if patient_frames else pd.DataFrame({})
)

combined_df.to_json('./data/multi_class/patients.json')

#### Combine Controls Data

In [196]:
# Combine all per-control JSON files into one frame with a single concat
# (concatenating inside the loop re-copies the accumulated frame on every step).
# Files are read in reverse-sorted order to keep the row order of the previous
# implementation, which prepended each newly read file.
control_frames = [
    pd.read_json(fn)
    for fn in sorted(glob.glob('./out/controls/*.json'), reverse=True)
]

# pd.concat raises on an empty list, so fall back to an empty frame when no files exist.
combined_df = (
    pd.concat(control_frames, ignore_index=True) if control_frames else pd.DataFrame({})
)

combined_df.to_json('./data/multi_class/controls.json')

#### Meta Data Patients

In [197]:
# Reload the combined patient records written by the "Combine Patients Data" step.
combined_df = pd.read_json('./data/multi_class/patients.json')

In [198]:
def count_filler_words(text):
    """Count the square-bracketed spans (``[...]``) in ``text``.

    ``text`` is converted with ``str`` first, so non-string values are accepted.
    Matching is non-greedy, so adjacent spans are counted separately.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return len(re.findall(r'\[.*?\]', str(text)))

In [199]:
def remove_invalid_sentence(text):
    """Drop sentences that contain fewer than two words.

    ``text`` is split on runs of '.', '!' or '?'. Every sentence with at least
    two word tokens is kept, stripped, re-joined with its terminator and
    followed by a single space.
    """
    pieces = re.split(r'([.!?]+)', text)
    # With one capture group the split alternates: body, terminator, body, ...
    bodies = pieces[0::2]
    terminators = pieces[1::2]

    kept = []
    for position, body in enumerate(bodies):
        stripped = body.strip()
        if len(re.findall(r'\b\w+\b', stripped)) < 2:
            continue

        # The final body has no terminator after it.
        mark = terminators[position] if position < len(terminators) else ''
        kept.append(f'{stripped}{mark} ')

    return ''.join(kept)

In [209]:
def clean_text(text):
    """Normalize a transcript string for analysis.

    Steps, in order: replace ``[...]`` spans with a space, drop sentences with
    fewer than two words, remove whitespace before punctuation, replace every
    character outside ``a-zA-Z0-9``, whitespace and ``.,!?;`` with a space,
    collapse whitespace runs, lowercase, and strip.
    """
    without_brackets = re.sub(r'\[.*?\]', ' ', str(text))
    result = remove_invalid_sentence(without_brackets)

    # Applied strictly in this order.
    substitutions = (
        (r'\s+([.,!?;])', r'\1'),
        (r'[^a-zA-Z0-9\s.,!?;]', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result.lower().strip()

In [210]:
def extract_metadata(df):
    """Return a cleaned copy of ``df`` with per-row text statistics.

    The ``text`` column is replaced by its cleaned form and three columns are
    added: ``filler_words`` (number of ``[...]`` spans in the original text),
    ``num_sentences`` and ``num_words`` (token counts of the cleaned text).
    Rows whose cleaned text occurs more than once are all removed.

    Args:
        df: DataFrame with a ``text`` column; it is not modified.

    Returns:
        A new DataFrame.
    """
    meta_df = df.copy(deep=True)

    raw_text = meta_df['text']

    # Count fillers on the raw text: clean_text strips every [...] span, so
    # counting after cleaning always produced 0.
    meta_df['filler_words'] = raw_text.apply(count_filler_words)
    meta_df['text'] = raw_text.apply(clean_text)
    meta_df['num_sentences'] = meta_df['text'].apply(lambda x: len(sent_tokenize(x)))
    meta_df['num_words'] = meta_df['text'].apply(lambda x: len(word_tokenize(x)))

    # keep=False removes every member of a duplicate group, not just the repeats.
    meta_df = meta_df.drop_duplicates(subset='text', keep=False)

    return meta_df

In [211]:
# Clean the combined patient records and attach the text statistics.
meta_df = extract_metadata(combined_df)

In [212]:
# Persist the cleaned multi-class patient data.
meta_df.to_json('./data/multi_class/patients_clean.json')

#### Meta Data Controls

In [213]:
# Reload the combined control records (rebinds `combined_df`, which held the patient data).
combined_df = pd.read_json('./data/multi_class/controls.json')

In [214]:
# Clean the combined control records and attach the text statistics.
meta_df = extract_metadata(combined_df)

In [215]:
# Persist the cleaned multi-class control data.
meta_df.to_json('./data/multi_class/controls_clean.json')

#### Convert Patients to binary class

In [216]:
def get_binary_class(label):
    """Collapse a multi-class label to two classes: 1 stays 1, anything else becomes 2."""
    if label == 1:
        return 1
    return 2

In [217]:
# Load the cleaned multi-class patient data as the input for the binary conversion.
combined_df = pd.read_json('./data/multi_class/patients_clean.json')

In [218]:
# Map every label to its binary class (1 stays 1, everything else becomes 2).
combined_df['label'] = combined_df['label'].apply(get_binary_class)

In [219]:
# Persist the binary-class patient data.
combined_df.to_json('./data/binary_class/patients_clean.json')

#### Convert Controls to binary class

In [221]:
# Load the cleaned multi-class control data as the input for the binary conversion.
combined_df = pd.read_json('./data/multi_class/controls_clean.json')

In [222]:
# Map every label to its binary class (1 stays 1, everything else becomes 2).
combined_df['label'] = combined_df['label'].apply(get_binary_class)

In [223]:
# Persist the binary-class control data.
combined_df.to_json('./data/binary_class/controls_clean.json')