In [2]:
import os

# Root of the MAGMA project checkout. Set the MAGMA_DIR environment variable
# to run the notebook on another machine; the historical location is kept as
# the default. os.path.join(..., '') guarantees the trailing separator that
# the string concatenations in the following cells rely on.
magma_dir = os.path.join(os.environ.get('MAGMA_DIR', '/home/marco/epfl/magma/'), '')

In [3]:
# Directory (under magma_dir) that holds the Karger Fast Facts XML corpus
new_data_dir = magma_dir + 'corpora/Karger Fast Facts books/72 xml/'

### **Config**

In [4]:
import sys
sys.path.insert(0, magma_dir)
import config

# Output path: directory where df.csv and the train/val/test CSV files are written
OUTPUT_PATH = magma_dir+'datasets/karger_books_base/'

### **Init**

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import re
import pandas as pd
import sys
import gensim

## **Karger Books**

##### Class definition

In [7]:
class KargerBook():
    """Container for the parts of one parsed Karger "Fast Facts" book.

    The ``get_*`` accessors print an ``error: ...`` message and return
    ``None`` when the requested part is missing, instead of raising.
    """

    def __init__(self, title, abbreviations, glossary, introduction, chapter):
        """Store the parsed parts of a book; the id is 0 until ``set_id`` is called.

        Args:
            title: book title, or None.
            abbreviations: sized collection of abbreviation data, or None.
            glossary: glossary data, or None.
            introduction: introduction text, or None.
            chapter: dict ``{chapter_key: {section_key: content}}``, or None;
                stored as ``self.chapters``.
        """
        self.title = title
        self.abbreviations = abbreviations
        self.glossary = glossary
        self.introduction = introduction
        self.chapters = chapter
        self.id = 0

    def set_id(self, id):
        """Set the book id; ``id`` is converted with ``int()``."""
        self.id = int(id)

    def get_id(self):
        """Return the book id (0 if ``set_id`` was never called)."""
        return self.id

    def get_title(self):
        """Return the title, or None (with an error message) if it is missing."""
        if self.title is None:
            print("error: title is None")
            return None
        return self.title

    def get_abbreviations(self):
        """Return the abbreviations, or None (with an error message) if there are none."""
        # The None check comes first so that a missing collection is reported
        # like an empty one instead of raising TypeError in len().
        if self.abbreviations is None or len(self.abbreviations) < 1:
            print("error: no abbreviations")
            return None
        return self.abbreviations

    def get_glossary(self):
        """Return the glossary, or None (with an error message) if it is missing."""
        if self.glossary is None:
            print("error: glossary is None")
            return None
        return self.glossary

    def get_introduction(self):
        """Return the introduction, or None (with an error message) if it is missing."""
        if self.introduction is None:
            print("error: introduction is None")
            return None
        return self.introduction

    def get_chapters(self):
        """Regroup each chapter's flat section dict into chapter -> section -> entry.

        Returns:
            ``{chapter_key: {section_key: {entry_key: content}}}``, or None
            (with an error message) when no chapters are stored. Chapters
            for which no section is selected are left out.
        """
        if self.chapters is None:
            print("error: chapters is None")
            return None

        new_dic = {}
        for k, v in self.chapters.items():
            new_dic[k] = {}
            for sec, text in v.items():
                # Chapter-level text ('ch...' keys that are not bullets):
                # a section that only contains its own text.
                if sec.startswith('ch') and not sec.endswith('bullets'):
                    new_dic[k][sec] = {}
                    new_dic[k][sec][sec] = text
                # Top-level sections: 'sec...' keys or keys shaped like '1.2',
                # excluding sub-section and bullets keys.
                if (sec.startswith('sec') or re.match(r'\d+\.\w+', sec))\
                    and ('sub' not in sec) and ('bullets' not in sec):
                    new_dic[k][sec] = {}
                    new_dic[k][sec][sec] = text
                    # Attach the section's sub-sections and bullets,
                    # skipping entries whose content is an empty list.
                    for sec_key in v.keys():
                        if sec_key.endswith('sub_'+sec) or sec_key.endswith('sub_'+sec+'_bullets') or sec+'_bullets' in sec_key:
                            if v[sec_key] != []:
                                new_dic[k][sec][sec_key] = v[sec_key]
            if not new_dic[k]:
                new_dic.pop(k)

        return new_dic

    def get_dataframe(self):
        """Build a DataFrame with one row per (chapter, section, subsection).

        Returns:
            A DataFrame indexed by ('chapter', 'section', 'subsection'), or
            None (with an error message) when no chapters are stored.
        """
        cpt = self.get_chapters()
        if cpt is None:
            print("error: cannot create dataframe if chapters is None")
            return None
        # Flatten the nested dict: one row per (chapter, section, entry) key
        cpt_df = pd.DataFrame.from_dict({(i,j,k): cpt[i][j][k] 
                for i in cpt.keys() 
                for j in cpt[i].keys()
                for k in cpt[i][j].keys()},
            orient='index')
        
        # Rows whose entry key mentions 'bullets', and the same keys with the
        # '_bullets' suffix removed so they line up with the matching text row
        idx_bullets = [idx for idx in list(cpt_df.index) if 'bullets' in idx[2]]
        idx_sec_bullets = [(idx[0], idx[1], idx[2].replace('_bullets', '')) for idx in idx_bullets]
        mapper = {}
        for i, b in enumerate(idx_bullets):
            mapper[b] = idx_sec_bullets[i]
        
        cpt_df_bullets = cpt_df.loc[idx_bullets].rename(index=mapper)
        cpt_df_nobullets = cpt_df.drop(labels = idx_bullets, axis=0)
        # NOTE(review): the rename expects the joined columns to be labelled
        # '0' and '0_bullets'; this depends on how pandas names the columns
        # here -- confirm against the pandas version in use.
        cpt_df_new = cpt_df_nobullets.join(cpt_df_bullets, rsuffix='_bullets').rename(columns={'0': 'text', '0_bullets': 'bullets'})
        
        cpt_df_new.index = pd.MultiIndex.from_tuples(cpt_df_new.index)
        cpt_df_new.index.names = ['chapter', 'section', 'subsection']
        
        # For every chapter that has at least one complete row, copy the
        # bullets of its first complete row onto all rows of that chapter.
        for idx, df in cpt_df_new.dropna().groupby('chapter'):
            cpt_df_new.loc[(idx, slice(None)), 'bullets'] = \
                cpt_df_new.loc[(idx, slice(None)), 'bullets'].map(lambda x:
                    df.iloc[0].bullets)
        
        return cpt_df_new

##### Helper functions

In [8]:
# Module-level counter; parse_chapter uses it to build 'sec<N>' keys for
# elements without attributes, and parse_tree resets it for every chapter.
i = 0
def get_i():
    """Return the current value of the module-level counter ``i``."""
    return i
def set_i(n):
    """Overwrite the module-level counter ``i`` with ``n``."""
    global i
    i = n

def print_dict(d, prefix):
    """Print the keys of a nested dict as a tab-indented outline.

    Each key is printed after ``prefix``. A value that is exactly a ``dict``
    is printed recursively with one more tab; any other value is shown as a
    ``- text`` placeholder line.
    """
    for key, value in d.items():
        print(prefix+key)
        if type(value) is not dict:
            print(prefix+'- text')
            continue
        print_dict(value, prefix+'\t')

def parse_abbreviations(abbr):
    """Extract an ``{abbreviation: expansion}`` dict from an XML element.

    Every descendant whose tag ends with 'para' is read as one entry. Its
    text fragments are stripped of colons and whitespace; if only one
    non-empty fragment remains it is split on ':' or '='. Paragraphs that
    still do not yield two parts are skipped.

    A warning is printed when a title's first text fragment does not
    contain the word 'abbreviations'.

    Args:
        abbr: element exposing ``iter()``, with children exposing ``tag``
            and ``itertext()`` (e.g. ``xml.etree.ElementTree.Element``).

    Returns:
        dict mapping the first part of each entry to the second.
    """
    abbreviations = {}
    for abb in abbr.iter():
        if abb.tag.endswith('title'):
            # next(..., '') keeps a title without text from raising IndexError
            heading = next(iter(abb.itertext()), '')
            if 'abbreviations' not in heading.lower():
                print('Warning: suspicious abbreviations section in this file.')
        if abb.tag.endswith('para'):
            pieces = [c.strip(':').strip() for c in abb.itertext()]
            pieces = [p for p in pieces if len(p) > 0]
            if len(pieces) == 1:
                # "ABC: expansion" or "ABC = expansion" in a single fragment
                pieces = [c.strip() for c in re.split(':|=', pieces[0])]
            if len(pieces) < 2:
                # Empty paragraph, or no separator found: nothing to record
                continue
            abbreviations[pieces[0]] = pieces[1]
    return abbreviations

def parse_glossary(gloss):
    """Extract a ``{term: definition}`` dict from an XML element.

    Every descendant whose tag ends with 'para' is read as one entry: its
    text fragments are stripped of colons and whitespace, empty fragments
    are dropped (as in ``parse_abbreviations``), and the first two remaining
    fragments become term and definition. Paragraphs with fewer than two
    fragments are skipped.

    A warning is printed when a title's first text fragment does not
    contain the word 'glossary'.

    Args:
        gloss: element exposing ``iter()``, with children exposing ``tag``
            and ``itertext()`` (e.g. ``xml.etree.ElementTree.Element``).

    Returns:
        dict mapping each term to its definition.
    """
    glossary = {}
    for glos in gloss.iter():
        if glos.tag.endswith('title'):
            # next(..., '') keeps a title without text from raising IndexError
            heading = next(iter(glos.itertext()), '')
            if 'glossary' not in heading.lower():
                print('Warning: suspicious glossary section in this file.')
        if glos.tag.endswith('para'):
            pieces = [c.strip(':').strip() for c in glos.itertext()]
            # Whitespace-only fragments (e.g. indentation before the first
            # child element) must not be taken as the term.
            pieces = [p for p in pieces if len(p) > 0]
            if len(pieces) < 2:
                continue
            glossary[pieces[0]] = pieces[1]
    return glossary

def parse_text(text):
    """Return the cleaned-up text content of an XML element.

    Text fragments that consist only of digits, 'N,N' or 'N–N', and
    fragments that start with an http(s) URL, are discarded. The remaining
    fragments are joined with spaces; each line is stripped, runs of spaces
    are collapsed, and whitespace before ``? , ; : . ! "`` is removed when
    that character is followed by whitespace or the end of the string.

    Args:
        text: element exposing ``itertext()``.

    Returns:
        The cleaned text, with every line terminated by a newline, or ''
        when nothing is left.
    """
    parsed_text = [c for c in text.itertext()
        if not re.fullmatch(r'\d+|(\d+,\d+)|(\d+–\d+)', c)
        and not bool(re.match(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', c))]
    if len(parsed_text) < 1:
        return ''
    # Drop a lone newline at either end. The emptiness checks matter: when
    # the only fragment is '\n', removing it leaves nothing to index.
    if parsed_text and parsed_text[0] == '\n':
        parsed_text = parsed_text[1:]
    if parsed_text and parsed_text[-1] == '\n':
        parsed_text = parsed_text[:-1]
    parsed_text = ' '.join(parsed_text)
    parsed_text = ''.join([re.sub(' +', ' ', line.strip()+'\n') for line in parsed_text.splitlines()])
    parsed_text = re.sub(r'\s([?,;:.!"](?:\s|$))', r'\1', parsed_text)

    return parsed_text

def search_bullets(ch_root):
    """Collect the bullet items that follow a "key points" heading.

    Walks ``ch_root`` and its descendants in ``iter()`` order. Once a title
    (or a paragraph whose first fragment is shorter than 100 characters)
    whose first text fragment contains both 'key' and 'point' has been seen,
    the children of every later element whose tag contains 'itemized' are
    parsed with ``parse_text`` and collected. Elements whose tag contains
    'link' are ignored.

    Returns:
        list of parsed bullet strings (empty when none are found).
    """
    def leading_text(element):
        # Lower-cased first text fragment of the element, '' if it has none
        fragments = list(element.itertext())
        return fragments[0].lower() if fragments else ''

    def mentions_key_points(s):
        return 'key' in s and 'point' in s

    bullets = []
    found_key_points = False
    for node in ch_root.iter():
        tag = node.tag
        if 'link' in tag:
            continue
        # Checked before the heading tests below, so a list only counts
        # when the heading was seen on an earlier element.
        if found_key_points and 'itemized' in tag:
            bullets.extend(parse_text(item) for item in node)

        if 'title' in tag and mentions_key_points(leading_text(node)):
            found_key_points = True
        if 'para' in tag:
            first = leading_text(node)
            if len(first) < 100 and mentions_key_points(first):
                found_key_points = True

    return bullets

def parse_chapter(ch_root, chapter = None, subsect = ''):
    """Recursively flatten a chapter/section element into a dict of texts.

    The element's key is its first attribute value followed by ``subsect``,
    or ``'sec<N>'`` (N taken from the module-level counter, which is then
    incremented) when it has no attributes. Titles, itemized-list items and
    paragraphs directly under the element are concatenated into the text
    stored under that key. Nested 'sect1'/'section' elements are parsed
    recursively into the same dict, and 'sect2' elements are keyed with the
    suffix ``'_sub_<key>'``. Bullets found by ``search_bullets`` are stored
    under ``'<key>_bullets'``.

    Args:
        ch_root: XML element to parse.
        chapter: dict to fill; a new dict is created when omitted.
        subsect: suffix appended to the element's key.

    Returns:
        The filled ``chapter`` dict.
    """
    # Create the dict per call: a `{}` default would be shared by every
    # call that omits the argument.
    if chapter is None:
        chapter = {}

    attribs = list(ch_root.attrib.values())
    if len(attribs) < 1:
        ch_title = 'sec'+str(get_i())
        set_i(get_i()+1)
    else:
        ch_title = attribs[0]+subsect
    chapter[ch_title] = []

    for ch in ch_root:
        if ch.tag.endswith('title'):
            t = [c for c in ch.itertext()]
            if len(t) > 0:
                t = t[0] + '\n'
            else:
                t = ''
            # A "key points" / "key references" title ends the body text
            # of this element; nothing after it is collected here.
            if 'key' in t.lower() and\
                ('point' in t.lower() or 'reference' in t.lower()):
                break
            chapter[ch_title].append(t)
        elif 'itemized' in ch.tag:
            for item in ch:
                chapter[ch_title].append(parse_text(item))
        elif 'para' in ch.tag:
            chapter[ch_title].append(parse_text(ch))
        elif 'sect1' in ch.tag:
            chapter = parse_chapter(ch, chapter)
        elif 'sect2' in ch.tag:
            chapter = parse_chapter(ch, chapter, '_sub_'+ch_title)
        elif 'section' in ch.tag:
            chapter = parse_chapter(ch, chapter)

    chapter[ch_title] = ''.join(chapter[ch_title])

    bullets = search_bullets(ch_root)
    if len(bullets) > 0:
        chapter[ch_title + '_bullets'] = bullets
    return chapter

def parse_tree(root):
    """Parse the root element of a book's XML tree into a ``KargerBook``.

    The direct children of ``root`` are scanned for:

    * the book title (a 'title' element that is not a sub- or job title,
      or such an element inside an 'info' element);
    * 'chapter' elements, dispatched on their title: abbreviations,
      glossary, introduction, or a regular chapter parsed with
      ``parse_chapter`` (chapters whose first attribute value contains
      'app' are skipped);
    * 'preface' elements, whose titles are checked for the same
      abbreviations / glossary / introduction keywords.

    Returns:
        KargerBook built from the collected parts.
    """
    title = None
    abbreviations = []
    glossary = None
    introduction = None
    chapters = {}

    for chapt in root:
        if (title is None) and 'title' in chapt.tag and\
            ('subtitle' not in chapt.tag) and\
            ('jobtitle' not in chapt.tag):

            title = chapt.text
            if title is None:
                # Title text wrapped in a child element; stays None when
                # the element holds no text at all.
                title = next(iter(chapt.itertext()), None)

        if (title is None) and 'info' in chapt.tag:
            for subc in chapt.iter():
                if ('title' in subc.tag) and\
                    ('subtitle' not in subc.tag) and\
                    ('jobtitle' not in subc.tag):

                    title = subc.text

        elif chapt.tag.endswith('chapter'):
            # Reset for every chapter: without this, a chapter lacking a
            # title child would reuse the previous chapter's title (or hit
            # an undefined name on the first chapter).
            t = ''
            for ch in chapt:
                if ch.tag.endswith('title'):
                    t = next(iter(ch.itertext()), '').lower()
                    break
            if 'abbr' in t:
                abbreviations.append(parse_abbreviations(chapt))
                continue
            elif 'gloss' in t:
                glossary = parse_glossary(chapt)
                continue
            elif 'intro' in t:
                # next(chapt.iter()) is the first element iter() yields
                introduction = parse_text(next(chapt.iter()))
                continue
            # NOTE(review): assumes every chapter element has at least one
            # attribute; an attribute-less chapter raises IndexError here.
            if 'app' in list(chapt.attrib.values())[0] : continue

            # Section numbering ('sec0', 'sec1', ...) restarts per chapter
            set_i(0)
            chapters[list(chapt.attrib.values())[0]] = parse_chapter(chapt, {}, '')

        elif chapt.tag.endswith('preface'):
            for subc in chapt.iter():
                if subc.tag.endswith('title'):
                    pref_title = next(iter(subc.itertext()), '').lower()
                    if 'abbr' in pref_title:
                        abbreviations.append(parse_abbreviations(chapt))
                    elif 'gloss' in pref_title:
                        glossary = parse_glossary(chapt)
                    elif 'intro' in pref_title:
                        introduction = parse_text(next(chapt.iter()))

    return KargerBook(title, abbreviations, glossary, introduction, chapters)

##### Preparing the data

In [14]:
import os
from os.path import isdir, join

# Folder with one XML file per book
XML_DATA_DIR = new_data_dir+'77 FF'

# Full paths of all .xml files in that folder, in sorted order
xml_files = sorted(
    join(XML_DATA_DIR, file_name)
    for file_name in os.listdir(XML_DATA_DIR)
    if file_name.endswith('.xml'))
print(xml_files)

['/content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781905832729.xml', '/content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541024.xml', '/content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541062.xml', '/content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541086.xml', '/content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541178.xml', '/content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541277.xml', '/content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541406.xml', '/content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541420.xml', '/content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541437.xml', 

In [15]:
import xml.etree.ElementTree as ET
import sys

# The books of the previous corpus are already included in these new files,
# so the list is built from scratch.
books = []

# ISBNs that are left out, grouped by the message printed when one is skipped.
# Groups are checked in this order.
SKIPPED_ISBNS = [
    ('Skipping German book\n',
        ('9781912776849', '9781912776962', '9781912776177')),
    ('Skipping Italian book\n',
        ('9781912776900', '9783318066302', '9781912776788')),
    ('Skipping French book\n',
        ('9781912776931',)),
    ('Skipping Spanish book\n',
        ('9783318067637', '9783318067606', '9783318066371', '9783318066159',
         '9783318065640', '9781912776993', '9781912776870', '9783318065848')),
    ('Skipping no bullets book\n',
        ('9781910797938', '9781912776108', '9781912776382', '9781912776665',
         '9783318065435')),
]

for kb_file in xml_files:
    print('Parsing', kb_file)
    # Message of the first group with an ISBN occurring in the path, if any
    skip_message = next(
        (message for message, isbns in SKIPPED_ISBNS
         if any(isbn in kb_file for isbn in isbns)),
        None)
    if skip_message is not None:
        print(skip_message)
        continue

    b = parse_tree(ET.parse(kb_file).getroot())
    # The 13 characters right before the 4-character '.xml' suffix
    b.set_id(kb_file[-17:-4])
    books.append(b)
    print()

Parsing /content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781905832729.xml

Parsing /content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541024.xml

Parsing /content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541062.xml

Parsing /content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541086.xml

Parsing /content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541178.xml

Parsing /content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541277.xml

Parsing /content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541406.xml

Parsing /content/drive/My Drive/MAGMA: Summarization/corpora/Karger Fast Facts books/72 xml/77 FF/9781908541420.xml

Parsing /content/drive/My Drive/MAGMA: Summarization/corpora/Kar

## **Base Dataset**

In [16]:
# One dataframe per book, concatenated under an extra 'book' index level
# whose labels are the book ids
dfs = [book.get_dataframe() for book in books]
keys = [book.get_id() for book in books]
df = pd.concat(dfs, keys=keys, names=['book'])

### **Init and Preprocessing**

#### Preparing the data

* Remove unwanted spaces around brackets
* Replace special characters
* Replace abbreviations if REPLACE_ABBR

In [17]:
# Clean up parentheses in the text column
if config.REMOVE_PAR:
    # Remove every "(...)" group together with its content
    df.text = df.text.map(lambda t: re.sub(r'\([^)]*\)', '', t))
else:
    # Keep the parentheses but remove unwanted spaces around them:
    # "word ( text )" -> "word (text)".
    # Raw strings keep '\s', '\(' and '\)' from being read as (invalid)
    # string escape sequences.
    df.text = df.text.map(lambda t: re.sub(r'\s*\(\s+', ' (', t))
    df.text = df.text.map(lambda t: re.sub(r'\s+\)', ')', t))

In [18]:
def remove_special_characters(df, col, special_char_file=None):
    """Replace special characters in one column according to a substitution file.

    The file is read line by line: lines at even positions (0, 2, ...) are
    the strings to replace, and each following line is its replacement.
    Lines are used verbatim (not stripped). Both lists are printed.

    Cells may hold a string or a list of strings; missing values are left
    untouched. The column is modified on ``df`` itself.

    Args:
        df: DataFrame to process.
        col: name of the column to process.
        special_char_file: path of the substitution file; defaults to
            ``config.SPECIAL_CHAR_FILE``.

    Returns:
        The same DataFrame, with ``col`` updated.
    """
    def replace_string_or_list(x, s, sub):
        if isinstance(x, list):
            return [e.replace(s, sub) for e in x]
        return x.replace(s, sub)

    if special_char_file is None:
        special_char_file = config.SPECIAL_CHAR_FILE

    # The file holds non-ASCII characters, so the encoding is given explicitly
    # instead of relying on the platform default.
    # NOTE(review): assumes the file is stored as UTF-8 -- confirm.
    with open(special_char_file, 'r', encoding='utf-8') as scf:
        lines = scf.read().splitlines()
    special_char = lines[0::2]
    sub_char = lines[1::2]
    print(special_char)
    print(sub_char)

    for s, sub in zip(special_char, sub_char):
        df[col] = df[col].map(
            lambda x, s=s, sub=sub: replace_string_or_list(x, s, sub),
            na_action='ignore')

    return df

# Apply the substitutions to the text column, then to the bullets column
for column in ('text', 'bullets'):
    df = remove_special_characters(df, column)

['i.e.', 'e.g.', '–', '•', '‘', '’', '±', '≤', '≥', '×', '~', 'α', 'β', 'γ', 'ζ', '&']
['id est', 'exempli gratia', '-', '-', "'", "'", '+-', '<=', '>=', 'x', ' around ', 'alpha', 'beta', 'gamma', 'zed', 'and']
['i.e.', 'e.g.', '–', '•', '‘', '’', '±', '≤', '≥', '×', '~', 'α', 'β', 'γ', 'ζ', '&']
['id est', 'exempli gratia', '-', '-', "'", "'", '+-', '<=', '>=', 'x', ' around ', 'alpha', 'beta', 'gamma', 'zed', 'and']


In [19]:
# Strip newline characters from every bullet; rows without bullets are skipped
df.bullets = df.bullets.map(
    lambda bullet_list: [bullet.replace('\n', '') for bullet in bullet_list],
    na_action='ignore')

In [20]:
# Replace abbreviations in text, book by book
if config.REPLACE_ABBR:
    book_level = df.index.get_level_values('book')
    for book in books:
        abbr_sections = book.get_abbreviations()
        if abbr_sections is None:
            # This book has no abbreviations section
            continue
        # A book may have several abbreviation sections: merge them
        abbr = {}
        for section in abbr_sections:
            abbr.update(section)
        if not abbr:
            continue
        # Select the book's rows through its id in the 'book' index level and
        # assign with .loc, so the result is written to df and not to a copy.
        in_book = book_level == book.get_id()
        # NOTE(review): with regex=True the abbreviation keys are treated as
        # regular expressions and match anywhere in the text, including
        # inside longer words -- confirm this is intended.
        df.loc[in_book, 'text'] = \
            df.loc[in_book, 'text'].replace(abbr, regex=True)

### **Removing books with duplicate bullets**

* 9781908541437 (Fast Facts: Cardiac Arrhythmias 2nd edition) $\leftrightarrow$ 9781912776153 (Fast Facts: Cardiac Arrhythmias 3rd edition) **22 bullets in common**
* 9781908541703 (Fast Facts: Breast Cancer 5th edition) $\leftrightarrow$ 9781910797242 (Fast Facts: Early Breast Cancer) **13 bullets in common**
* 9781908541796 (Fast Facts: Lymphoma 2nd edition) $\leftrightarrow$ 9781910797969 (Fast Facts: Lymphoma) **27 bullets in common**
* 9781910797457 (Fast Facts: Prostate Cancer 9th edition) $\leftrightarrow$ 9783318065893 (Fast Facts: Prostate Cancer) **16 bullets in common**
* 9781910797693 (Fast Facts: Biosimilars) $\leftrightarrow$ 9781912776238 (Fast Facts: Biosimilars in Hematology and Oncology) **1 bullet in common**

So, **we remove**:

* 9781908541437
* 9781910797242
* 9781910797969
* 9783318065893
* one chapter (`ch3`) of 9781912776238

In [21]:
# One row per single bullet
df_expl = df.explode('bullets')
all_bul = df_expl.bullets.unique()

books_with_duplicates = []
chapters_with_duplicates = []
# Group the rows by bullet text in a single pass instead of re-scanning the
# whole frame once per unique bullet. sort=False keeps the bullets in order
# of first appearance; rows without a bullet form no group.
for b, rows in df_expl.groupby('bullets', sort=False):
    book = rows.index.get_level_values('book')
    cpt = rows.index.get_level_values('chapter')
    # Distinct (book, chapter) pairs in which this bullet occurs
    idx = sorted(set(zip(book, cpt)))
    if len(idx) > 1:
        books_with_duplicates += list(set(book))
        chapters_with_duplicates += idx
        print(b)
        print(idx)
        print()

Cardiac cells have the unique ability to depolarize rhythmically; depolarization normally occurs in one direction from the top down, from atria to ventricles.
[(9781908541437, 'ch_3'), (9781912776153, 'chp1')]

The AV node has a decremental slowing effect on conduction.
[(9781908541437, 'ch_3'), (9781912776153, 'chp1')]

Autonomic effects on the heart (sympathetic and parasympathetic stimuli) can significantly influence cardiac conduction.
[(9781908541437, 'ch_3'), (9781912776153, 'chp1')]

Re-entry is the most frequent mechanism for arrhythmias.
[(9781908541437, 'ch_4'), (9781912776153, 'chp1')]

The substrate for an arrhythmia is an abnormal electrical pathway or a region of scarred myocardium. The trigger is an atrial or ventricular ectopic beat.
[(9781908541437, 'ch_4'), (9781912776153, 'chp1')]

Most supraventricular tachycardias have narrow QRS complexes.
[(9781908541437, 'ch_4'), (9781912776153, 'chp1')]

Patients with arrhythmias can present in a variety of ways.
[(978190854143

In [22]:
from collections import Counter
# Per book: number of bullets it shares with another chapter
dict(Counter(books_with_duplicates))

{9781908541437: 22,
 9781908541703: 13,
 9781908541796: 27,
 9781910797242: 13,
 9781910797457: 16,
 9781910797693: 1,
 9781910797969: 27,
 9781912776153: 22,
 9781912776238: 1,
 9783318065893: 16}

In [23]:
# Per (book, chapter): number of bullets it shares with another chapter
dict(Counter(chapters_with_duplicates))

{(9781908541437, 'ch_10'): 3,
 (9781908541437, 'ch_11'): 1,
 (9781908541437, 'ch_12'): 1,
 (9781908541437, 'ch_13'): 4,
 (9781908541437, 'ch_3'): 3,
 (9781908541437, 'ch_4'): 3,
 (9781908541437, 'ch_5'): 1,
 (9781908541437, 'ch_6'): 1,
 (9781908541437, 'ch_7'): 2,
 (9781908541437, 'ch_9'): 3,
 (9781908541703, 'ch_2'): 3,
 (9781908541703, 'ch_5'): 4,
 (9781908541703, 'ch_6'): 1,
 (9781908541703, 'ch_7'): 1,
 (9781908541703, 'ch_8'): 4,
 (9781908541796, 'chapter1'): 1,
 (9781908541796, 'chapter10'): 5,
 (9781908541796, 'chapter11'): 4,
 (9781908541796, 'chapter2'): 1,
 (9781908541796, 'chapter3'): 3,
 (9781908541796, 'chapter4'): 1,
 (9781908541796, 'chapter6'): 2,
 (9781908541796, 'chapter7'): 3,
 (9781908541796, 'chapter8'): 4,
 (9781908541796, 'chapter9'): 3,
 (9781910797242, 'chp1'): 3,
 (9781910797242, 'chp2'): 4,
 (9781910797242, 'chp4'): 1,
 (9781910797242, 'chp5'): 1,
 (9781910797242, 'chp6'): 4,
 (9781910797457, 'chp1'): 2,
 (9781910797457, 'chp3'): 2,
 (9781910797457, 'chp4'): 

In [24]:
# Whole books that duplicate another edition, and the single chapter that
# shares a bullet with another book
DUPLICATE_BOOKS = [9781908541437, 9781910797242, 9781910797969, 9783318065893]
DUPLICATE_CHAPTERS = [(9781912776238, 'ch3')]
df = df.drop(labels=DUPLICATE_BOOKS + DUPLICATE_CHAPTERS)

  return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)


Double check: re-running the duplicate search on the filtered dataset should print nothing.

In [25]:
# One row per single bullet
df_expl = df.explode('bullets')
all_bul = df_expl.bullets.unique()

books_with_duplicates = []
chapters_with_duplicates = []
# Group the rows by bullet text in a single pass instead of re-scanning the
# whole frame once per unique bullet. sort=False keeps the bullets in order
# of first appearance; rows without a bullet form no group.
for b, rows in df_expl.groupby('bullets', sort=False):
    book = rows.index.get_level_values('book')
    cpt = rows.index.get_level_values('chapter')
    # Distinct (book, chapter) pairs in which this bullet occurs
    idx = sorted(set(zip(book, cpt)))
    if len(idx) > 1:
        books_with_duplicates += list(set(book))
        chapters_with_duplicates += idx
        print(b)
        print(idx)
        print()

### **Save dataset**

In [26]:
# Write the de-duplicated base dataset to df.csv in the output directory
df.to_csv(OUTPUT_PATH+'df.csv')

In [27]:
# `drive` is never imported in this notebook (presumably google.colab's
# `drive` helper from an earlier Colab session -- TODO confirm). Only
# flush/unmount when it is defined, so that a local "Run All" does not stop
# here with a NameError.
if 'drive' in globals():
    drive.flush_and_unmount()

### **Create train, test, validation**

In [16]:
import ast

df = pd.read_csv(OUTPUT_PATH+'df.csv').set_index(['book', 'chapter', 'section', 'subsection'])
# The bullets lists come back from the CSV as strings. Parse them with
# ast.literal_eval, which only accepts Python literals, instead of eval,
# which would execute any expression found in the file.
df.bullets = df.bullets.map(ast.literal_eval, na_action='ignore')

In [18]:
RE_SPLITTER = '\n'              # regex the text is split on: '\n' splits into
                                # paragraphs, '\.(?!\d)|\n' splits into sentences

TOKEN_MAX_LEN = 99              # max length of a word (passed as max_len
                                # to gensim's simple_preprocess)
PARA_MIN_LENGTH = 2             # minimum length for a sentence or
                                # a paragraph, in tokens

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Remove long words (> TOKEN_MAX_LEN chars)
* Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens)

In [19]:
# Split in sentences / paragraphs based on RE_SPLITTER
# (empty pieces are dropped, the others are stripped)
df.text =\
    df.text.map(lambda x: [p.strip() for p in re.split(RE_SPLITTER, x) if p!=''],
                na_action='ignore')

# explode to get one row for each paragraph / sentence
df = df.explode('text')
df = df.rename(columns={'text': 'para'})
df = df.dropna()

# Remove unwanted chars at beginning or end of sentence
# (closing brackets are only stripped on the left, opening ones on the right)
df.para = df.para.map(lambda p: p.lstrip(',;-)] \n'))
df.para = df.para.map(lambda p: p.rstrip(',;-([ \n'))

# Remove multiple spaces (raw string: '\s' is not a valid string escape)
df.para = df.para.map(lambda p:
    re.sub(r'\s+', ' ', p).strip())

# Tokenize every paragraph; tokens longer than TOKEN_MAX_LEN chars are
# left out of the token list
def para2words(para):
    """Tokenize ``para`` with gensim's ``simple_preprocess`` (deacc=True, max_len=TOKEN_MAX_LEN)."""
    return gensim.utils.simple_preprocess(
        para, deacc=True, max_len=TOKEN_MAX_LEN)

df['para_proc'] = df.para.map(para2words)

# Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens):
# blank out their token list, then drop the rows holding a missing value
too_short = df.para_proc.map(len) < PARA_MIN_LENGTH
df.loc[too_short, 'para_proc'] = np.nan

df = df.dropna()

In [20]:
def _join_paragraphs(paras):
    """Concatenate the paragraphs of one chapter, one per line."""
    return '\n'.join(list(paras))


def _join_bullets(bullet_lists):
    """Join the bullets found on the chapter's first row with ' <BULL> '."""
    return ' <BULL> '.join(list(bullet_lists)[0])


# One row per (book, chapter), in order of first appearance
df = (
    df.groupby(['book', 'chapter'], sort=False)
      .agg({'para': _join_paragraphs, 'bullets': _join_bullets})
      .rename(columns={'para': 'text'})
)
# The marker also precedes the first bullet
df.bullets = df.bullets.map(lambda joined: '<BULL> '+joined)

In [22]:
# Sanity check: show the joined bullets string of the first row
df.iloc[0].bullets

'<BULL> The bladder operates as a low-pressure high-volume system. <BULL> Urine storage and voiding are controlled by reflex centers in the spinal cord, the micturition center in the midbrain and the somatic and parasympathetic nervous systems. <BULL> Voiding requires a coordinated contraction of the detrusor (bladder smooth musculature), simultaneous opening of the bladder outlet (involuntary) and relaxation of the external urethral sphincter (voluntary). Continence requires the converse.'

#### Save

In [23]:
# Shuffle all rows; config.SEED is passed as random_state
df = df.sample(frac=1, random_state=config.SEED)

In [24]:
# 80% / 10% / 10% split of the shuffled rows
n_rows = len(df)
train_end = int(n_rows*0.8)
val_end = int(n_rows*0.9)

train = df.iloc[:train_end]
val = df.iloc[train_end:val_end]
test = df.iloc[val_end:]

In [25]:
# Number of rows in each split
print(len(train), len(val), len(test))

362 45 46


In [26]:
# Write each split to <name>.csv in the output directory
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    split_df.to_csv(OUTPUT_PATH+split_name+'.csv')