# Reading Raw Notebooks

In [2]:
import nbformat
import pandas as pd

## Read the Notebook

In [3]:
# Read the Notebook
def read_ipynb_file(file_path):
    """Load a notebook file from disk via ``nbformat``.

    Args:
        file_path: Location of the ``.ipynb`` file. It is opened in text
            mode and decoded as UTF-8.

    Returns:
        The notebook object produced by ``nbformat.read`` when asked for
        notebook format version 4.
    """
    # Returning from inside the ``with`` block still closes the handle.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return nbformat.read(handle, as_version=4)

# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on a machine where this exact file exists. Consider a path relative to
# the project or a configurable base directory.
notebook_path = "/Users/nilsjennissen/PycharmProjects/presentations/notebooks/template.ipynb"
# Parse the template notebook once; the extraction cells further down read
# from this `notebook` object.
notebook = read_ipynb_file(notebook_path)

In [4]:
notebook

{'cells': [{'cell_type': 'markdown',
   'source': '# Document Name',
   'metadata': {'collapsed': False}},
  {'cell_type': 'markdown',
   'source': '## Section 1\nFirst header text',
   'metadata': {'collapsed': False}},
  {'cell_type': 'code',
   'execution_count': None,
   'metadata': {'collapsed': True},
   'outputs': [],
   'source': '# First Code Comment\ndef func():\n    \'\'\'Function description\'\'\'\n    print("Hello World")'},
  {'cell_type': 'markdown',
   'source': '## Section 2',
   'metadata': {'collapsed': False}},
  {'cell_type': 'code',
   'execution_count': None,
   'outputs': [],
   'source': 'def func2():\n    \'\'\'Function description\'\'\'\n    print("Hello World")',
   'metadata': {'collapsed': False}},
  {'cell_type': 'markdown',
   'source': '### Subsection 2.1',
   'metadata': {'collapsed': False}},
  {'cell_type': 'code',
   'execution_count': None,
   'outputs': [],
   'source': 'def func3():\n    \'\'\'Function description\'\'\'\n    print("Hello World")'

## Extracting Header and Text Fields

In [9]:
def extract_text_fields(notebook):
    """Sort every line of the markdown cells into four flat lists.

    Only cells whose ``cell_type`` is ``'markdown'`` are inspected; all other
    cells are ignored. Each cell's ``source`` is split on ``'\\n'`` and every
    resulting line is stored verbatim (heading marker included) in exactly
    one list, chosen by its prefix:

    * ``'# '``   -> document names
    * ``'## '``  -> headers
    * ``'### '`` -> subheaders
    * anything else (plain text, blank lines, deeper headings) -> texts

    Args:
        notebook: Object exposing ``cells``, an iterable of cells that have
            ``cell_type`` and ``source`` attributes.

    Returns:
        A tuple ``(doc_name, headers, subheaders, texts)`` of lists of
        strings, each in notebook order.
    """
    doc_name, headers, subheaders, texts = [], [], [], []

    # The prefixes are mutually exclusive because each one ends in a space,
    # so at most one bucket can match a given line.
    prefix_buckets = (
        ('# ', doc_name),
        ('## ', headers),
        ('### ', subheaders),
    )

    markdown_cells = (c for c in notebook.cells if c.cell_type == 'markdown')
    for md_cell in markdown_cells:
        for row in md_cell.source.split('\n'):
            # Fall back to the free-text list when no heading prefix matches.
            bucket = next(
                (target for prefix, target in prefix_buckets if row.startswith(prefix)),
                texts,
            )
            bucket.append(row)

    return doc_name, headers, subheaders, texts

# Unpack the four lists (document names, headers, subheaders, remaining text
# lines) collected from the markdown cells of `notebook`.
doc_name, headers, subheaders, texts = extract_text_fields(notebook)

In [10]:
doc_name

['# Document Name']

In [11]:
headers

['## Section 1', '## Section 2', '## Section 3']

In [13]:
subheaders

['### Subsection 2.1', '### Subsection 2.2']

In [12]:
texts

['First header text']

## Extracting Sections

In [14]:
def extract_text_fields(notebook):
    """Build one record per markdown cell describing its headings and text.

    NOTE: this redefines ``extract_text_fields`` from the earlier cell and
    returns a different shape (a list of dicts instead of four lists); once
    this cell has run, the earlier definition is shadowed.

    Cells whose ``cell_type`` is not ``'markdown'`` are skipped. For each
    markdown cell, ``source`` is split on ``'\\n'`` and every line is filed
    by prefix: ``'# '`` -> ``'doc_name'``, ``'## '`` -> ``'header'``,
    ``'### '`` -> ``'subheader'``. A later heading of the same level within
    one cell overwrites the earlier one. Every other line (blank lines
    included) is appended to ``'text'`` followed by a newline.

    Args:
        notebook: Object exposing ``cells``, an iterable of cells that have
            ``cell_type`` and ``source`` attributes.

    Returns:
        A list of dicts with the keys ``'doc_name'``, ``'header'``,
        ``'subheader'`` and ``'text'``, one per markdown cell in notebook
        order. Fields with no matching line stay as empty strings.
    """
    prefix_to_key = {'# ': 'doc_name', '## ': 'header', '### ': 'subheader'}
    sections = []

    for cell in notebook.cells:
        if cell.cell_type != 'markdown':
            continue

        record = dict(doc_name='', header='', subheader='', text='')
        for row in cell.source.split('\n'):
            for prefix, key in prefix_to_key.items():
                if row.startswith(prefix):
                    record[key] = row
                    break
            else:
                # No heading prefix matched: treat the line as body text.
                record['text'] += row + '\n'
        sections.append(record)

    return sections

# `extract_text_fields` here is the redefinition from this section, which
# returns one dict per markdown cell rather than four separate lists.
sections = extract_text_fields(notebook)

In [15]:
sections

[{'doc_name': '# Document Name', 'header': '', 'subheader': '', 'text': ''},
 {'doc_name': '',
  'header': '## Section 1',
  'subheader': '',
  'text': 'First header text\n'},
 {'doc_name': '', 'header': '## Section 2', 'subheader': '', 'text': ''},
 {'doc_name': '', 'header': '', 'subheader': '### Subsection 2.1', 'text': ''},
 {'doc_name': '', 'header': '', 'subheader': '### Subsection 2.2', 'text': ''},
 {'doc_name': '', 'header': '## Section 3', 'subheader': '', 'text': ''}]