# Reading raw Notebooks

In [2]:
import nbformat

## Read the Notebook

In [3]:
# Read the Notebook
def read_ipynb_file(file_path):
    """Load a notebook file from disk via ``nbformat``.

    Parameters
    ----------
    file_path : path of the ``.ipynb`` file; it is opened as UTF-8 text.

    Returns
    -------
    The object produced by ``nbformat.read(..., as_version=4)``.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return nbformat.read(handle, as_version=4)

# NOTE(review): absolute, machine-specific location -- adjust before running elsewhere.
notebook_path = (
    "/Users/nilsjennissen/PycharmProjects/presentations/notebooks/template.ipynb"
)
notebook = read_ipynb_file(notebook_path)

In [4]:
# Display the loaded notebook object to inspect its raw cell structure.
notebook

{'cells': [{'cell_type': 'markdown',
   'source': '# Document Name',
   'metadata': {'collapsed': False}},
  {'cell_type': 'markdown',
   'source': '## Section 1\nFirst header text',
   'metadata': {'collapsed': False}},
  {'cell_type': 'code',
   'execution_count': None,
   'metadata': {'collapsed': True},
   'outputs': [],
   'source': '# First Code Comment\ndef func():\n    \'\'\'Function description\'\'\'\n    print("Hello World")'},
  {'cell_type': 'markdown',
   'source': '## Section 2',
   'metadata': {'collapsed': False}},
  {'cell_type': 'code',
   'execution_count': None,
   'outputs': [],
   'source': 'def func2():\n    \'\'\'Function description\'\'\'\n    print("Hello World")',
   'metadata': {'collapsed': False}},
  {'cell_type': 'markdown',
   'source': '### Subsection 2.1',
   'metadata': {'collapsed': False}},
  {'cell_type': 'code',
   'execution_count': None,
   'outputs': [],
   'source': 'def func3():\n    \'\'\'Function description\'\'\'\n    print("Hello World")'

## Extracting header and text fields

In [5]:
def extract_text_fields(notebook):
    """Split the markdown lines of a notebook into headers and plain text.

    Only cells whose ``cell_type`` is ``'markdown'`` are inspected. Each cell
    source is split on ``'\\n'``; a line starting with ``##`` counts as a
    header, every other line (including a level-1 ``# Title`` line) counts
    as text.

    Parameters
    ----------
    notebook : object exposing ``cells``; each cell needs ``cell_type`` and a
        string ``source``.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(headers, texts)``, each in document order.
    """
    headers, texts = [], []

    markdown_cells = (c for c in notebook.cells if c.cell_type == 'markdown')
    for md_cell in markdown_cells:
        for row in md_cell.source.split('\n'):
            # Route the line to whichever list it belongs to.
            bucket = headers if row.startswith('##') else texts
            bucket.append(row)

    return headers, texts

# Unpack the (headers, texts) pair returned for the loaded notebook.
headers, texts = extract_text_fields(notebook)

In [6]:
# Display the markdown lines that start with '##'.
headers

['## Section 1',
 '## Section 2',
 '### Subsection 2.1',
 '### Subsection 2.2',
 '## Section 3']

In [7]:
# Display the remaining markdown lines (everything not starting with '##').
texts

['# Document Name', 'First header text']

## Extracting Sections

In [8]:
def extract_text_fields(notebook):
    """Group the markdown content of a notebook into header/text sections.

    Only cells whose ``cell_type`` is ``'markdown'`` are inspected. Within a
    cell, every line starting with ``##`` opens a section and every other
    line is appended (followed by ``'\\n'``) to the text of the section that
    is currently open. Text that appears before the first header of a cell
    is kept in that first header's section.

    Parameters
    ----------
    notebook : object exposing ``cells``; each cell needs ``cell_type`` and a
        string ``source``.

    Returns
    -------
    list of dict
        ``{'header': str, 'text': str}`` entries in document order. ``header``
        is ``''`` for a cell without any ``##`` line, so a level-1
        ``# Title`` line ends up in ``text``.
    """
    # NOTE(review): this reuses the name of the earlier function that returns
    # a (headers, texts) tuple and replaces it once this cell has run.
    sections = []

    for cell in notebook.cells:
        if cell.cell_type != 'markdown':
            continue

        section = {'header': '', 'text': ''}
        for line in cell.source.split('\n'):
            if line.startswith('##'):
                if section['header']:
                    # A header is already open: close that section first so
                    # the new header does not overwrite it and the texts of
                    # both sections are not merged.
                    sections.append(section)
                    section = {'header': '', 'text': ''}
                section['header'] = line
            else:
                section['text'] += line + '\n'
        sections.append(section)

    return sections

# Build the list of header/text sections with the function defined in this cell.
sections = extract_text_fields(notebook)

In [9]:
# Display the extracted {'header', 'text'} section dicts.
sections

[{'header': '', 'text': '# Document Name\n'},
 {'header': '## Section 1', 'text': 'First header text\n'},
 {'header': '## Section 2', 'text': ''},
 {'header': '### Subsection 2.1', 'text': ''},
 {'header': '### Subsection 2.2', 'text': ''},
 {'header': '## Section 3', 'text': ''}]