## How to Convert HTML to .ipynb
This is the code example used for the blog post [https://www.marsja.se/converting-html-to-a-jupyter-notebook/](https://www.marsja.se/converting-html-to-a-jupyter-notebook/) in which we learn how to convert code chunks from a webpage to a Jupyter notebook.

In [12]:
import json
from urllib import request

from bs4 import BeautifulSoup

# Page whose <code> chunks are collected into the generated notebook.
url = 'https://www.marsja.se/python-manova-made-easy-using-statsmodels/'

# Browser-like headers passed along with the page request.
headers = {
    # Adjacent string literals are concatenated without any separator, so the
    # space before "(KHTML" has to be written explicitly.
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
                   '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    # NOTE(review): presumably meant to request an uncompressed body -- confirm.
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

In [13]:
# Build the request for `url`, attaching the custom headers.
req = request.Request(url, headers=headers)

# Read the whole response body inside a context manager so the underlying
# HTTP connection is closed as soon as the content has been fetched.
with request.urlopen(req) as page:
    text = page.read()

In [14]:
# Parse the downloaded markup into a searchable tree using the lxml parser.
soup = BeautifulSoup(markup=text, features='lxml')

In [15]:
# Skeleton of the notebook to be written: format version 4.2, an empty list
# of cells to be filled in later, and kernelspec metadata for Python 3.
create_nb = dict(
    nbformat=4,
    nbformat_minor=2,
    cells=[],
    metadata=dict(
        kernelspec=dict(
            display_name="Python 3",
            language="python",
            name="python3",
        ),
    ),
)


def get_data(bs, content_tag, content_class=None, notebook=None):
    """Append one code cell to a notebook dict for every <code> chunk found.

    Parameters
    ----------
    bs : object with a ``find_all`` method (e.g. a BeautifulSoup document)
        Parsed document to search.
    content_tag : str
        Name of the container tag whose descendants are scanned for
        ``<code>`` elements (e.g. ``'article'``).
    content_class : str, optional
        When given (and non-empty), only containers carrying this ``class``
        attribute are searched.
    notebook : dict, optional
        Notebook structure whose ``'cells'`` list receives the new cells.
        Defaults to the module-level ``create_nb``.

    Returns
    -------
    None
        The notebook dict is extended in place.
    """
    if notebook is None:
        # Default target is the module-level notebook skeleton.
        notebook = create_nb

    # Search the document handed in by the caller, not a global, so the
    # function works for any parsed page.
    if content_class:
        containers = bs.find_all(content_tag, attrs={'class': content_class})
    else:
        containers = bs.find_all(content_tag)

    for container in containers:
        for chunk in container.find_all('code'):
            # Each <code> element becomes its own unexecuted code cell.
            notebook['cells'].append({
                'metadata': {},
                'outputs': [],
                'source': [chunk.get_text()],
                'execution_count': None,
                'cell_type': 'code',
            })


# Collect the code chunks found inside the page's <article> elements.
get_data(soup, 'article')

# Serialise the assembled notebook structure straight into the output file.
with open('Python_MANOVA.ipynb', 'w') as jynotebook:
    json.dump(create_nb, jynotebook)

In [16]:
# Display the assembled notebook dict to inspect the extracted code cells.
create_nb

{'nbformat': 4,
 'nbformat_minor': 2,
 'cells': [{'metadata': {},
   'outputs': [],
   'source': ['pip install statsmodels'],
   'execution_count': None,
   'cell_type': 'code'},
  {'metadata': {},
   'outputs': [],
   'source': ['import pandas as pd\nfrom statsmodels.multivariate.manova import MANOVA'],
   'execution_count': None,
   'cell_type': 'code'},
  {'metadata': {},
   'outputs': [],
   'source': ['url = \'https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv\'\ndf = pd.read_csv(url, index_col=0)\ndf.columns = df.columns.str.replace(".", "_")\ndf.head()'],
   'execution_count': None,
   'cell_type': 'code'},
  {'metadata': {},
   'outputs': [],
   'source': ["maov = MANOVA.from_formula('Sepal_Length + Sepal_Width + \\\n                            Petal_Length + Petal_Width  ~ Species', data=df)"],
   'execution_count': None,
   'cell_type': 'code'},
  {'metadata': {},
   'outputs': [],
   'source': ['print(maov.mv_test())'],
   'execution_count': None,
   'cell_