# core

> Utility class for working with TEI files.

In [45]:
#| default_exp core

In [46]:
#| hide
from nbdev.showdoc import *

In [47]:
#| export
from bs4 import BeautifulSoup
from collections import defaultdict
import pandas as pd
import urllib.request
import os
from typing import Dict

In [48]:
#| export
class TeiUtils:
    """ Utility class for working with TEI files. """

    def __init__(self):
        self.tag_counts = defaultdict(int)
        self.df = None
        self.df_tag = None

    def download(self, url: str, path: str) -> None:
        """ Download a file from a specified URL to a local path.
        
        Args:
            url: The URL from which to download the file.
            path: The local file path to save the downloaded file.
        """
        os.makedirs(os.path.dirname(path), exist_ok=True)
        urllib.request.urlretrieve(url, path)

    def get_tag_freq(self, path: str) -> None:
        """ Read an XML file from a specified path and count the frequency of each tag.
        
        The frequencies are stored in an attribute `tag_counts`.
        A sorted DataFrame of tags and counts is stored in `df` and `df_tag`.
        
        Args:
            path: The file path of the XML file to parse.
        """
        with open(path, "r") as f:
            xml = f.read()

        soup = BeautifulSoup(xml, 'lxml-xml')

        for tag in soup.find_all(True):
            self.tag_counts[tag.name] += 1

        self.df = pd.DataFrame(list(self.tag_counts.items()), columns=["Tag", "Count"])
        self.df.sort_values(by="Count", ascending=False, inplace=True)

        self.df_tag = self.df.sort_values(by="Tag", ascending=True)

In [49]:
show_doc(TeiUtils.download)

---

### TeiUtils.download

>      TeiUtils.download (url:str, path:str)

Download a file from a specified URL to a local path.

Args:
    url: The URL from which to download the file.
    path: The local file path to save the downloaded file.

In [50]:
show_doc(TeiUtils.get_tag_freq)

---

### TeiUtils.get_tag_freq

>      TeiUtils.get_tag_freq (path:str)

Read an XML file from a specified path and count the frequency of each tag.

The frequencies are stored in an attribute `tag_counts`.
A sorted DataFrame of tags and counts is stored in `df` and `df_tag`.

Args:
    path: The file path of the XML file to parse.

In [51]:
#| hide
import nbdev; nbdev.nbdev_export()