# Statistiques de la collection INEX pour le projet de RI


## imports

In [None]:
import os
import sys
import xml.dom.minidom as minidom
import matplotlib.pyplot as plt

from collections import Counter
from tqdm import tqdm
from typing import List, Tuple

## Chargement de la collection depuis google drive

In [None]:
# Mount Google Drive under /content/drive (Colab-only module) so that the
# collection directory configured below is reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def fetch_articles(dir_collection:str) -> list[tuple[str, minidom.Document]]:
    """
    Parse every XML file located directly inside a directory.

    Args:
        dir_collection (str): Path of the directory holding the ``.xml`` files.
            The extension is matched case-insensitively and sub-directories
            are not visited.

    Returns:
        list[tuple[str, minidom.Document]]: One ``(article_id, dom)`` pair per
        file, where ``article_id`` is the file name without its final
        extension and ``dom`` is the parsed document.

    Raises:
        OSError: If the directory cannot be listed.
        xml.parsers.expat.ExpatError: If a file is not well-formed XML.
    """
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # collection order reproducible from one run to the next.
    xml_files = sorted(f for f in os.listdir(dir_collection) if f.lower().endswith('.xml'))

    articles = []
    for xml_file in tqdm(xml_files, desc="loading xml files ..."):
        file_path = os.path.join(dir_collection, xml_file)
        # splitext() removes only the final extension, so a name such as
        # "12.v2.xml" keeps the id "12.v2" instead of being truncated to "12".
        article_id = os.path.splitext(xml_file)[0]
        articles.append((article_id, minidom.parse(file_path)))

    return articles


In [None]:
# Directory (on the mounted Drive) that is passed to fetch_articles().
dir_collection = "/content/drive/MyDrive/data/RI/Practice_05_data/XML-Coll-withSem"

In [None]:
# Parse all XML files once; every statistic below reuses these in-memory DOMs.
collection = fetch_articles(dir_collection)

## Fonction d'affichage des fréquences

In [None]:
def plot_tag_frequency(tag_counter: Counter, n: int = 10):
    """
    Draw a horizontal bar chart of the n most frequent tags.

    Args:
        tag_counter (Counter): Maps each tag name to its number of occurrences.
        n (int, optional): How many of the most common tags to draw. Default is 10.
    """
    # Split the (tag, count) pairs into two parallel sequences for barh().
    labels, counts = zip(*tag_counter.most_common(n))

    plt.barh(labels, counts, color='skyblue')
    plt.xlabel('Frequency')
    plt.title(f'Top {n} Most Frequent Tags in the XML Collection')
    plt.show()

In [None]:
def plot_tags_in_documents(tag_document_count: Counter, n: int = 10):
    """
    Draw a horizontal bar chart of the n tags present in the most documents.

    Args:
        tag_document_count (Counter): Maps each tag name to the number of documents containing it.
        n (int, optional): How many tags to draw. Default is 10.
    """
    # Split the (tag, document count) pairs into two parallel sequences.
    labels, doc_totals = zip(*tag_document_count.most_common(n))

    plt.barh(labels, doc_totals, color='lightcoral')
    plt.xlabel('Number of Documents')
    plt.title(f'Top {n} Tags by Number of Documents in the XML Collection')
    plt.show()

## Fréquence totale des tags dans la collection

In [None]:
def calculate_tag_frequency(articles: list[tuple[str, minidom.Document]]) -> Counter:
    """
    Count how many times each tag name occurs across the whole collection.

    Args:
        articles (List[Tuple[str, minidom.Document]]): The collection, as (id, XML document) pairs.

    Returns:
        Counter: Tag name -> total number of elements carrying that name.
    """
    frequencies = Counter()

    for _, document in articles:
        # '*' matches every element of the document, whatever its name.
        frequencies.update(element.tagName for element in document.getElementsByTagName('*'))

    return frequencies

In [None]:
# Collection-wide frequency of every tag name.
cf_tags = calculate_tag_frequency(collection)

In [None]:
# Show the 10 most frequent tags.
plot_tag_frequency(cf_tags, n=10)

## Fréquence des tags par document

In [None]:
def count_tags_in_documents(articles: list[tuple[str, minidom.Document]]) -> Counter:
    """
    Count how many documents in the collection each tag appears in.

    A document is identified by its id, so a tag is counted once per distinct
    id regardless of how many times it occurs inside that document.

    Args:
        articles (List[Tuple[str, minidom.Document]]): A list of articles where each article is represented by
                                                       a tuple (id, XML document).

    Returns:
        Counter: A Counter object containing the number of documents each tag appears in.
    """
    tag_documents = {}

    for article_id, dom in tqdm(articles, "count tags in documents..."):
        tags = {node.tagName for node in dom.getElementsByTagName('*')}
        for tag in tags:
            # Add the id as ONE element. set(article_id) would iterate over the
            # string and store each of its characters, inflating the counts.
            tag_documents.setdefault(tag, set()).add(article_id)

    return Counter({tag: len(documents) for tag, documents in tag_documents.items()})

In [None]:
# Number of documents in which each tag name appears.
c_tags = count_tags_in_documents(collection)

In [None]:
# Show the 25 tags found in the largest number of documents.
plot_tags_in_documents(c_tags, n=25)

## Longueur moyenne des documents

In [None]:
# Download the 'punkt' resource; presumably the tokenizer data required by
# nltk.word_tokenize used further down (recent NLTK releases may ask for
# 'punkt_tab' instead - TODO confirm against the installed version).
import nltk
nltk.download('punkt')

In [None]:
def _extract_text(node):
    """
    Extract recursively the text content of all children tags of a tag.
    Args:
    - node: a minidom.Node object.

    Returns:
    -the extracted text content as a raw string.
    """
    text = ""

    if node.nodeType == minidom.Node.ELEMENT_NODE:
        for child_node in node.childNodes:
            text += _extract_text(child_node)

    elif node.nodeType == minidom.Node.TEXT_NODE:
        text += node.nodeValue.strip()

    return text

In [None]:
def count_words_in_articles(articles: List[Tuple[str, minidom.Document]]) -> int:
    """
    Sum the number of tokens found in every <article> element of the collection.

    Args:
        articles (List[Tuple[str, minidom.Document]]): The collection, as (id, XML document) pairs.

    Returns:
        int: Total token count (as produced by nltk.word_tokenize) over all <article> elements.
    """
    word_total = 0

    for _, document in tqdm(articles, "count words by article"):
        # One term per <article> element: extract its text, tokenize, count.
        word_total += sum(
            len(nltk.word_tokenize(_extract_text(article_node)))
            for article_node in document.getElementsByTagName('article')
        )

    return word_total

In [None]:
# NOTE: despite its name, this variable receives the value returned by
# count_words_in_articles(), i.e. the TOTAL word count of the collection;
# the per-document average is only derived when the result is printed.
doc_avg_len = count_words_in_articles(collection)

In [None]:
# Average = total word count / number of loaded documents, rounded to an
# integer. Raises ZeroDivisionError if the collection is empty.
n = len(collection)
print(f'Nombre moyen de mots par documents : {round(doc_avg_len / n)}')

## Distribution des longueurs des documents

In [None]:
def calculate_document_lengths(articles: list[tuple[str, minidom.Document]]) -> List[int]:
    """
    Measure the text length, in characters, of every <article> element of the collection.

    Args:
        articles (List[Tuple[str, minidom.Document]]): The collection, as (id, XML document) pairs.

    Returns:
        List[int]: One entry per <article> element: the number of characters of its extracted text.
    """
    lengths = []

    for _, document in tqdm(articles, "calculate document lengths"):
        lengths.extend(
            len(_extract_text(article_node))
            for article_node in document.getElementsByTagName('article')
        )

    return lengths

In [None]:
def plot_document_length_distribution(document_lengths: List[int]):
    """
    Draw a 20-bin histogram of the document lengths.

    Args:
        document_lengths (List[int]): Length (in characters) of each document of the collection.
    """
    histogram_options = {"bins": 20, "color": 'lightgreen', "edgecolor": 'black'}
    plt.hist(document_lengths, **histogram_options)

    plt.title('Distribution of Document Lengths in the Collection')
    plt.xlabel('Document Length (Number of Characters)')
    plt.ylabel('Number of Documents')
    plt.show()

In [None]:
# Character length of the text of every <article> element.
docs_len = calculate_document_lengths(collection)

In [None]:
# Histogram of the document lengths computed above.
plot_document_length_distribution(docs_len)

## Nombre moyen de balises par document

In [None]:
def calculate_average_tags_per_document(articles: list[tuple[str, minidom.Document]]) -> float:
    """
    Compute the mean number of XML elements per document.

    Args:
        articles (List[Tuple[str, minidom.Document]]): The collection, as (id, XML document) pairs.

    Returns:
        float: Total number of elements divided by the number of documents,
        or 0.0 when the collection is empty.
    """
    document_count = len(articles)

    # '*' matches every element, so the length of the result is the tag count.
    tag_total = sum(
        len(document.getElementsByTagName('*'))
        for _, document in tqdm(articles, "calculating average tags per document...")
    )

    # Guard against dividing by zero on an empty collection.
    if document_count <= 0:
        return 0.0
    return tag_total / document_count

In [None]:
# Mean number of XML elements per document over the whole collection.
count_avg_tag_per_doc = calculate_average_tags_per_document(collection)

In [None]:
# Report the average rounded to an integer.
print(f'Nombre moyen de balise par document {round(count_avg_tag_per_doc)}')

## Profondeur moyenne des balises XML

In [None]:
def calculate_average_tag_depth(articles: list[tuple[str, minidom.Document]]) -> float:
    """
    Calculate the average tag depth per document in a collection of articles.

    The depth of an element is its number of element ancestors: the root
    element has depth 0, its children depth 1, and so on. For every document
    the mean depth of its elements is computed; the function returns the mean
    of those per-document means.

    Args:
        articles (List[Tuple[str, minidom.Document]]): A list of articles where each article is represented by
                                                       a tuple (id, XML document).

    Returns:
        float: The average tag depth per document (0.0 for an empty collection).
    """
    total_depth = 0.0
    total_document_count = len(articles)

    for _, dom in tqdm(articles, "calculating average tag depth..."):
        depth_sum = 0
        element_count = 0

        # Walk the element tree while carrying the current depth. The position
        # of a node among its siblings (childNodes.index) says nothing about
        # how deep the node is, so the depth has to be tracked explicitly.
        root = dom.documentElement
        pending = [(root, 0)] if root is not None else []
        while pending:
            element, depth = pending.pop()
            depth_sum += depth
            element_count += 1
            pending.extend(
                (child, depth + 1)
                for child in element.childNodes
                if child.nodeType == minidom.Node.ELEMENT_NODE
            )

        # A document without any element contributes 0 to the total.
        if element_count:
            total_depth += depth_sum / element_count

    # Calculate the average tag depth per document
    if total_document_count == 0:
        return 0.0
    return total_depth / total_document_count

In [None]:
# Value returned by calculate_average_tag_depth() for the whole collection.
deep_tag = calculate_average_tag_depth(collection)

In [None]:
# Report the average depth rounded to an integer.
print(f'Profondeur moyenne des balises XML : {round(deep_tag)}')

In [None]:
def calculate_tag_depth_distribution(articles: list[tuple[str, minidom.Document]]) -> list[int]:
    """
    Calculate the distribution of tag depths in a collection of articles.

    The depth of an element is its number of element ancestors: the root
    element has depth 0, its children depth 1, and so on.

    Args:
        articles (List[Tuple[str, minidom.Document]]): A list of articles where each article is represented by
                                                       a tuple (id, XML document).

    Returns:
        List[int]: One depth value per element of the collection, listed
        document by document and in document order inside each document.
    """
    tag_depths_distribution = []

    for _, dom in tqdm(articles, "calculating tag depth distribution..."):
        # Walk the element tree while carrying the current depth. The position
        # of a node among its siblings (childNodes.index) is not its depth.
        root = dom.documentElement
        pending = [(root, 0)] if root is not None else []
        while pending:
            element, depth = pending.pop()
            tag_depths_distribution.append(depth)
            child_elements = [
                (child, depth + 1)
                for child in element.childNodes
                if child.nodeType == minidom.Node.ELEMENT_NODE
            ]
            # Reversed so that the first child is popped (visited) first.
            pending.extend(reversed(child_elements))

    return tag_depths_distribution

In [None]:
def plot_tag_depth_distribution(tag_depth_distribution: list[int]) -> None:
    """
    Plot the distribution of tag depths.

    One bar is drawn per integer depth value, centred on that value.

    Args:
        tag_depth_distribution (List[int]): A list representing the distribution of tag depths
            (non-negative integers). An empty list produces an empty plot.
    """
    # default=0 keeps the function usable on an empty list (max() would raise).
    max_depth = max(tag_depth_distribution, default=0)

    # Bin edges 0, 1, ..., max_depth + 1: the extra edge gives the deepest
    # level its own bin instead of merging it into the previous one.
    bin_edges = range(max_depth + 2)
    plt.hist(tag_depth_distribution, bins=bin_edges, align='left', edgecolor='black')
    plt.xlabel('Tag Depth')
    plt.ylabel('Frequency')
    plt.title('Distribution of Tag Depths')

    # range() only accepts integer steps (a float step raises TypeError).
    # Depths are integers, so use integer ticks, thinned to about 20 at most.
    tick_step = max(1, (max_depth + 1) // 20)
    plt.xticks(range(0, max_depth + 1, tick_step))
    plt.show()

In [None]:
# Flat list of values returned by calculate_tag_depth_distribution().
depth = calculate_tag_depth_distribution(collection)

In [None]:
# Histogram of the values computed above.
plot_tag_depth_distribution(depth)

## Distribution du nombre de balises par document

In [None]:
def calculate_tag_count_distribution(articles: List[Tuple[str, minidom.Document]]) -> Counter:
    """
    Build the distribution of the number of elements per document.

    Args:
        articles (List[Tuple[str, minidom.Document]]): The collection, as (id, XML document) pairs.

    Returns:
        Counter: Maps a tag count to the number of documents having exactly that many elements.
    """
    # Counter tallies each per-document element count yielded by the generator.
    return Counter(
        len(document.getElementsByTagName('*'))
        for _, document in tqdm(articles, "calculating tag count distribution...")
    )


In [None]:
def plot_tag_count_distribution(tag_count_distribution: Counter):
    """
    Draw a bar chart of how many documents have each number of tags.

    Args:
        tag_count_distribution (Counter): Maps a tag count to the number of documents with that count.
    """
    # Order the (tag count, number of documents) pairs by tag count, then
    # split them into the x and y sequences expected by bar().
    ordered_pairs = sorted(tag_count_distribution.items())
    tag_counts, document_totals = zip(*ordered_pairs)

    plt.bar(tag_counts, document_totals, color='lightgreen')
    plt.title('Distribution of Number of Tags per Document in the XML Collection')
    plt.xlabel('Number of Tags per Document')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Number of documents for each observed per-document tag count.
tag_count_distribution = calculate_tag_count_distribution(collection)

In [None]:
# Bar chart of the distribution computed above.
plot_tag_count_distribution(tag_count_distribution)

## Nombre d'occurrences de balises XML imbriquées

In [None]:
def count_nested_tags_occurrences(articles: List[Tuple[str, minidom.Document]]) -> Counter:
    """
    Count the occurrences of nested XML tags in a collection of XML documents.

    A nested tag is a direct parent/child pair of elements, reported under the
    key "parent.child". The root element has no element parent and therefore
    produces no pair.

    Args:
        articles (List[Tuple[str, minidom.Document]]): A list of articles where each article is represented by
                                                       a tuple (id, XML document).

    Returns:
        Counter: A Counter object containing the occurrences of nested XML tags.
    """
    nested_tags_occurrences = Counter()

    for _, dom in articles:
        for element in dom.getElementsByTagName('*'):
            # Use the real DOM parent. Pairing an element with the previous
            # element in document order would also count sibling and
            # unrelated pairs as "nested".
            parent = element.parentNode
            if parent is not None and parent.nodeType == minidom.Node.ELEMENT_NODE:
                nested_tags_occurrences[f"{parent.tagName}.{element.tagName}"] += 1

    return nested_tags_occurrences

In [None]:
def plot_nested_tags_occurrences(nested_tags_occurrences: Counter, n: int = 10):
    """
    Draw a horizontal bar chart of the n most frequent nested tags.

    Args:
        nested_tags_occurrences (Counter): Maps each nested tag key to its number of occurrences.
        n (int, optional): How many nested tags to draw. Default is 10.
    """
    # Split the (nested tag, count) pairs into two parallel sequences.
    labels, counts = zip(*nested_tags_occurrences.most_common(n))

    plt.barh(labels, counts, color='lightgreen')
    plt.xlabel('Occurrences')
    plt.title(f'Top {n} Most Frequent Nested XML Tags in the Collection')
    plt.show()


In [None]:
# Counter returned by count_nested_tags_occurrences() for the collection.
nested_tags_count = count_nested_tags_occurrences(collection)

In [None]:
# Show the 25 most frequent entries of the counter computed above.
plot_nested_tags_occurrences(nested_tags_count, n=25)

# Mots les plus fréquents de la collection

In [None]:
from xml.dom import minidom
from nltk.tokenize import word_tokenize
import re

def calculate_word_frequencies(articles: List[Tuple[str, minidom.Document]]) -> Counter:
    """
    Calculate the frequencies of individual words in a collection of XML articles.

    The text of every text node is collected, lower-cased and tokenized with
    word_tokenize; each token is then reduced to its ASCII letters and tokens
    left empty by that cleaning (punctuation, numbers) are discarded.

    Args:
        articles (List[Tuple[str, minidom.Document]]): A list of articles where each article is represented by
                                                       a tuple (id, XML document).

    Returns:
        Counter: A Counter object containing the frequencies of individual words.
    """
    non_letters = re.compile(r'[^a-zA-Z]')
    word_counter = Counter()

    for _, dom in tqdm(articles, "calculating word frequencies..."):
        # getElementsByTagName() only yields elements, never text nodes, so
        # the text has to be gathered by walking the tree explicitly.
        text_fragments = []
        pending = [dom]
        while pending:
            node = pending.pop()
            if node.nodeType == node.TEXT_NODE:
                text_fragments.append(node.nodeValue)
            elif node.nodeType in (node.ELEMENT_NODE, node.DOCUMENT_NODE):
                # Reversed so that children are popped in document order.
                pending.extend(reversed(node.childNodes))
        text_content = ' '.join(text_fragments)

        cleaned_tokens = (
            non_letters.sub('', token) for token in word_tokenize(text_content.lower())
        )
        # Filter AFTER cleaning, otherwise '' would be counted as a word.
        word_counter.update(word for word in cleaned_tokens if word)

    return word_counter

In [None]:
def plot_word_frequencies(word_counter: Counter, top_n: int = 10):
    """
    Draw a bar chart of the most frequent words.

    Args:
        word_counter (Counter): Maps each word to its number of occurrences.
        top_n (int, optional): How many of the most common words to draw. Default is 10.
    """
    # Split the (word, count) pairs into two parallel sequences.
    words, frequencies = zip(*word_counter.most_common(top_n))

    # Bars are placed at positions 0..len(words)-1 and labelled with the words.
    positions = range(len(words))
    labels = [f"{word}" for word in words]

    plt.figure(figsize=(10, 6))
    plt.bar(positions, frequencies, tick_label=labels, color='c')
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.title(f"Top {top_n} Word Frequencies")
    # Slanted labels so that neighbouring words do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [None]:
# Counter returned by calculate_word_frequencies() for the collection.
word_frequencies = calculate_word_frequencies(collection)

In [None]:
# Show the 10 most frequent words.
plot_word_frequencies(word_frequencies, top_n=10)