In [None]:
# Import Base libs for read and show data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn 
import pathlib as path
from datasets import load_dataset

In [None]:
# Constant values
# Relative path of the training CSV file (note: despite the name, this is a file, not a directory).
# `Path` resolves to the concrete path class of the host OS; `PosixPath` cannot be
# instantiated on Windows, which made this cell fail there.
BASE_DIR = path.Path('train.csv')

In [None]:
# Load the dataset from the Hugging Face Hub once, then convert the two splits
# used in this notebook into pandas DataFrames.
dataset_splits = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset')
dataset_t = dataset_splits['train'].to_pandas()       # training split
dataset_v = dataset_splits['validation'].to_pandas()  # validation split

In [None]:
# Preview the first ten rows of the training split
dataset_t.head(10)

In [None]:
# Count how often each value occurs in the categorical columns of both splits,
# to observe particular characteristics of their distributions.

# training split
category_dist_t = dataset_t['category'].value_counts()
type_dist_t = dataset_t['type'].value_counts()
sub_dist_t = dataset_t['subcategory'].value_counts()
label_dist_t = dataset_t['label'].value_counts()

# validation split
category_dist_v = dataset_v['category'].value_counts()
type_dist_v = dataset_v['type'].value_counts()
sub_dist_v = dataset_v['subcategory'].value_counts()
label_dist_v = dataset_v['label'].value_counts()

# label counts (training split)
print(label_dist_t)

# subcategory counts (author's note: "Gaussian distribution" -- TODO confirm against the plot)
print(sub_dist_t)

# type counts (author's note: unbalanced)
print(type_dist_t)

# category counts
print(category_dist_t)


## Category Analysis

Explore the `category` column by analyzing its statistical properties, such as the distribution of its values.

In [None]:
# Plot the category distribution of the train and validation splits as grouped bars.
# The two count series are aligned on the union of their categories (a category that is
# absent from one split counts as 0), so the bars sit side by side instead of being
# drawn on top of each other at the same position.
category_counts = (
    pd.concat(
        [category_dist_t.rename('Train'), category_dist_v.rename('Validation')],
        axis=1,
    )
    .fillna(0)
    .sort_values('Train', ascending=False)
)

fig, ax = plt.subplots(figsize=(30, 12))
category_counts.plot.bar(ax=ax, width=0.8, rot=0)
ax.set_title('Category Distribution')
ax.set_xlabel('Category')
ax.set_ylabel('Number of occurrences')
ax.legend(loc='upper left', ncols=2)
plt.show()

## Subcategory Analysis

Explore the `subcategory` column by analyzing its statistical properties, such as the distribution of its values, together with a visual representation.

In [None]:
# Plot the subcategory distribution of the train and validation splits as grouped
# horizontal bars. The two count series are aligned on the union of their subcategories
# (a subcategory absent from one split counts as 0), so the bars sit side by side
# instead of being drawn on top of each other at the same position.
subcategory_counts = (
    pd.concat(
        [sub_dist_t.rename('Train'), sub_dist_v.rename('Validation')],
        axis=1,
    )
    .fillna(0)
    .sort_values('Train', ascending=False)
)

fig, ax = plt.subplots(figsize=(14, 20))
subcategory_counts.plot.barh(ax=ax, width=0.8)
ax.set_title('Subcategory Distribution')
ax.set_ylabel('Subcategory')
ax.set_xlabel('Number of occurrences')
ax.legend(loc='upper left', ncols=2)
plt.show()

## Labels Analysis

Check how many classes have to be identified and how many samples each of them has; this analysis is useful for detecting class imbalance.

In [None]:
# Plot the label distribution of the train and validation splits as grouped
# horizontal bars. The two count series are aligned on the union of their labels
# (a label absent from one split counts as 0), so the bars sit side by side
# instead of being drawn on top of each other at the same position.
label_counts = (
    pd.concat(
        [label_dist_t.rename('Train'), label_dist_v.rename('Validation')],
        axis=1,
    )
    .fillna(0)
    .sort_values('Train', ascending=False)
)

fig, ax = plt.subplots()
label_counts.plot.barh(ax=ax, width=0.8)
ax.set_title('Labels Distribution')
ax.set_ylabel('Labels')
ax.set_xlabel('Number of occurrences')
ax.legend(loc='upper left', ncols=2)
plt.show()

## Type Analysis

In [None]:
# Plot the type distribution of the train and validation splits as grouped bars.
# The two count series are aligned on the union of their types (a type that is absent
# from one split counts as 0), so the bars sit side by side instead of being drawn
# on top of each other at the same position.
type_counts = (
    pd.concat(
        [type_dist_t.rename('Train'), type_dist_v.rename('Validation')],
        axis=1,
    )
    .fillna(0)
    .sort_values('Train', ascending=False)
)

fig, ax = plt.subplots(figsize=(30, 12))
type_counts.plot.bar(ax=ax, width=0.8, rot=0)
ax.set_title('Type Distribution')
ax.set_xlabel('Type')
ax.set_ylabel('Number of occurrences')
ax.legend(loc='upper left', ncols=2)
plt.show()

## Languages Analysis

In [None]:
#TODO

## Links Analysis

In [None]:
#TODO

## Hyperlinks Modelling

In [None]:
import networkx as nx
import requests
import matplotlib.pyplot as plt

# Endpoint of the English Wikipedia MediaWiki API
_WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"


def _fetch_wikipedia_links(title: str, limit: int):
    """Return the titles of the first ``limit`` links of the Wikipedia page ``title``.

    The query parameters are passed through ``params`` so that ``requests``
    URL-encodes the title (titles containing ``&``, ``#``, ``+`` or spaces
    would otherwise corrupt the query string).

    Returns:
        A list of link titles (possibly empty) on success, or ``None`` when the
        request or the JSON decoding failed; the error is printed in that case.
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "links",
        "pllimit": "max",
        "format": "json",
    }

    try:
        response = requests.get(_WIKIPEDIA_API_URL, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error: {http_err}")
        return None
    except requests.exceptions.Timeout:
        print("Request timed out.")
        return None
    except requests.exceptions.RequestException as err:
        print(f"Request error: {err}")
        return None
    except ValueError as json_err:
        print(f"JSON parsing error: {json_err}")
        return None

    # A payload without 'query'/'pages' (e.g. an API error object) simply yields no links.
    pages = data.get("query", {}).get("pages", {})
    links = []
    for page in pages.values():
        if "links" in page:
            links = [link["title"] for link in page["links"][:limit]]
    return links


def _expand_links(G: nx.DiGraph, base: str, limit: int, depth: int, link_cache: dict) -> nx.DiGraph:
    """Recursive worker of ``BFS_Links``; ``link_cache`` maps a page title to its fetched links."""
    # Depth exhausted: register the node, but do not spend a request whose result is unused.
    if depth <= 0:
        if not G.has_node(base):
            print(f'Added node: {base}')
            G.add_node(base, count=0)
        return G

    # Fetch each page at most once per traversal; failed requests are not cached,
    # so a later encounter of the same page tries again.
    if base in link_cache:
        links = link_cache[base]
    else:
        links = _fetch_wikipedia_links(base, limit)
        if links is None:
            return G
        link_cache[base] = links

    # Add the base node if it's not already in the graph
    if not G.has_node(base):
        print(f'Added node: {base}')
        G.add_node(base, count=0)

    # Explore the links
    for link in links:
        if G.has_node(link):
            G.nodes[link]['count'] += 1  # already present -> increment counter
        else:
            G.add_node(link, count=1)  # new node -> count as first visit

        G.add_edge(base, link)  # add edge between base and link

        print(f"Node '{link}' has been seen {G.nodes[link]['count']} times.")
        _expand_links(G, link, limit, depth - 1, link_cache)

    return G


def BFS_Links(G: nx.DiGraph, base: str, limit: int = 50, depth: int = 5) -> nx.DiGraph:
    """Grow ``G`` with the Wikipedia link neighbourhood of the page ``base``.

    Starting from ``base``, the first ``limit`` links of each page are added as
    nodes with an edge ``page -> link``, and every link is then expanded in turn
    with ``depth - 1``. Note that, despite the function name, the traversal is
    recursive and therefore depth-first.

    Every node carries a ``count`` attribute: 0 for a page added as ``base``,
    1 for a page first encountered as a link, incremented by one each time the
    page is encountered as a link again during the traversal.

    Args:
        G: Directed graph to extend; it is modified in place.
        base: Title of the Wikipedia page to start from.
        limit: Maximum number of links followed per page.
        depth: Number of link levels to expand; with ``depth <= 0`` only the
            ``base`` node is added and no request is made.

    Returns:
        The same graph object ``G``. If the request for a page fails, the error
        is printed and that page is not expanded (a failing start page is not
        added to the graph).
    """
    return _expand_links(G, base, limit, depth, {})


# Initialize an empty graph
G = nx.DiGraph()

# Example starting point
start_page = "Human"

# Crawl the links reachable from the starting page (the start node is added by BFS_Links)
G = BFS_Links(G, start_page, limit=5, depth=3)

# Now we have a graph with nodes and edges representing Wikipedia pages and links between them

# Example analysis: draw the graph.
# A fixed seed makes the spring layout (and therefore the figure) reproducible across runs.
plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos=pos, with_labels=True, node_size=50, node_color="skyblue", font_size=5, font_weight="bold")
plt.title(f"Graph of Wikipedia Links for '{start_page}'")
plt.show()

# PageRank analysis (calculating importance of nodes)
pagerank = nx.pagerank(G)
# Print the scores from the most to the least important node so the output is readable
pagerank_ranked = dict(sorted(pagerank.items(), key=lambda item: item[1], reverse=True))
print("PageRank of nodes:", pagerank_ranked)