In [None]:
# Import Base libs for read and show data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn 
import pathlib as path
from datasets import load_dataset

In [None]:
# const values
BASE_DIR = path.PosixPath('train.csv')

In [None]:
# Load dataset from CSV file
dataset_t = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset')['train'].to_pandas()
dataset_v = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset')['validation'].to_pandas()

In [None]:
# Visualize some dataset entry
dataset_t.head(n=10)

In [None]:
# Plot values distribution to observer particular carachteristics
# test dataset
category_dist_t = pd.Series.value_counts((dataset_t['category']))
type_dist_t = pd.Series.value_counts((dataset_t['type']))
sub_dist_t = pd.Series.value_counts((dataset_t['subcategory']))
label_dist_t = pd.Series.value_counts((dataset_t['label']))
# validation dataset
category_dist_v = pd.Series.value_counts((dataset_v['category']))
type_dist_v = pd.Series.value_counts((dataset_v['type']))
sub_dist_v = pd.Series.value_counts((dataset_v['subcategory']))
label_dist_v = pd.Series.value_counts((dataset_v['label']))

print(label_dist_t)

# Gaussian distribution
print(sub_dist_t)

# unbalanced
print(type_dist_t)

# category
print(category_dist_t)


## Category Analysis

Look up to exploit some information on category, analyzing statistical aspects such as data distribution

In [None]:
# Plots category distribution
plt.figure(figsize=(30,12))
plt.title('Category Distribution')
plt.xlabel('category')
plt.ylabel('num of occurance')
plt.bar(category_dist_t.index, category_dist_t, 0.5, label='Train')

plt.bar(category_dist_v.index, category_dist_v, 0.5, label='Test')
plt.legend(loc='upper left', ncols=2)

## Subcategory Analysis

Look up to exploit some information on category analyzing statistical aspects, such as data distribution and visual rappresentation

In [None]:
# Plots category distribution

plt.figure(figsize=(14,20))
plt.title('Subcategory Distribution')
plt.ylabel('Subcategory')
plt.xlabel('Num of Occurance')
plt.barh(sub_dist_t.index, sub_dist_t, 0.5, label='Train')
plt.barh(sub_dist_v.index, sub_dist_v, 0.5, label='Test')
plt.legend(loc='upper left', ncols=2)

## Labels Analysis

Check the number of classes to identify, this analysis is useful to find unbalanced data factor

In [None]:
# Plots category distribution

plt.figure()
plt.title('Labels Distribution')
plt.ylabel('Labels')
plt.xlabel('Num of Occurance')
plt.barh(label_dist_t.index, label_dist_t, label='Train')
plt.barh(label_dist_v.index, label_dist_v, label='Test')
plt.legend(loc='upper left', ncols=2)

## Type Analysis

In [None]:
# Plots category distribution
plt.figure(figsize=(30,12))
plt.title('Category Distribution')
plt.xlabel('Category')
plt.ylabel('Num of Occurance')
plt.bar(type_dist_t.index, type_dist_t, 0.5, label='Train')
plt.bar(type_dist_v.index, type_dist_v, 0.5, label='Test')
plt.legend(loc='upper left', ncols=2)

## Languages Analysis

In [None]:
#TODO

## Links Analysis

In [None]:
#TODO

## Hyperlinks Modelling

### Parse of Links

In [None]:
import requests
from bs4 import BeautifulSoup
import re

def extract_relevant_links(response, maxlinks):
    data = response.json()
    pages = data.get("query", {}).get("pages", {})

    valid_links = []

    for page_id in pages:
        page = pages[page_id]
        links = page.get("links", [])
        print(f"Number of Total Links Found: {len(links)}")
        for link in links:
            title = link["title"]

            # Filtri per escludere link inutili
            if ":" in title or "#" in title:
                continue
            if not re.match(r'^[A-Za-z0-9 \-()%]+$', title):
                continue
            if len(title) < 2 or title.lower() in ['edit', 'citation']:
                continue

            if title not in valid_links:
                valid_links.append(title)
            if False and len(valid_links) >= maxlinks:
                break
        
    print(f"Number of Parsed Links Found: {len(valid_links)}")

    return valid_links

In [18]:
import networkx as nx
import matplotlib.pyplot as plt


def BFS_Links(G: nx.DiGraph, base: str, limit: int = 50, depth: int = 5) -> nx.DiGraph:
    # Add the base node if it's not already in the graph
    print(depth)
    if not depth:
        return G
    
    if not G.has_node(base):
        G.add_node(base, count=1)
        G.nodes[base]['visited'] = False  # mark as not visited

    if G.nodes[base]['visited']:  
        for n in G.neighbors(base):
            G.nodes[n]['count'] += 1

        return G
           
    url = f"https://en.wikipedia.org/w/api.php?action=query&titles={base}&prop=links&pllimit=max&format=json"
    params = {
        "action": "query",
        "titles": "Human",
        "prop": "links",
        "pllimit": "max",
        "plnamespace": "0",  # Solo namespace 0 per evitare link non pertinenti
        "format": "json"
    }

    try:
        response = requests.get(url, timeout=10, params=params)
        response.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error: {http_err}")
        return G
    except requests.exceptions.Timeout:
        print("Request timed out.")
        return G
    except requests.exceptions.RequestException as err:
        print(f"Request error: {err}")
        return G
    except ValueError as json_err:
        print(f"JSON parsing error: {json_err}")
        return G


    if depth == 0:
        return G

    # Extract links from the API response
    links = extract_relevant_links(response, maxlinks=limit)

    G.nodes[base]['visited'] = True  # mark base as visited

    if len(links) == 0:
        return G
    
    # Explore the links
    for link in links:
        if not G.has_node(link):
            G.add_node(link, count=1)  # new node → count as first visit
            G.nodes[link]['visited'] = False  # mark as not visited

        G.add_edge(base, link)  # add edge between base and link

        print(f"Node '{link}' has been seen {G.nodes[link]['count']} times.")
        
    

    # Recursive call to explore all links
    for link in G.neighbors(base):
        G = BFS_Links(G, link, limit, depth - 1) # recursive call
        

    return G


# Initialize an empty graph
G = nx.DiGraph()

# Example starting point
start_page = "Human"
url = f"https://en.wikipedia.org/w/api.php?action=query&titles={start_page}&prop=links&pllimit=max&format=json"
# Add the starting page to the graph
# Recall the BFS_Links function
G = BFS_Links(G, start_page, limit=50, depth=5)

# Now we have a graph with nodes and edges representing Wikipedia pages and links between them

# Example analysis: Draw the graph
plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G)
nx.draw(G,pos=pos, with_labels=True, node_size=50, node_color="skyblue", font_size=10, font_weight="bold")
plt.title(f"Graph of Wikipedia Links for '{start_page}'")
plt.show()

# PageRank analysis (calculating importance of nodes)
pagerank = nx.pagerank(G)
print("PageRank of nodes:", pagerank)

5
Number of Total Links Found: 500
Number of Parsed Links Found: 491
Node '10th edition of Systema Naturae' has been seen 1 times.
Node 'Abstraction' has been seen 1 times.
Node 'Accelerating change' has been seen 1 times.
Node 'Achaemenid Empire' has been seen 1 times.
Node 'Actionbioscience' has been seen 1 times.
Node 'Adaptation' has been seen 1 times.
Node 'Adolescence' has been seen 1 times.
Node 'Adolescent' has been seen 1 times.
Node 'Adoption' has been seen 1 times.
Node 'Adult' has been seen 1 times.
Node 'Aesthetics' has been seen 1 times.
Node 'Affect (psychology)' has been seen 1 times.
Node 'Affinity (law)' has been seen 1 times.
Node 'African continent' has been seen 1 times.
Node 'Age of Discovery' has been seen 1 times.
Node 'Age of Enlightenment' has been seen 1 times.
Node 'Age of Revolution' has been seen 1 times.
Node 'Agriculture' has been seen 1 times.
Node 'Air conditioning' has been seen 1 times.
Node 'Airplane' has been seen 1 times.
Node 'Akkadian language' 

KeyboardInterrupt: 