In [124]:
!pip install datasets scienceplots

Collecting scienceplots
  Downloading SciencePlots-2.1.1-py3-none-any.whl.metadata (11 kB)
Downloading SciencePlots-2.1.1-py3-none-any.whl (16 kB)
Installing collected packages: scienceplots
Successfully installed scienceplots-2.1.1


In [125]:
!nvidia-smi

Fri Feb 21 10:32:22 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   54C    P0             29W /   70W |   11016MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [126]:
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import time
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import aiohttp
import asyncio
from tqdm.asyncio import tqdm
import torch
import scienceplots
plt.style.use(['science', 'no-latex'])

tqdm.pandas()

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Dataset Annalysis

We are creating a dataset of approximately 5,000 Portuguese texts to fine-tune a Small Language Model for summarization.  

To ensure high-quality data that enhances model performance, our corpus will:  

- Incorporate a mix of language styles, from formal to casual.  
- Cover a wide range of topics across various domains of knowledge.  
- Include texts of varying lengths to provide diverse contextual scenarios.


We will start with texts from wikipedia

In [2]:
dataset = load_dataset("wikimedia/wikipedia", "20231101.pt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
wikipedia_df = dataset['train'].to_pandas()

In [None]:
pages = wikipedia_df['title'].str.replace(" ", "_").tolist()

In [29]:


# Wikipedia Pageviews API URL and date range
PAGEVIEWS_API_URL = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
"pt.wikipedia.org/all-access/all-agents/{}/daily/{}/{}")
START_DATE = "20230101"
END_DATE = "20240201"

# Async function to fetch page views for a given Wikipedia page title
async def get_page_views(session, page_title):
    # Format the title for the API (replace spaces with underscores)
    url = PAGEVIEWS_API_URL.format(page_title, START_DATE, END_DATE)

    try:
        async with session.get(url) as response:
            if response.status == 200:
                data = await response.json()
                total_views = sum(item["views"] for item in data["items"])
                return (page_title, total_views)
            else:

                return (page_title, np.nan)
    except Exception as e:
        return (page_title, "Error")

# Async function to fetch page views in batches (e.g., 1000 pages per batch)
async def fetch_page_views_in_batches(pages, batch_size=1000):
    results = []
    async with aiohttp.ClientSession() as session:
        # Create a progress bar with the total number of pages
        pbar = tqdm(total=len(pages), desc="Fetching page views")
        for i in range(0, len(pages), batch_size):
            batch = pages[i:i+batch_size]
            tasks = [get_page_views(session, page) for page in batch]
            batch_results = await asyncio.gather(*tasks)
            results.extend(batch_results)
            pbar.update(len(batch))
        pbar.close()
    return results

# Example main async function
async def main():
    # Replace with your list of Wikipedia page titles (not URLs)
    pages = wikipedia_df['title'].str.replace(" ", "_").tolist()

    results = await fetch_page_views_in_batches(pages, batch_size=1_000)

    # Process results (for example, printing them)
    # for page, views in results:
    #     print(f"{page}: {views}")
    return results
# Run the async event loop
if __name__ == "__main__":
    results = asyncio.run(main())


Fetching page views: 100%|██████████| 1112246/1112246 [14:20<00:00, 1292.28it/s]


In [33]:
num_views_df = pd.DataFrame(results, columns = ['title', 'num_views'])
num_views_df['title'] = num_views_df['title'].str.replace('_', ' ')

In [35]:
num_views_df.dropna().sort_values('num_views')

Unnamed: 0,title,num_views
826005,+1,6.0
533001,Rio Brebu (Slănic),28.0
538578,Para Todo Mundo,28.0
1025577,Área de Conservação da Paisagem de Päite,30.0
796111,Asócio I,30.0
...,...,...
362,Estados Unidos,3007885.0
405,Clube de Regatas do Flamengo,3391981.0
29018,Cristiano Ronaldo,3704512.0
59000,AMBEV,4307627.0


In [38]:
wikipedia_with_views = pd.merge(
    wikipedia_df,
    num_views_df,
    on = 'title'
)

In [41]:
dataset = Dataset.from_pandas(wikipedia_with_views)
dataset.push_to_hub("peulsilva/wikipedia-pt")

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/186 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/186 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/186 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/186 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/186 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/186 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/peulsilva/wikipedia-pt/commit/34c196f86dc9523183150bf0d32873811241e839', commit_message='Upload dataset', commit_description='', oid='34c196f86dc9523183150bf0d32873811241e839', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/peulsilva/wikipedia-pt', endpoint='https://huggingface.co', repo_type='dataset', repo_id='peulsilva/wikipedia-pt'), pr_revision=None, pr_num=None)

# Data Selection

Now that we have the number of views for each wikipedia page in portuguese, we will take 5000 texts from these ones

In [6]:
wikipedia_with_views = load_dataset("peulsilva/wikipedia-pt")\
    ['train']\
    .to_pandas()

In [10]:
wikipedia_with_views['num_words'] = wikipedia_with_views['text']\
    .apply(lambda x: len(x.split(' ')))

In [11]:
wikipedia_with_views['num_words'].describe()

Unnamed: 0,num_words
count,1112246.0
mean,361.3684
std,875.0046
min,1.0
25%,52.0
50%,119.0
75%,339.0
max,69594.0


In [12]:
wikipedia_with_views.dropna()\
    .sort_values('num_views')\
    .tail(10)\
    .title

Unnamed: 0,title
882,Rússia
168102,Al-Nassr Football Club
1541,Israel
959,Terra
415211,Neymar
362,Estados Unidos
405,Clube de Regatas do Flamengo
29018,Cristiano Ronaldo
59000,AMBEV
126,Brasil


If we choose the top 5000, we would have a very low diversity. Note that in the top 10 pages, 4 are country names and other 4 are related to football

In [26]:
model = AutoModelForSequenceClassification.from_pretrained(
    "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    torch_dtype=torch.float16
).to('cuda')
tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")

BATCH_SIZE = 8

# Initialize the pipeline with the model and tokenizer; set device=0 to use GPU
classifier = pipeline(
    "zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
    device=0,  # ensure you're using a GPU,
    batch_size = BATCH_SIZE,
    padding = 'max_length'
)
# Define candidate labels (topics)


Device set to use cuda:0


In [56]:
def get_first_paragraphs(text):
    """Extracts the first paragraph from a given text."""
    paragraphs = text.split("\n\n")
    return '\n\n'.join(paragraphs[0:2]) if paragraphs else text.strip()

In [57]:
df = wikipedia_with_views.dropna()

df['first_paragraph'] = df['text'].apply(get_first_paragraphs)

null_mask = (df["first_paragraph"].str.len()  > 0) & (df["first_paragraph"].apply(lambda x: len(x.split(' ')))< 500)
df = df[null_mask]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['first_paragraph'] = df['text'].apply(get_first_paragraphs)


In [62]:

candidate_labels = ["politica e negócios", "esporte", "ciência e tecnologia", "cultura e arte",  "geografia e historia"]

# Container for the predicted topics
predicted_topics = []

for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Processing batches"):
    batch_texts = df["first_paragraph"].iloc[i: i + BATCH_SIZE].tolist()
    # Classify the batch of texts
    batch_results = classifier(
        batch_texts,
        candidate_labels,
        hypothesis_template = "Esse texto é sobre: {}"
    )

    # When classifying a list of texts, the pipeline returns a list of results.
    # For each result, take the top label as the predicted topic.
    for res in batch_results:
        predicted_topics.append(res["labels"][0])

# Add the predictions to the DataFrame
df["predicted_topic"] = predicted_topics

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Push the dataset to the Hugging Face Hub
# Replace 'your-username/your-dataset-name' with your repository name.
# Ensure that you have the proper authentication token.
dataset.push_to_hub("peulsilva/wikipedia-pt-topics2")

Processing batches: 100%|██████████| 10848/10848 [53:06<00:00,  3.40it/s]


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/87 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/peulsilva/wikipedia-pt-topics2/commit/93ddac9cf2809b6ea1873486208b6915d02b938a', commit_message='Upload dataset', commit_description='', oid='93ddac9cf2809b6ea1873486208b6915d02b938a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/peulsilva/wikipedia-pt-topics2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='peulsilva/wikipedia-pt-topics2'), pr_revision=None, pr_num=None)