In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [1]:
import http.client
from urllib.parse import urlencode
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import huggingface_hub
import evaluate
from transformers import DataCollatorWithPadding
from sklearn.metrics import classification_report
from transformers import TrainingArguments, Trainer

In [None]:
def crawl(url):
    conn = http.client.HTTPSConnection("universalapi.thordata.com")

    payload = {
        "url": url,
        "type": "html",
        "js_render": "False"
    }

    form_data = urlencode(payload)

    headers = {
        'Authorization': "Bearer --",
        'content-type': "application/x-www-form-urlencoded"
    }

    conn.request("POST", "/request", form_data, headers)

    res = conn.getresponse()
    data = res.read()

    return data.decode("utf-8"), res.status

In [None]:
urls_to_crawl = {
    "politics": "https://www.npr.org/get/1014/render/partial/next",
    "business": "https://www.npr.org/get/1006/render/partial/next",
    "health": "https://www.npr.org/get/1128/render/partial/next",
    "science": "https://www.npr.org/get/1007/render/partial/next",
    "climate": "https://www.npr.org/get/1167/render/partial/next",
}

In [None]:
def get_text_from_article(url):
   try:
    crawled_page, status = crawl(url)
    if status != 200:
      return None
    crawled_page_json = json.loads(crawled_page)
    soup = BeautifulSoup(crawled_page_json['html'], 'html.parser')
    div = soup.find('div', id='storytext')
    if div is None:
      return None
    text_dev = div.get_text(strip=True, separator='\n')
    return text_dev
   except:
    return "there is an error in 'get_text_from_article'"

In [None]:
def get_article_from_url(url, batch_size=10):
  try:
    start = 1
    while True:
      url = f"{url}?start={start}&count={batch_size}"
      crawled_page, status = crawl(url)
      if status != 200:
        return None
      crawled_page_json = json.loads(crawled_page)['html']
      soup = BeautifulSoup(crawled_page_json, 'html.parser')
      for article in soup.find_all('article'):
        href = article.find('a')['href']
        if href is None:
          continue
        text = get_text_from_article(href)
        if text is None:
          continue
        yield text
      start += batch_size
  except:
    return "there is an error in 'get_article_from_url'"

In [None]:
data = []
for category, url in urls_to_crawl.items():
  print(f"aricles crwaled {category}: ")
  articles_crawled = 1
  for article_text in get_article_from_url(url):
     data.append({"news_category": category, "article": article_text})
     print(f"{articles_crawled}")
     articles_crawled += 1
     if articles_crawled > 100:
      break

aricles crwaled politics: 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
aricles crwaled business: 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
aricles crwaled health: 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
aricles crwaled science: 
1
2
3
4
5
6
7
8
9
1

In [None]:
df = pd.DataFrame(data)
df.to_csv("articles.csv", index=False)

##Data Cleaning and Preparation

In [2]:
df = pd.read_csv("articles.csv")
df.head()

Unnamed: 0,news_category,article
0,politics,A pair of Senate Democrats is responding to\na...
1,politics,Gold is having its hottest year in nearly half...
2,politics,there is an error in 'get_text_from_article'
3,politics,"James Comey, former FBI director, speaks at a ..."
4,politics,"From left, Senate Judiciary Committee Chairman..."


In [3]:
def clean(text):
  text = BeautifulSoup(text, 'html.parser').text
  text = re.sub(r'\s+', ' ', text).strip()
  text = re.sub(r'\\+', '', text)
  return text

df['article'] = df['article'].apply(clean)

In [4]:
 le = LabelEncoder()
 le.fit(df['news_category'])
 df['label'] = le.transform(df['news_category'])

In [5]:
data = df[['article', 'label']]
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

##Tokinizer and Model

In [None]:
model_name = 'Qwen/Qwen2.5-0.5B'
hf_token = '--'
huggingface_hub.login(hf_token)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
def preprocess_func(examples):
  return tokenizer(examples['article'], truncation=True)

tokenized_train = train_dataset.map(preprocess_func, batched=True)
tokenized_test = test_dataset.map(preprocess_func, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
import torch
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_), dtype=torch.bfloat16)
model.config.pad_token_id = model.config.eos_token_id
# model.gradient_checkpointing_enable()

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
total_layers = 0
for params in model.base_model.parameters():
  total_layers +=1

total_layers

290

In [12]:
frozen_layer = 0
for params in model.base_model.parameters():
   if frozen_layer > total_layers - 50:
     break
   params.requires_grad = False
   frozen_layer +=1
print(frozen_layer)

241


In [13]:
metric = evaluate.load('accuracy')
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [18]:
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    bf16=True,
    report_to='none',
    save_steps=1000,
    gradient_accumulation_steps=8
)
    #  , gradient_checkpointing=True


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=125, training_loss=3.56856005859375, metrics={'train_runtime': 1922.2279, 'train_samples_per_second': 1.04, 'train_steps_per_second': 0.065, 'total_flos': 6977060499225600.0, 'train_loss': 3.56856005859375, 'epoch': 5.0})

##Evaluation

In [19]:
pred = trainer.predict(tokenized_train)
pred = np.argmax(pred[:3][0], axis=-1)
gt = train_data['label'].to_list()
eval_train = classification_report(gt, pred)
eval_train

'              precision    recall  f1-score   support\n\n           0       0.81      1.00      0.90        86\n           1       0.92      1.00      0.96        76\n           2       0.92      0.81      0.86        90\n           3       1.00      0.64      0.78        72\n           4       0.80      0.91      0.85        76\n\n    accuracy                           0.88       400\n   macro avg       0.89      0.87      0.87       400\nweighted avg       0.89      0.88      0.87       400\n'

In [None]:
       precision    recall  f1-score   support

           0       0.81      1.00      0.90        86
           1       0.92      1.00      0.96        76
           2       0.92      0.81      0.86        90
           3       1.00      0.64      0.78        72
           4       0.80      0.91      0.85        76

    accuracy                           0.88       400
   macro avg       0.89      0.87      0.87       400
weighted avg       0.89      0.88      0.87       400

In [20]:
pred = trainer.predict(tokenized_test)
pred = np.argmax(pred[:3][0], axis=-1)
gt = test_data['label'].to_list()
eval_test = classification_report(gt, pred)
eval_test

'              precision    recall  f1-score   support\n\n           0       0.67      1.00      0.80        14\n           1       0.89      1.00      0.94        24\n           2       0.70      0.70      0.70        10\n           3       1.00      0.64      0.78        28\n           4       0.88      0.88      0.88        24\n\n    accuracy                           0.84       100\n   macro avg       0.83      0.84      0.82       100\nweighted avg       0.87      0.84      0.84       100\n'

In [None]:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80        14
           1       0.89      1.00      0.94        24
           2       0.70      0.70      0.70        10
           3       1.00      0.64      0.78        28
           4       0.88      0.88      0.88        24

    accuracy                           0.84       100
   macro avg       0.83      0.84      0.82       100
weighted avg       0.87      0.84      0.84       100


##Saving Locally and Push to hugging Face Hub

In [22]:
model.config.id2label = {i: label for i, label in enumerate(le.classes_)}
model.config.label2id = {label: i for i, label in enumerate(le.classes_)}
trainer.save_model("./news_classifier_model")
tokenizer.save_pretrained("./news_classifier_model")

('./news_classifier_model/tokenizer_config.json',
 './news_classifier_model/special_tokens_map.json',
 './news_classifier_model/chat_template.jinja',
 './news_classifier_model/vocab.json',
 './news_classifier_model/merges.txt',
 './news_classifier_model/added_tokens.json',
 './news_classifier_model/tokenizer.json')

In [None]:
model.push_to_hub("news_classifier_model")
tokenizer.push_to_hub("news_classifier_model")
trainer.push_to_hub("news_classifier_model")

##Inference

In [23]:
from transformers import pipeline
classifier = pipeline(
    "text-classification",
    model="./news_classifier_model",
    tokenizer="./news_classifier_model"
)

Device set to use cuda:0


In [24]:
report = """Most of the federal government is shut down after Senate Republicans and Democrats failed to reach a funding agreement to keep it open.
Democrats wanted to extend Affordable Care Act (ACA) subsidies used to make health insurance premiums more affordable for millions, but Republicans said they wanted to discuss extending the credits after reaching an agreement to fund the government. Democrats also wanted to repeal cuts made to healthcare programs by President Trump's signature legislation — the One Big Beautiful Bill Act.
Trump has blamed Democrats for the shutdown and said last week he would look for a way to do a "reduction in force" of federal workers in the event the government closed."""

response = classifier(report)
response

[{'label': 'politics', 'score': 0.9864209890365601}]

In [31]:
report = """The device is about the size of a AA battery, and it has the potential to help a baby or infant heart keep beating in the face of failure.
It's called the PediaFlow, an implantable artificial heart for the littlest, most vulnerable humans. James Antaki, a biomedical engineer at Cornell University in New York, has been developing this medical device for the last two decades.
As of last spring, it was in the final stages of research and manufacturing before clinical trials, funded by a $6 million, multiyear grant from the Department of Defense.
"""

response = classifier(report)
response

[{'label': 'science', 'score': 0.9990949630737305}]