# Install Requirements

In [None]:
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.4


# Crawl Data

In [None]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
def crawl(url_to_crawl):
  url = "https://scraper-api.decodo.com/v2/scrape"

  payload = {
        "url": url_to_crawl
  }

  headers = {
      "accept": "application/json",
      "content-type": "application/json",
      "authorization": "Basic VTAwMDAyODQ4ODE6UFdfMTJjODgzZmQyNDY5OWZjNGU1ZWJkYzYzYTE5MmVjZTQ3"
  }

  response = requests.post(url, json=payload, headers=headers)

  return response

In [None]:
def get_article_text(article_url):
  try:
    crawled_article = crawl(article_url)
    crawled_article_json = json.loads(crawled_article.text)

    status_code = crawled_article_json['results'][0]['status_code']
    if status_code != 200:
      return None

    html_string = crawled_article_json['results'][0]['content']
    soup = BeautifulSoup(html_string, 'html.parser')

    # Get Article Text
    story_div = soup.find('div', id='storytext')
    if story_div is None:
      return None

    text = story_div.get_text(strip=True, separator='\n')

    return text
  except Exception as e:
    print(e)
    return None

In [None]:
def get_next_article(category_url, batch_size=10):
  start_index = 1
  while True:
    crawled_page = crawl(f"{category_url}?start={start_index}&count={batch_size}")
    crawled_page_json = json.loads(crawled_page.text)

    status_code = crawled_page_json['results'][0]['status_code']
    if status_code != 200:
      break

    html_string = crawled_page_json['results'][0]['content']
    soup = BeautifulSoup(html_string, 'html.parser')


    for article in soup.find_all('article'):
      anchor_tag = article.find('a')
      if anchor_tag is None:
        continue
      article_url = anchor_tag['href']
      article_text = get_article_text(article_url)
      if article_text is None:
        continue
      yield article_text

    start_index += batch_size

In [None]:
urls_to_crawl = {
    "politics": "https://www.npr.org/get/1014/render/partial/next",  #?start=11&count=20
    "business": "https://www.npr.org/get/1006/render/partial/next",
    "health": "https://www.npr.org/get/1128/render/partial/next",
    "science": "https://www.npr.org/get/1007/render/partial/next",
    "climate": "https://www.npr.org/get/1167/render/partial/next"
}

In [None]:
data = []

for news_category, category_url in urls_to_crawl.items():
  print(f"Crawling: {news_category}")
  articles_crawled = 0
  for article_text in get_next_article(category_url):
    data.append({'news_category': news_category, 'article': article_text})
    articles_crawled += 1
    print(f"Crawled: {articles_crawled} articles")
    if articles_crawled >= 100:
      break

Crawling: politics
Crawled: 1 articles
Crawled: 2 articles
Crawled: 3 articles
Crawled: 4 articles
Crawled: 5 articles
Crawled: 6 articles
Crawled: 7 articles
Crawled: 8 articles
Crawled: 9 articles
Crawled: 10 articles
Crawled: 11 articles
Crawled: 12 articles
Crawled: 13 articles
Crawled: 14 articles
Crawled: 15 articles
Crawled: 16 articles
Crawled: 17 articles
Crawled: 18 articles
Crawled: 19 articles
Crawled: 20 articles
Crawled: 21 articles
Crawled: 22 articles
Crawled: 23 articles
Crawled: 24 articles
Crawled: 25 articles
Crawled: 26 articles
Crawled: 27 articles
Crawled: 28 articles
Crawled: 29 articles
Crawled: 30 articles
Crawled: 31 articles
Crawled: 32 articles
Crawled: 33 articles
Crawled: 34 articles
Crawled: 35 articles
Crawled: 36 articles
Crawled: 37 articles
Crawled: 38 articles
Crawled: 39 articles
Crawled: 40 articles
Crawled: 41 articles
Crawled: 42 articles
Crawled: 43 articles
Crawled: 44 articles
'results'
Crawled: 45 articles
Crawled: 46 articles
Crawled: 47 ar

In [None]:
df = pd.DataFrame(data)

In [None]:
df.to_csv('news_articles_dataset.csv', index=False)

# 1. Parameters and Reading Dataset

In [None]:
import pandas as pd
import huggingface_hub

In [None]:
dataset_csv_path = 'news_articles_dataset.csv'
text_column_name = 'article'
label_column_name = 'news_category'
test_size = 0.2
num_labels = 2

model_name = 'meta-llama/Llama-3.2-1B'
hf_token = 'YOUR TOKEN HERE'

In [None]:
df = pd.read_csv('news_articles_dataset.csv')
num_labels = df['news_category'].nunique()

In [None]:
num_labels

5

In [None]:
huggingface_hub.login(hf_token)

# 2. Clean Data

In [None]:
from bs4 import BeautifulSoup
import re

In [None]:
class Cleaner():
  def __init__(self):
    pass
  def remove_html_tags(self, text):
    clean_text = BeautifulSoup(text, 'lxml').text
    return clean_text
  def remove_double_spaces(self, text):
    clean_text = re.sub(r' +', ' ', text)
    return clean_text
  def clean(self, text):
    clean_text = self.remove_html_tags(text)
    clean_text = self.remove_double_spaces(clean_text)
    return clean_text

In [None]:
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

# 3. Wrangle Data

## Label Encoder

In [None]:
from sklearn import preprocessing

In [None]:
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())
df['label'] = le.transform(df[label_column_name].tolist())

## Train/Test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_train, df_test = train_test_split(df, test_size=test_size)

In [None]:
df_train.shape, df_test.shape

((400, 4), (100, 4))

In [None]:
df_train = df_train[['text_cleaned', 'label']]
df_test = df_test[['text_cleaned', 'label']]

In [None]:
df_train.head()

Unnamed: 0,text_cleaned,label
372,Enlarge this image\nThe inside of a cell is a ...,4
307,A 5-year-old Customs and Border Protection bea...,4
457,"""You do not have to spend hundreds of dollars ...",1
109,Enlarge this image\nGetty Images\nGetty Images...,0
128,An AI-generated image of a fighter plane shot ...,0


## Convert to HuggingFace Dataset

In [None]:
from datasets import Dataset

In [None]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

## Tokenizer

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [None]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text_cleaned"], truncation=True)

In [None]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# 4. Initialize the model

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.pad_token_id = model.config.eos_token_id

In [None]:
number_of_layers = 0
for param in model.base_model.parameters():
    number_of_layers += 1
print(f"Number of Layers: {number_of_layers}")

Number of Layers: 146


In [None]:
layer_no = 0
for param in model.base_model.parameters():
  if layer_no >= number_of_layers - 25:
    break
  number_of_layers += 1
  param.requires_grad = False

# 5. Train the model

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
  logits,labels = eval_pred
  predictions = np.argmax(logits,axis=-1)
  return metric.compute(predictions=predictions,references=labels)

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,

    report_to="none",
    fp16=True,

    learning_rate=2e-4,
    weight_decay=0.01,

    save_steps=2000
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

Step,Training Loss
500,1.4245


Step,Training Loss
500,1.4245
1000,0.7855
1500,0.6202
2000,0.5152


TrainOutput(global_step=2000, training_loss=0.8363398132324219, metrics={'train_runtime': 1173.4189, 'train_samples_per_second': 3.409, 'train_steps_per_second': 1.704, 'total_flos': 3.912715636993229e+16, 'train_loss': 0.8363398132324219, 'epoch': 10.0})

In [None]:
le.classes_

array(['business', 'climate', 'health', 'politics', 'science'],
      dtype='<U8')

In [None]:
model.config.id2label = {i: label for i, label in enumerate(le.classes_)}
model.config.label2id = {label:i for i, label in enumerate(le.classes_)}

In [None]:
trainer.save_model('./news_classifier_model')
tokenizer.save_pretrained('./news_classifier_model')

('./news_classifier_model/tokenizer_config.json',
 './news_classifier_model/special_tokens_map.json',
 './news_classifier_model/tokenizer.json')

In [39]:
# Save in HuggingFace hub -- Make sure to have your token to have Write acess
model.push_to_hub("news-classifier-model")
trainer.push_to_hub("news-classifier-model")
tokenizer.push_to_hub("news-classifier-model")

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mgulati3/news-classifier-model/commit/f84446a512e292ec58bc7233c7a93ef240346437', commit_message='Upload tokenizer', commit_description='', oid='f84446a512e292ec58bc7233c7a93ef240346437', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mgulati3/news-classifier-model', endpoint='https://huggingface.co', repo_type='model', repo_id='mgulati3/news-classifier-model'), pr_revision=None, pr_num=None)

# 6. Evaluate Model

In [40]:
from sklearn.metrics import classification_report

In [41]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=-1)
GT = df_train['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87        84
           1       0.74      0.86      0.80        78
           2       0.84      0.86      0.85        76
           3       0.84      0.80      0.81        83
           4       0.83      0.67      0.74        79

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [42]:
preds

array([4, 4, 0, 0, 0, 4, 1, 0, 1, 1, 1, 4, 4, 0, 2, 0, 3, 1, 2, 2, 0, 3,
       0, 0, 3, 0, 1, 1, 2, 2, 2, 4, 2, 1, 1, 0, 0, 1, 1, 4, 2, 0, 2, 1,
       2, 1, 3, 0, 3, 1, 2, 3, 2, 2, 2, 4, 3, 3, 2, 3, 4, 0, 0, 2, 4, 0,
       4, 3, 3, 2, 2, 1, 3, 3, 4, 2, 1, 2, 0, 0, 2, 3, 1, 2, 3, 0, 0, 1,
       3, 0, 0, 3, 1, 0, 0, 0, 1, 4, 3, 0, 0, 1, 1, 3, 0, 3, 0, 1, 0, 1,
       2, 0, 3, 0, 3, 3, 3, 3, 3, 1, 0, 0, 2, 0, 1, 1, 0, 1, 3, 4, 4, 1,
       4, 1, 3, 1, 3, 2, 0, 2, 0, 3, 2, 2, 1, 3, 1, 4, 3, 0, 4, 0, 2, 0,
       0, 1, 4, 3, 0, 3, 3, 3, 1, 4, 1, 2, 2, 4, 4, 2, 1, 2, 2, 4, 4, 0,
       2, 3, 2, 3, 0, 2, 0, 0, 1, 1, 0, 2, 3, 0, 1, 2, 1, 3, 2, 2, 3, 2,
       0, 0, 3, 4, 1, 4, 3, 1, 3, 4, 1, 0, 1, 1, 0, 1, 4, 3, 4, 2, 1, 3,
       0, 0, 0, 4, 2, 2, 0, 2, 4, 2, 3, 3, 1, 4, 2, 4, 0, 0, 2, 2, 4, 2,
       4, 0, 0, 4, 2, 2, 0, 4, 4, 1, 0, 1, 1, 2, 3, 1, 4, 1, 3, 2, 1, 3,
       3, 1, 1, 4, 3, 2, 3, 4, 3, 1, 3, 0, 3, 0, 1, 1, 1, 2, 4, 2, 1, 1,
       3, 0, 4, 1, 1, 1, 3, 3, 2, 1, 3, 4, 4, 0, 1,

In [43]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=-1)
GT = df_test['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.56      0.62      0.59        16
           1       0.59      0.59      0.59        22
           2       0.65      0.62      0.64        24
           3       0.45      0.59      0.51        17
           4       0.73      0.52      0.61        21

    accuracy                           0.59       100
   macro avg       0.60      0.59      0.59       100
weighted avg       0.61      0.59      0.59       100



# 7. Model Inference

In [45]:
from transformers import pipeline

In [46]:
classifier = pipeline("text-classification",
                      model="./news_classifier_model",
                      tokenizer="./news_classifier_model"
                      )

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './news-classifier-model'.

In [57]:
classifier = pipeline("text-classification",
                      model="mgulati3/news-classifier-model",
                      tokenizer="mgulati3/news-classifier-model"
                      )

Device set to use cuda:0


In [58]:
example_article = """The Trump administration is formally shutting down the United States Agency for International Development today, after cancelling 83% of its programs earlier this year.

The administration says the agency has misspent billions in funds and "has little to show since the end of the Cold War."

That argument clashes with a new study published Monday in the medical journal, the Lancet. The study estimates that USAID programs have saved over 90 million lives over the past two decades. The researchers also estimate that if the current cuts continue through 2030, 14 million people who might have otherwise lived could die.
"""

In [59]:
result = classifier(example_article)
print(result)

[{'label': 'health', 'score': 0.74128258228302}]
