# News Headline Classification with torchFastText

In [23]:
!pip install -r requirements.txt

Collecting mlflow (from -r requirements.txt (line 6))
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow->-r requirements.txt (line 6))
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow->-r requirements.txt (line 6))
  Downloading flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting alembic!=1.10.0,<2 (from mlflow->-r requirements.txt (line 6))
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow->-r requirements.txt (line 6))
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow->-r requirements.txt (line 6))
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow->-r requirements.txt (line 6))
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting markdown<4,>=3.3 (from mlflow->-r requirements.txt (line 6))
  Do

## Get the data

In [13]:
import pandas as pd

# Remote parquet file holding the news headlines (AG News, judging by the file name).
AG_NEWS_URL = "https://minio.lab.sspcloud.fr/h4njlg/public/ag_news_full_1M.parquet"

df = pd.read_parquet(AG_NEWS_URL)


In [14]:
# Number of rows per raw category, most frequent first.
category_counts = df['category'].value_counts()
category_counts

category
World                        186896
Sci/Tech                     154869
Business                     146646
Entertainment                137437
Italia                       133428
Top News                     126514
Sports                       118131
Europe                        90573
Top Stories                   61579
U.S.                          47707
Health                        42629
Software and Developement     19041
Toons                          8016
Music Feeds                    7632
Name: count, dtype: int64

## Prepping the data 


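Let's merge some categories: several of the 14 raw labels overlap (e.g. `World`, `Top News`, `Europe`) and others are very small, so we collapse them into five coarser classes.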

In [15]:
def merge_cat(cat):
    """Map a raw category label onto its merged training label.

    Labels listed under 'World News' or 'Tech and Stuff' are replaced by that
    group name; any other value is returned unchanged.

    Args:
        cat: Raw category value taken from the ``category`` column.

    Returns:
        The merged group name, or ``cat`` itself when it belongs to no group.
    """
    merged_groups = (
        ('World News', ['World', 'Top News', 'Europe', 'Italia', 'U.S.', 'Top Stories']),
        ('Tech and Stuff', ['Sci/Tech', 'Software and Developement', 'Toons', 'Health', 'Music Feeds']),
    )
    for merged_label, raw_labels in merged_groups:
        if cat in raw_labels:
            return merged_label
    # Not part of any merged group: keep the original label.
    return cat


In [16]:
# Replace each raw category with its merged label (merge_cat is passed directly;
# wrapping it in a lambda adds nothing).
df['category_final'] = df['category'].apply(merge_cat)

# A cell only echoes its last expression, so the class balance has to be shown
# explicitly. `display` is provided by the IPython kernel.
display(df['category_final'].value_counts())

# Fixed seed so the preview rows are the same on every run.
df[['title', 'description', 'category_final']].sample(5, random_state=42)

Unnamed: 0,title,description,category_final
593220,"Sgrena, Prodi sforzo congiunto",Volonta compatta per fare tutto cio che ci e p...,World News
97506,Oracle Ruling May Embolden Dealmakers (Reuters),Reuters - A federal judge may have sent a gift...,Tech and Stuff
919916,Comic for 16 Jun 2007,"<img src=""http://www.comics.com/creators/wizar...",Tech and Stuff
676380,Profit taking drives European stocks lower \n ...,FT.com - European equities turned positive by ...,World News
980182,Throttle teacher is struck off,The General Teaching Council of Wales strikes ...,World News


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Integer-encode the merged category names; the fitted encoder is kept so that
# predictions can be decoded back to names later on.
cat_encoder = LabelEncoder()
df['cat'] = cat_encoder.fit_transform(df['category_final'])

# The text fed to the model is the headline itself.
df['title_headline'] = df['title']

# Hold out 10% of the rows (stratified on the encoded label) as the final test set.
news_train, news_test = train_test_split(
    df,
    test_size=0.10,
    stratify=df['cat'],
    shuffle=True,
    random_state=42,
)

X = news_train['title_headline']
y = news_train['cat']

# Split the remaining rows again. X_test / y_test are what the training cell
# passes as its second pair of arrays; the held-out set is news_test.
# test_size=0.25 is scikit-learn's default, written out here for clarity.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    shuffle=True,
    random_state=42,
)

## Training a basic model 

Predict `category_final` from `title`.

In [22]:
# Build a torchFastText classifier and train it on the headline split.
from torchFastText import torchFastText
import numpy as np

from lightning.pytorch.loggers import MLFlowLogger
# NOTE(review): mlf_logger is created here but is not passed to the model or to
# train() anywhere in this notebook, so it has no visible effect. If MLflow
# tracking is wanted it presumably has to be handed over via trainer_params
# (e.g. a 'logger' entry) -- TODO confirm against the torchFastText API.
mlf_logger = MLFlowLogger(experiment_name="ag_headline_classification")

# Model / tokenizer hyperparameters (see the torchFastText docs for their exact meaning).
torchft_model = torchFastText( 
        num_tokens=2000,
        embedding_dim=50,
        min_count=2,
        min_n=2,
        max_n=3,
        len_word_ngrams=4,
        sparse=False,
    )
    
# Train the model.
# X_test / y_test are the split carved out of news_train in the previous cell,
# not the held-out news_test set used for the final evaluation.
torchft_model.train(
          np.asarray(X_train),
          np.asarray(y_train),
          np.asarray(X_test),
          np.asarray(y_test),
          lr=0.01,
          num_epochs=5,
          batch_size=32,
          trainer_params={'enable_progress_bar': True}
    )


2025-05-21 08:04:58 - torchFastText.utilities.checkers - No categorical_vocabulary_sizes. It will be inferred later.
2025-05-21 08:04:58 - torchFastText.model.pytorch_model - num_rows is different from the number of tokens in the tokenizer. Using provided num_rows.
2025-05-21 08:04:58 - torchFastText.torchFastText - No scheduler parameters provided. Using default parameters (suited for ReduceLROnPlateau).
2025-05-21 08:04:58 - torchFastText.datasets.dataset - Creating DataLoader with 12 workers.
2025-05-21 08:04:58 - torchFastText.datasets.dataset - Creating DataLoader with 12 workers.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type               | Params | Mode 
-----------------------------------------------------------
0 | model       | FastTextModel      | 7.9 M  | train
1 | loss        | CrossEntropyLoss   | 0      | train
2 | accuracy_fn | Multicla

Epoch 0:  78%|███████▊  | 21067/27024 [05:10<01:27, 67.79it/s, v_num=3, train_loss_step=0.543]


Detected KeyboardInterrupt, attempting graceful shutdown ...
Exception in thread Thread-12 (_pin_memory_loop):
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/local/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop
    do_one_step()
  File "/usr/local/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fi

NameError: name 'exit' is not defined

## Evaluate the model

In [None]:
 # Smoke test on one hand-written headline: take the first element returned by
 # predict(), flatten it, and decode it back to category names with cat_encoder.
 cat_encoder.inverse_transform(torchft_model.predict(np.asarray(['Superstar died at 20 in hotel room']))[0].reshape(-1))

In [20]:
%%time
predictions,_ = torchft_model.predict(np.asarray(news_test['title_headline']))
predictions_decoded = cat_encoder.inverse_transform(predictions.reshape(-1))
print (f"Accuracy : {(predictions_decoded.reshape(-1) == news_test['category_final']).mean():0.2%}")

KeyboardInterrupt: 

In [None]:
# Re-print the held-out accuracy from predictions_decoded, which the previous
# cell computes.
accuracy = (predictions_decoded.reshape(-1) == news_test['category_final']).mean()
print(f"Accuracy : {accuracy:0.2%}")