In [1]:
%load_ext autoreload
%autoreload 2

# Introduction

# imports

In [2]:
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import torch
import pandas as pd

from news_classifier.data import assign_rows_to_split
from news_classifier.constants import DATASET, TRAIN, TEST, VALID, LABEL_COL
# notebook wide constants
DATA_DIR = Path('../data')

## Process Raw Data

In [3]:
# read data
df = pd.read_csv(DATA_DIR / 'news.csv')
df.head()

Unnamed: 0,category,title
0,Business,Wall St. Bears Claw Back Into the Black (Reuters)
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters)
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...
4,Business,"Oil prices soar to all-time record, posing new..."


In [4]:
# use the assign_rows_to_split to split rows into either train,test or valid while stratifying wrt the categories
splitted_df = assign_rows_to_split(df)

train_rows = splitted_df.query(f'{DATASET}=="{TRAIN}"')
valid_rows = splitted_df.query(f'{DATASET}=="{VALID}"')
test_rows = splitted_df.query(f'{DATASET}=="{TEST}"')

print(f'Train size {train_rows.size:_}, Valid size {valid_rows.size:_}, Test size {test_rows.size:_}')

Train size 252_000, Valid size 54_000, Test size 54_000


In [5]:
train_rows[LABEL_COL].value_counts()

World       21000
Sports      21000
Sci/Tech    21000
Business    21000
Name: category, dtype: int64

In [6]:
valid_rows[LABEL_COL].value_counts()

Sci/Tech    4500
World       4500
Sports      4500
Business    4500
Name: category, dtype: int64

In [7]:
# write splitted file to disk
fp = DATA_DIR / 'news_splitted.csv'
splitted_df.to_csv(fp, index=False)

In [72]:
# training
from news_classifier.data import Dataset, generate_batches

dataset = Dataset.from_dataframe(splitted_df.iloc[:25_000])

In [73]:
from news_classifier.models import DNN_Classifier

# training params
batch_size = 32
epochs = 5

# model dimensions
input_dim = len(dataset.vectorizer.headlines_vocab)
nb_categories = len(dataset.vectorizer.labels_vocab)

# create model
model = DNN_Classifier(input_dim=input_dim, nb_categories=nb_categories)

# loss function
criterion = torch.nn.CrossEntropyLoss()

# optimizer
learning_rate = 0.001
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    
    losses = []
    for batch_gen in generate_batches(dataset, batch_size=batch_size):
        x_in, y_true = batch_gen['x'], batch_gen['y']

        # clear gradients
        optimizer.zero_grad()

        # x_in should be  [batch_size, nb_features] 
        # y_pred should be [batch_size, out_features]
        y_pred = model(x_in)

        loss = criterion(y_pred, y_true)
        loss_batch = loss.item()
        losses.append(loss_batch)

        loss.backward()
        
        optimizer.step()
        
    avg_loss = sum(losses)/len(losses)
    print(f'Completed Epoch {epoch} with average loss of {avg_loss:.2f}')

Completed Epoch 0 with average loss of 1.04
Completed Epoch 1 with average loss of 0.85
Completed Epoch 2 with average loss of 0.80
Completed Epoch 3 with average loss of 0.78
Completed Epoch 4 with average loss of 0.78


# Inference

In [105]:
# headline
headline = 'the team won a grant'
vectorized = dataset.vectorizer.vectorize_headline(headline)

# embed in a batch
infer_batch = torch.tensor(vectorized).unsqueeze(0)
prediction = model(infer_batch)

# pick most liklely index
index = torch.argmax(prediction).item()
dataset.vectorizer.labels_vocab.lookup_index(index)

'Sci/Tech'

In [104]:
df.category.unique()

array(['Business', 'Sci/Tech', 'Sports', 'World'], dtype=object)