In [2]:
%load_ext autoreload
%autoreload 2

# Introduction

# imports

In [1]:
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import torch
import pandas as pd

from news_classifier.data import assign_rows_to_split
from news_classifier.constants import DATASET, TRAIN, TEST, VALID, LABEL_COL

# notebook wide constants
HOME_DIR = Path('..')
DATA_DIR = HOME_DIR / 'data'

## Process Raw Data

In [3]:
# read data
df = pd.read_csv(DATA_DIR / 'news.csv')
df.head()

Unnamed: 0,category,title
0,Business,Wall St. Bears Claw Back Into the Black (Reuters)
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters)
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...
4,Business,"Oil prices soar to all-time record, posing new..."


In [4]:
# use the assign_rows_to_split to split rows into either train,test or valid while stratifying wrt the categories
splitted_df = assign_rows_to_split(df)

train_rows = splitted_df.query(f'{DATASET}=="{TRAIN}"')
valid_rows = splitted_df.query(f'{DATASET}=="{VALID}"')
test_rows = splitted_df.query(f'{DATASET}=="{TEST}"')

print(f'Train size {train_rows.size:_}, Valid size {valid_rows.size:_}, Test size {test_rows.size:_}')

Train size 252_000, Valid size 54_000, Test size 54_000


In [5]:
train_rows[LABEL_COL].value_counts()

World       21000
Sports      21000
Business    21000
Sci/Tech    21000
Name: category, dtype: int64

In [6]:
valid_rows[LABEL_COL].value_counts()

World       4500
Business    4500
Sci/Tech    4500
Sports      4500
Name: category, dtype: int64

In [7]:
# write splitted file to disk
fp = DATA_DIR / 'news_splitted.csv'
splitted_df.to_csv(fp, index=False)

# Vocabulary Building

In [4]:
from news_classifier.data import Dataset, generate_batches
from news_classifier.utils import save_json

data_path = DATA_DIR / 'news_splitted.csv'
splitted_df = pd.read_csv(data_path)
dataset = Dataset.from_dataframe(splitted_df.iloc[:25_000])

vectorizer = dataset.vectorizer
vserialized_vectorizer = vectorizer.to_serializable()

vectorizer_path = HOME_DIR / 'language/vectorizer.json'
save_json(vserialized_vectorizer, vectorizer_path)

# Model Training

In [15]:
import pandas as pd

from news_classifier.data import Dataset, generate_batches
from news_classifier.models import DNN_Classifier
from news_classifier.training import Trainer

# dataset
data_path = DATA_DIR / 'news_splitted.csv'
splitted_df = pd.read_csv(data_path)
dataset = Dataset.from_dataframe(splitted_df.iloc[:25_000])

# model dimensions
input_dim = len(dataset.vectorizer.headlines_vocab)
nb_categories = len(dataset.vectorizer.labels_vocab)

# create model
model = DNN_Classifier(input_dim=input_dim, nb_categories=nb_categories)
model_dir = HOME_DIR / 'models/DNN_Classifier'

# loss function
loss_func = torch.nn.CrossEntropyLoss()

# optimizer
learning_rate = 0.001
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# device
device = "cuda" if torch.cuda.is_available() else "cpu"

# create trainer object
trainer = Trainer(
    data_loader=generate_batches, 
    optimizer=optimizer, 
    model=model, 
    model_dir=model_dir, 
    loss_func=loss_func, 
    device=device
)

# training params
batch_size = 32
epochs = 5
trainer.run(nb_epochs=epochs, dataset=dataset, batch_size=batch_size, checkpoint=True)

=> Saving checkpoint
Completed Epoch 0 with average loss of 1.04
=> Saving checkpoint
Completed Epoch 1 with average loss of 0.84
=> Saving checkpoint
Completed Epoch 2 with average loss of 0.80
=> Saving checkpoint
Completed Epoch 3 with average loss of 0.78
=> Saving checkpoint
Completed Epoch 4 with average loss of 0.77


# Inference

In [7]:
from news_classifier.utils import load_json, get_latest_model_checkpoint, load_checkpoint
from news_classifier.language import Vectorizer
from news_classifier.models import DNN_Classifier

# headline
headline = 'the team won a grant'

# vectorizer
headline_vocab_path = HOME_DIR / 'language/vectorizer.json'
headlines_vectorizer = load_json(headline_vocab_path)
vectorizer = Vectorizer.from_serializable(headlines_vectorizer)
vectorized = vectorizer.vectorize_headline(headline)

# embed in a batch
infer_batch = torch.tensor(vectorized).unsqueeze(0)

# select model
input_dim = len(vectorizer.headlines_vocab)
nb_categories = len(vectorizer.labels_vocab)
model = DNN_Classifier(input_dim=input_dim, nb_categories=nb_categories)

# load latest checkpoints
MODEL_DIR = HOME_DIR / 'models/DNN_Classifier'
latest_checkpoint = get_latest_model_checkpoint(MODEL_DIR)
checkpoint_state = torch.load(latest_checkpoint, map_location=torch.device('cpu'))
load_checkpoint(checkpoint_state, model)

# inference
prediction = model(infer_batch)

# pick most liklely index
index = torch.argmax(prediction).item()
label_string = vectorizer.labels_vocab.lookup_index(index)

print(f' Predicted Category: {label_string}')

 Predicted Category: Sports
