In [None]:
# Oriol
# Colab-only setup: mount Google Drive (forcing a remount), then copy the dataset
# archive and the helper module into the current working directory so that later
# cells can open 'archive.zip' and import 'helper_functions'.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
!cp "/content/drive/My Drive/Colab Notebooks/NLP/data/archive.zip" .
!cp "/content/drive/My Drive/Colab Notebooks/NLP/helper_functions.py" .

In [None]:
# Import Transformers
!pip install --upgrade transformers
!pip install simpletransformers

In [None]:
# Import Libraries
import logging
import numpy as np
import pandas as pd
import zipfile
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel

In [None]:
# Read the line-delimited JSON dataset straight out of the zip archive into a DataFrame
with zipfile.ZipFile('archive.zip') as archive, \
        archive.open('News_Category_Dataset_v2.json') as json_file:
    data = pd.read_json(json_file, lines=True)

In [None]:
# Run the project's preprocessing helper on the raw frame. Only the first element of
# the returned pair is kept; the second is discarded.
# NOTE(review): later cells read df['label'] and test_df['text'], so preprocess
# presumably produces those columns — confirm against helper_functions.py.
from helper_functions import preprocess
df,_ = preprocess(data)
df.head()

In [None]:
from helper_functions import largest_classes

# NOTE(review): presumably restricts df to its 2 most frequent classes (the model
# below is built with num_labels=2) — confirm against helper_functions.py.
# The second returned value is discarded.
df,_ = largest_classes(df, 2)

In [None]:
# Replace the label column with integer codes; sort=True assigns the codes
# following the sorted order of the distinct label values.
df['label'] = df['label'].factorize(sort=True)[0]
df.head()

In [None]:
# Split train and test
# Stratified split with fixed sizes: 20,000 training rows and 2,000 test rows,
# stratified on df['label'].
# random_state pins the shuffle so the split (and every metric computed on it) is the
# same on each fresh run; 42 mirrors the manual_seed given to the model arguments.
train_df, test_df = train_test_split(
    df,
    train_size=20000,
    test_size=2000,
    stratify=df['label'],
    random_state=42,
)

print(train_df.shape)
print(test_df.shape)

In [None]:
# Model arguments: seed, output-dir overwrite, input reprocessing, multiprocessing
# and the number of training epochs are all set in a single constructor call.
model_args = ClassificationArgs(
    manual_seed=42,
    overwrite_output_dir=True,
    reprocess_input_data=True,
    use_multiprocessing=True,
    num_train_epochs=2,
)

# Model: 'bert-base-cased' with two labels, CUDA requested explicitly
model = ClassificationModel(
    model_type='bert',
    model_name='bert-base-cased',
    use_cuda=True,
    num_labels=2,
    args=model_args,
)

In [None]:
# Train the model
# NOTE(review): the frame carries a 'label' column (set in the factorize step) —
# confirm that train_model picks up the intended text/label columns from train_df.
model.train_model(train_df)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

def f1_multiclass(labels, preds):
    """Return the macro-averaged F1 score of `preds` measured against `labels`."""
    macro_f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
    return macro_f1

def recall_multiclass(labels, preds):
    """Return the macro-averaged recall of `preds` measured against `labels`."""
    macro_recall = recall_score(y_true=labels, y_pred=preds, average='macro')
    return macro_recall

# Evaluate on the held-out split. The extra keyword arguments hand eval_model the
# metric callables under the names 'f1', 'acc' and 'rec'; only the first element of
# the returned triple is kept, the other two are discarded.
result, _, _ = model.eval_model(test_df, f1=f1_multiclass, acc=accuracy_score, rec=recall_multiclass)

In [None]:
# Show the evaluation result returned by eval_model
print(result)

In [None]:
# Plain Python list of the test-set texts, in test_df row order
test_l = test_df['text'].tolist()

In [None]:
# Run the trained model on the raw test texts; the second returned value is discarded
preds,_ = model.predict(test_l)

In [None]:
# Display the predictions as the cell output
preds