In [188]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [189]:
import json
import pandas as pd
# Opening JSON file
f = open('/home/ferdinand_t/data/CMS_2010_to_June_2022_ENGLISH.json')
# returns JSON object as
# a dictionary
data = json.load(f)
df = pd.DataFrame.from_dict(data)

In [190]:
df['cleanFocusCategory'] = df['thematicFocusCategory'].apply(lambda x: x['name'] if x is not None else x)
df = df[['keywordStrings', 'cleanFocusCategory']]
df_clean = df.copy()
df_clean = df_clean.dropna()

In [285]:
df_clean

Unnamed: 0,keywordStrings,cleanFocusCategory
8762,"[Africalink, Top Story, Africa on the Move, Ma...",History
31542,"[Commerzbank, job cuts, administration, retail...",Business
31543,"[Moody's, Turkey, ratings agency, junk status]",Business
31544,"[Syria, Aleppo, war crimes, water, UNICEF]",Conflicts
31545,"[Conflict Zone, Talk, link]",Politics
...,...,...
175654,"[Turkey, Recep Tayipp Erdogan, Finland Sweden,...",Politics
175655,"[pollution, gold mine, Turkey, Anagold, cyanide]",Nature and Environment
175656,"[war, Ukraine, Russia, Russian attack, soldier...",Conflicts
175657,"[France, vegetarian, steak, sausage]",Business


In [286]:
df_clean_sentence = df_clean.copy()
df_clean_sentence['keywordStrings'] = [' '.join(map(str, l)) for l in df_clean['keywordStrings']]
df_clean_sentence['cleanFocusCategory'] = df_clean['cleanFocusCategory']

In [287]:
df_clean_sentence

Unnamed: 0,keywordStrings,cleanFocusCategory
8762,Africalink Top Story Africa on the Move Making...,History
31542,Commerzbank job cuts administration retail ban...,Business
31543,Moody's Turkey ratings agency junk status,Business
31544,Syria Aleppo war crimes water UNICEF,Conflicts
31545,Conflict Zone Talk link,Politics
...,...,...
175654,Turkey Recep Tayipp Erdogan Finland Sweden NAT...,Politics
175655,pollution gold mine Turkey Anagold cyanide,Nature and Environment
175656,war Ukraine Russia Russian attack soldiers con...,Conflicts
175657,France vegetarian steak sausage,Business


In [261]:
def preprocess_function(examples):
    return tokenizer(examples["keywordStrings"], truncation=True, padding=True)

In [262]:
# dict_clean = df_clean.to_dict()
# tokenized_dict_clean = dict_clean.map(preprocess_function, batched=True)

In [263]:
# s = df_clean.columns.to_series()
# new = s.groupby(s).cumcount().astype(str).radd('_').replace('_0','')
# df_clean.columns += new
# print (df_clean)

In [289]:
from datasets import Dataset
label2id  = {"Architecture":0, "Arts":1, "Business":2, "Cars and Transportation":3, "Catastrophe":4, "Climate":5, "Conflicts":6, "Corruption":7, "Crime":8, "Culture":9, "Dance":10, "Design":11, "Digital World":12, "Diversity":13, "Education":14, "Equality":15, "Film":16, "Food Security":17, "Freedom of Speech":18, "Globalization":19, "Health":20, "History":21, "Human Rights":22, "Innovation":23, "Law and Justice":24, "Learning German":25, "Lifestyle":26, "Literature":27, "Media":28, "Migration":29, "Music":30, "Nature and Environment":31, "Offbeat":32, "Politics":33, "Press Freedom":34, "Religion":35, "Rule of Law":36, "Science":37, "Soccer":38, "Society":39, "Sports":40, "Technology":41, "Terrorism":42, "Theater":43, "Trade":44, "Travel":45}
df_clean_sentence["cleanFocusCategory"] = [label2id[label] for label in df_clean_sentence["cleanFocusCategory"]]




In [282]:
label2id["Architecture"]

0

In [290]:
df_clean_sentence

Unnamed: 0,keywordStrings,cleanFocusCategory
8762,Africalink Top Story Africa on the Move Making...,21
31542,Commerzbank job cuts administration retail ban...,2
31543,Moody's Turkey ratings agency junk status,2
31544,Syria Aleppo war crimes water UNICEF,6
31545,Conflict Zone Talk link,33
...,...,...
175654,Turkey Recep Tayipp Erdogan Finland Sweden NAT...,33
175655,pollution gold mine Turkey Anagold cyanide,31
175656,war Ukraine Russia Russian attack soldiers con...,6
175657,France vegetarian steak sausage,2


In [291]:
dataset_clean = Dataset.from_pandas(df_clean_sentence).train_test_split(test_size=0.33)

train_dataset = dataset_clean["train"]
val_dataset = dataset_clean["test"]
#dataset_clean =  Dataset.from_pandas(df_clean)
tokenized_dict_clean = dataset_clean.map(preprocess_function, batched=True)

Map:   0%|          | 0/53541 [00:00<?, ? examples/s]

Map:   0%|          | 0/26372 [00:00<?, ? examples/s]

In [307]:
train_dataset[0]

{'keywordStrings': 'Beethovenfest Ludwig van Beethoven National Youth Orchestra of Germany Youth Orchestra of Ukraine Oksana Lyniv classical music festival',
 'cleanFocusCategory': 30,
 '__index_level_0__': '140764'}

In [293]:
tokenized_dict_clean

DatasetDict({
    train: Dataset({
        features: ['keywordStrings', 'cleanFocusCategory', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 53541
    })
    test: Dataset({
        features: ['keywordStrings', 'cleanFocusCategory', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 26372
    })
})

In [294]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [295]:
#data_collator

In [296]:
import evaluate

accuracy = evaluate.load("accuracy")

In [297]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [298]:
id2label = {0: "Architecture", 1: "Arts", 2: "Business", 3: "Cars and Transportation", 4: "Catastrophe", 5:"Climate", 6: "Conflicts", 7:"Corruption", 8:"Crime", 9:"Culture", 10:"Dance", 11:"Design", 12:"Digital World", 13:"Diversity", 14:"Education", 15:"Equality", 16:"Film", 17:"Food Security", 18:"Freedom of Speech", 19:"Globalization", 20:"Health", 21:"History", 22:"Human Rights", 23:"Innovation", 24:"Law and Justice", 25:"Learning German", 26:"Lifestyle", 27:"Literature", 28:"Media", 29:"Migration", 30:"Music", 31:"Nature and Environment", 32:"Offbeat", 33:"Politics", 34:"Press Freedom", 35:"Religion", 36:"Rule of Law", 37:"Science", 38:"Soccer", 39:"Society", 40:"Sports", 41:"Technology", 42:"Terrorism", 43:"Theater", 44:"Trade", 45:"Travel"}
label2id  = {"Architecture":0, "Arts":1, "Business":2, "Cars and Transportation":3, "Catastrophe":4, "Climate":5, "Conflicts":6, "Corruption":7, "Crime":8, "Culture":9, "Dance":10, "Design":11, "Digital World":12, "Diversity":13, "Education":14, "Equality":15, "Film":16, "Food Security":17, "Freedom of Speech":18, "Globalization":19, "Health":20, "History":21, "Human Rights":22, "Innovation":23, "Law and Justice":24, "Learning German":25, "Lifestyle":26, "Literature":27, "Media":28, "Migration":29, "Music":30, "Nature and Environment":31, "Offbeat":32, "Politics":33, "Press Freedom":34, "Religion":35, "Rule of Law":36, "Science":37, "Soccer":38, "Society":39, "Sports":40, "Technology":41, "Terrorism":42, "Theater":43, "Trade":44, "Travel":45}

In [299]:
from transformers import create_optimizer
import tensorflow as tf
from sklearn.model_selection import train_test_split

#df_train, df_test = train_test_split(dataset_clean, random_state=0, test_size=0.33)
batch_size = 16
num_epochs = 5
batches_per_epoch = len(train_dataset) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [300]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=46, id2label=id2label, label2id=label2id
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_259', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use 

In [301]:
#dataset = tf.data.Dataset.from_tensor_slices((tf.ragged.constant(tokenized_dict_clean['keywordStrings']), tokenized_dict_clean['cleanFocusCategory']))
#tf_train_set, tf_validation_set = train_test_split(dataset, random_state=0, test_size=0.33)
#tf_train_set, tf_validation_set = df_train, df_test
#tf_train_set = df_train
#tf_validation_set = df_test




In [302]:
tokenized_dict_clean["train"]

Dataset({
    features: ['keywordStrings', 'cleanFocusCategory', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 53541
})

In [303]:
samples = tokenized_dict_clean["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["keywordStrings"]}


In [304]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'cleanFocusCategory': TensorShape([8]),
 '__index_level_0__': TensorShape([8]),
 'input_ids': TensorShape([8, 46]),
 'attention_mask': TensorShape([8, 46])}

In [319]:
tf_train_dataset = tokenized_dict_clean["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["cleanFocusCategory"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_dict_clean["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["cleanFocusCategory"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

In [320]:
# tf_train_set = model.prepare_tf_dataset(
#     tokenized_dict_clean["train"],
#     shuffle=True,
#     batch_size=16,
#     collate_fn=data_collator,
# )

# tf_validation_set = model.prepare_tf_dataset(
#     tokenized_dict_clean["test"],
#     shuffle=False,
#     batch_size=16,
#     collate_fn=data_collator,
# )

In [321]:
# #dataset = tf.data.Dataset.range(10)
# take = int(len(dataset)/2)

# tf_validation_set = dataset.take(take)
# #print('test:', list(tf_validation_set.as_numpy_iterator()))
# tf_train_set = dataset.skip(take)
# #print('valid:', list(dataset.as_numpy_iterator()))

In [322]:
len(dataset)

79913

In [323]:
len(tf_validation_set)

1649

In [324]:
len(tf_train_set)

3346

In [325]:
import tensorflow as tf

model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [327]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_dataset)

In [328]:
tf_validation_set

<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}>

In [329]:
callbacks = metric_callback

In [330]:
model.fit(x=tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=callbacks)

Epoch 1/3
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


TypeError: in user code:

    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1316, in run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py", line 2895, in call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py", line 3696, in _call_for_each_replica
        return fn(*args, **kwargs)
    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/transformers/modeling_tf_utils.py", line 1557, in train_step
        else:
    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 526, in minimize
        grads_and_vars = self.compute_gradients(loss, var_list, tape)
    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 259, in compute_gradients
        grads = tape.gradient(loss, var_list)
    File "/home/ferdinand_t/venv/lib64/python3.7/site-packages/tensorflow/python/eager/backprop.py", line 1070, in gradient
        raise TypeError("Argument `target` should be a list or nested structure"

    TypeError: Argument `target` should be a list or nested structure of Tensors, Variables or CompositeTensors to be differentiated, but received None.


In [None]:
tf.debugging.disable_traceback_filtering()