In [None]:
# Upgrade pip and install ktrain
!pip -qq install -U pip
!pip -qq install ktrain

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m653.6/653.6 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m44.6 MB/s[0m eta [36m0

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import random
import os
import re
import ktrain
from ktrain import text
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')


In [None]:

# Set seed
# One seed shared by the RNG helpers below and by the cross-validation shuffle.
SEED = 3031

def set_seeds(seed=SEED):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer passed to Python's `random`, TensorFlow and NumPy, and
            exported as a string through the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seed for each library, applied in turn.
    for apply_seed in (random.seed, tf.random.set_seed, np.random.seed):
        apply_seed(seed)

def set_global_determinism(seed=SEED):
    """Seed all RNGs and configure TensorFlow for repeatable runs.

    Args:
        seed: Integer forwarded to `set_seeds`.
    """
    set_seeds(seed=seed)

    # Environment flags intended to request deterministic TensorFlow / cuDNN ops.
    os.environ.update({
        'TF_DETERMINISTIC_OPS': '1',
        'TF_CUDNN_DETERMINISTIC': '1',
    })

    # Restrict both TensorFlow thread pools to a single thread.
    threading_config = tf.config.threading
    threading_config.set_inter_op_parallelism_threads(1)
    threading_config.set_intra_op_parallelism_threads(1)

# Apply the seeds and determinism settings before any data is split or model is built.
set_global_determinism(seed=SEED)

In [None]:
# Load data
# Directory holding the competition CSV files; change it here when running
# outside an environment where the files live under /content.
DATA_DIR = '/content'

train = pd.read_csv(os.path.join(DATA_DIR, 'Train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'Test.csv'))
sample = pd.read_csv(os.path.join(DATA_DIR, 'SampleSubmission.csv'))

In [None]:
# Show the final five rows of the test frame
test.tail(n=5)

Unnamed: 0,swahili_id,content
1025,53332d2e0363433b554581a07239a648c5bccaff,"MKURUGENZI wa Jiji la Arusha, Dk Maulid Maden..."
1026,9362cab7b1f02ae1e56b9b62b23f16bddc55b1b5,Hayo yalisemwa na Katibu Mkuu Wizara ya Afrik...
1027,6cd87e2b10f1c509ad970c519843efd4fea963fa,Alitoa kauli hiyo juzi wakati akizungumza na ...
1028,361806eaeeb8e109ee70a5a7682afb4c62db6a6b,SIMBA imetinga raundi ya nne ya 32 bora ya Ko...
1029,b75e1b6f971f0b6c238855dcf6fdc970ef114153,"KOCHA wa Azam FC, Mholanzi Hans van der Pluij..."


In [None]:
# Collapse every run of whitespace (spaces, new lines, tabs) into a single
# space and trim both ends of the text.
# The vectorised `.str` accessor leaves missing values as NaN instead of
# raising, and the raw string avoids the invalid '\s' escape sequence.
for frame in (train, test):
    frame['content'] = (
        frame['content']
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
test.tail()


Unnamed: 0,swahili_id,content
1025,53332d2e0363433b554581a07239a648c5bccaff,"MKURUGENZI wa Jiji la Arusha, Dk Maulid Madeni..."
1026,9362cab7b1f02ae1e56b9b62b23f16bddc55b1b5,Hayo yalisemwa na Katibu Mkuu Wizara ya Afrika...
1027,6cd87e2b10f1c509ad970c519843efd4fea963fa,Alitoa kauli hiyo juzi wakati akizungumza na v...
1028,361806eaeeb8e109ee70a5a7682afb4c62db6a6b,SIMBA imetinga raundi ya nne ya 32 bora ya Kom...
1029,b75e1b6f971f0b6c238855dcf6fdc970ef114153,"KOCHA wa Azam FC, Mholanzi Hans van der Pluijm..."


In [None]:

# Set model parameters
MODEL_NAME = 'xlm-roberta-base'
MAX_LEN = 256
# The recorded run with a batch size of 16 stopped with a ResourceExhaustedError
# during the first epoch, so a smaller batch is used here.
# Lower it further (or reduce MAX_LEN) if the error persists on your hardware.
BATCH_SIZE = 8
FOLDS = 3
LR = 3e-5
EPOCHS = 2

# List of class names
CLASS_NAMES = sorted(train.category.unique().tolist()) # ['afya', 'burudani', 'kimataifa', 'kitaifa', 'michezo', 'uchumi']

# Instantiate transformer with the provided parameters
t = text.Transformer(model_name=MODEL_NAME, maxlen=MAX_LEN, class_names=CLASS_NAMES, batch_size=BATCH_SIZE)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

In [None]:
%%time
# Prepare test data
test_data = np.asarray(test.content)

# Set number of folds to 3
folds = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# List to store predictions and loss-score per fold
oof_preds = []
oof_loss_score = []

for train_index, test_index in folds.split(train.content, train.category):
  X_train, X_test = list(train.loc[train_index, 'content']), list(train.loc[test_index, 'content'])
  y_train, y_test = np.asarray(train.loc[train_index, 'category']), np.asarray(train.loc[test_index, 'category'])

  # Preprocess training and validation data
  train_set = t.preprocess_train(X_train, y_train)
  val_set = t.preprocess_test(X_test, y_test)

  # Instantiate model
  model = t.get_classifier()
  learner = ktrain.get_learner(model, train_data=train_set, val_data=val_set, batch_size=BATCH_SIZE)

  # Train model
  history = learner.fit(LR, n_cycles=EPOCHS, checkpoint_folder='/tmp')
  learner.validate(class_names=t.get_classes())

  # Append score of each fold
  oof_loss_score.append(history.history['val_loss'][-1])

  # Make predictions
  preds = ktrain.get_predictor(learner.model, preproc=t).predict(test_data, return_proba=True)

  # Append preds to oof_preds list
  oof_preds.append(preds)

# Check cv score and prepare submission file
print(f'Mean Loss: {np.mean(oof_loss_score)}')
sub = pd.DataFrame(np.mean(oof_preds, axis=0), columns = t.get_classes())
sub['test_id'] = test.id
sub = sub[sample.columns]
sub.to_csv('Submission.csv', index = False)


preprocessing train...
language: sw
train sequence lengths:
	mean : 314
	95percentile : 693
	99percentile : 1075


Is Multi-Label? False
preprocessing test...
language: sw
test sequence lengths:
	mean : 316
	95percentile : 666
	99percentile : 1103


Epoch 1/2


ResourceExhaustedError: ignored