In [1]:
# This notebook runs on Kaggle. The steps to run it are listed below.
# 1. I load the dataset, consisting of the train set and the test set, into the Kaggle input folder and make the folder public in order to use the dataset.
# 2. I turn on the GPU accelerator and enable internet access on the Kaggle platform.
# 3. After the environment setup, I commit my kernel by clicking "Save and Run (Commit)" under the "Save Version" button.
# 4. After the basic setup, I run the code, which is briefly explained next to each step.
# 5. After running this code, the score shown on the public leaderboard was 0.283.

In [None]:
# load basic libraries 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Viewing the path to the dataset

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    input_paths = [os.path.join(folder, name) for name in names]
    for input_path in input_paths:
        print(input_path)



In [None]:
# update pip if not yet updated

!pip install -U pip

In [None]:
# install ktrain which is the wrapper for tensorflow keras that makes deep learning and Ai more accessible 

!pip install ktrain

In [None]:
# ktrain installs TensorFlow by default; on other platforms, make sure TensorFlow is
# upgraded/downgraded to version 2.1.0, which is the version ktrain is run with here.

import tensorflow as tf

installed_tf_version = tf.__version__
print(installed_tf_version)


In [None]:
# import and use text as ktrain

import ktrain
from ktrain import text

In [None]:
#import important API for text 

import re
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords 

In [None]:
def processText(tweet):
    """Normalize one text sample before it is handed to the model.

    The text is lower-cased, two frequently occurring garbled character
    sequences are stripped out, and the result is tokenized with NLTK's
    ``word_tokenize`` and re-joined with single spaces.

    Parameters
    ----------
    tweet : str
        Raw text of one sample.

    Returns
    -------
    str
        The cleaned text, tokens separated by a single space.
    """
    cleaned = tweet.lower()

    # These sequences look like mis-decoded curly quotes; they occur frequently
    # in the data and carry no meaning, so drop them.
    for artifact in ('â€˜', 'â€™'):
        cleaned = cleaned.replace(artifact, '')

    # Split into word/punctuation tokens, then rebuild a space-separated string
    tokens = word_tokenize(cleaned)
    return ' '.join(tokens)

In [None]:
#from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold

# Pretrained multilingual BERT checkpoint that is fine-tuned below
MODEL_NAME = 'bert-base-multilingual-uncased'

# Settings for the ktrain Transformer preprocessor
MAX_SEQUENCE_LENGTH = 128
CLASS_NAMES = ['kitaifa', 'michezo', 'biashara', 'kimataifa', 'burudani']

t = text.Transformer(MODEL_NAME, maxlen=MAX_SEQUENCE_LENGTH, class_names=CLASS_NAMES)

In [None]:
# Locations of the competition files inside the Kaggle input folder
train_path = '/kaggle/input/Train.csv'
test_path = '/kaggle/input/Test.csv'

# Training set: normalize the category labels to lowercase
df = pd.read_csv(train_path)
df["category"] = df["category"].str.lower()

# Test set (the data the final predictions are made on): lowercase the text
validation_set = pd.read_csv(test_path)
validation_set["content"] = validation_set["content"].str.lower()





In [None]:
# Run the text-cleaning function over the 'content' column of both the training and the test set

for frame in (df, validation_set):
    frame['content'] = frame['content'].apply(processText)

In [None]:
# Accumulator for the test-set predictions: one row per test sample, one column per class (5 classes)

valid_pred_ro = np.zeros(shape=(validation_set.shape[0], 5))

In [None]:
# From Keras, load EarlyStopping and ModelCheckpoint, which are used to stop training and to keep the best model seen so far according to a monitored metric.

# From scikit-learn, load the class_weight utilities.

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from sklearn.utils import class_weight

In [None]:
# Class weights make errors on under-represented labels cost more than errors on frequent ones.
# 'balanced' weights each class inversely to its frequency in the training labels.

class_labels = np.unique(df['category'])  # sorted unique labels

# `classes` and `y` are passed by keyword: recent scikit-learn versions no longer
# accept them positionally, and keywords also work on older versions.
class_weights = class_weight.compute_class_weight('balanced', classes=class_labels, y=df['category'])

# Map class index (position in the sorted label array) -> weight, the format Keras expects
class_weight_dict = dict(enumerate(class_weights))

In [None]:

# Cross-validated fine-tuning: train one model per fold and average the test-set
# probabilities of every fold whose model is good enough.

# Monitor validation accuracy; stop a fold once it has not improved for 3
# consecutive epochs and restore the best weights seen.
es = EarlyStopping(monitor='val_accuracy', patience=3, verbose=1, restore_best_weights=True)

seed = 42
n_folds = 10             # number of cross-validation folds
min_val_accuracy = 0.8   # a fold is used for inference only if its best validation accuracy reaches this

# Stratified folds keep the class proportions of the full training set.
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn versions raise a ValueError for random_state with shuffle=False.
skf = StratifiedKFold(n_splits=n_folds, random_state=seed, shuffle=True)

# The test texts are identical for every fold, so build the array once.
data = np.asarray(validation_set['content'])

# Start from an empty accumulator so that re-running this cell does not add
# new predictions on top of the results of a previous run.
valid_pred_ro = np.zeros_like(valid_pred_ro)

n = 0  # number of folds that actually contributed predictions (used as the divisor for the average)

for train_index, test_index in skf.split(df['content'], df['category']):

    # split() yields positional indices, so rows are selected with iloc
    x_train = df['content'].iloc[train_index].tolist()
    x_test = df['content'].iloc[test_index].tolist()
    y_train = np.asarray(df['category'].iloc[train_index])
    y_test = np.asarray(df['category'].iloc[test_index])

    trn = t.preprocess_train(x_train, y_train)
    val = t.preprocess_test(x_test, y_test)

    # Fresh classifier for every fold
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)

    history = learner.fit(1e-5, 10, cycle_len=1, cycle_mult=2, class_weight=class_weight_dict, callbacks=[es], checkpoint_folder='/tmp')

    learner.validate(class_names=t.get_classes())

    # Skip folds whose best validation accuracy is below the threshold
    if max(history.history['val_accuracy']) < min_val_accuracy:
        continue

    # The fold is good enough: predict class probabilities for the test set
    predictor = ktrain.get_predictor(learner.model, preproc=t)
    print(predictor.get_classes())
    pred = predictor.predict(data, return_proba=True)

    valid_pred_ro += pred
    n = n + 1

# Average over the contributing folds; fail loudly instead of silently
# producing NaN predictions when every fold was skipped.
if n == 0:
    raise RuntimeError(
        'No fold reached a validation accuracy of %.2f; there are no predictions to average.' % min_val_accuracy
    )

valid_pred_ro /= n

In [None]:
# Build the submission frame.
# Column labels are taken from the preprocessor so that each probability column is
# named after the class the model actually assigned to that position, instead of
# relying on a hard-coded order.
sub_df = pd.DataFrame(valid_pred_ro, columns=list(t.get_classes()))

# Attach the sample ids positionally (row i of the predictions belongs to row i of the test set)
sub_df['swahili_id'] = validation_set['swahili_id'].values

# Reorder into the submission layout; a KeyError here means the class names
# differ from the expected ones.
sub_df = sub_df[['swahili_id','kitaifa','michezo','biashara','kimataifa','burudani']]

In [None]:
sub_df.head(n=5)   # preview the first five rows of the submission

In [None]:
# Write the submission to the Kaggle output folder, without the index column
SUB_FILE_NAME = 'submission_ro.csv'
sub_df.to_csv(path_or_buf=SUB_FILE_NAME, index=False)