# DistilBERT Model

In [None]:
import tensorflow as tf
import os

# Order CUDA devices by PCI bus id and expose only device "0" to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Only the last expression of a cell is echoed, so the device list below is
# queried but not displayed; the CUDA build flag is what the cell shows.
tf.config.list_physical_devices('GPU')
tf.test.is_built_with_cuda()

# Load Data

In [None]:
import pandas as pd

# Location of the input CSV. It is left empty here and has to be filled in
# before the cell can run.
csv_path = ""

data = pd.read_csv(filepath_or_buffer=csv_path)
data.head()

In [None]:
# Rename the two columns of the loaded frame to the names used by the rest of
# the notebook.
data.columns = ["text", "label"]
data.head()

# Translate Sentence To English - Optional

In [None]:
#!pip install deep_translator

In [None]:
from deep_translator import GoogleTranslator
# One shared translator instance: source language auto-detected, target English.
translator = GoogleTranslator(source='auto', target='en')

def translateSplittedTopics(sentence, translator):
  """Return `sentence` as translated by the supplied translator object."""
  translated = translator.translate(sentence)
  return translated

# Translate the text column row by row with the shared translator.
data.text = data.text.apply(translateSplittedTopics, args=(translator,))

In [None]:
# Make every label a tuple: values that are already tuples are kept, anything
# else is wrapped in a 1-tuple.
data.label = data.label.apply(lambda value: value if type(value) is tuple else (value,))
data.head()

In [None]:
# Work on a copy so later steps that reassign or modify `df` leave `data` untouched.
df = data.copy()

In [None]:
import numpy as np
# Number of rows per distinct label value.
df["label"].value_counts()

# Label Analysis

In [None]:
#!pip install pyyaml==5.4.1

In [None]:
import plotly.express as px
# Render each label as its string form so the histogram gets one bar per label.
df_test = df.label.map(str)
fig = px.histogram(df_test, x="label")
fig.show()

# Reduce Data

In [None]:
import random

def reduce_data(df, n=10000000, seed=None):
    """Cap the number of rows kept for each distinct label.

    Rows are grouped by the ``label`` column. Groups with more than ``n`` rows
    are randomly down-sampled to exactly ``n`` rows; smaller groups are kept
    whole.

    Args:
        df: DataFrame with a ``text`` and a ``label`` column.
        n: Maximum number of rows to keep per label.
        seed: Optional seed for the down-sampling. With ``None`` (the default)
            the module-level ``random`` generator is used, as before, so
            results vary unless the caller seeds ``random`` itself.

    Returns:
        A new DataFrame with columns ``text`` and ``label`` and a fresh index.
    """
    # A private generator keeps a seeded call from disturbing global random state.
    rng = random.Random(seed) if seed is not None else random

    selected_texts = []
    selected_labels = []
    for label, row_ids in df.groupby(by="label").groups.items():
        # Copy to a plain list so the groupby mapping itself is never modified.
        row_ids = list(row_ids)
        if len(row_ids) > n:
            row_ids = rng.sample(row_ids, n)
        for row_id in row_ids:
            selected_labels.append(label)
            # Group members are index labels, so look the text up by label.
            selected_texts.append(df.at[row_id, "text"])
    return pd.DataFrame(data={"text": selected_texts, "label": selected_labels})

# Seed the standard-library generator used for the down-sampling so that a
# Restart & Run All keeps the same rows (and therefore the same results).
random.seed(42)

# Keep at most this many rows per label.
MAX_ROWS_PER_LABEL = 2000
df = reduce_data(df, MAX_ROWS_PER_LABEL)
df.label.value_counts()

In [None]:
# Same histogram as before, now over the down-sampled frame.
df_test = df.label.map(str)
fig = px.histogram(df_test, x="label")
fig.show()

# Text Preprocessing

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

def removePunctuation(t):
    """Return `t` with every ASCII punctuation character removed.

    Args:
        t: Input string.

    Returns:
        A copy of `t` without any character from ``string.punctuation``.
    """
    # One pass over the string instead of one `replace` per punctuation hit.
    return t.translate(_PUNCTUATION_TABLE)

# Filled on first use so the stop-word corpus is read once per kernel session
# rather than once per token.
_STOPWORD_CACHE = {}

def _all_stopwords():
    """Return the result of `stopwords.words()` as a cached frozenset."""
    if "words" not in _STOPWORD_CACHE:
        # NOTE(review): called without a language, so NLTK is expected to return
        # the stop words of every language in the corpus. If only English is
        # wanted, pass 'english' here — that changes which tokens are dropped.
        _STOPWORD_CACHE["words"] = frozenset(stopwords.words())
    return _STOPWORD_CACHE["words"]

def removeStopWordsStemmer(sentence, ps):
    """Tokenize `sentence`, drop stop words and stem the remaining tokens.

    Args:
        sentence: Text to process.
        ps: Stemmer object exposing ``stem(word)`` (a PorterStemmer in this notebook).

    Returns:
        The stemmed, stop-word-free tokens joined with single spaces.
    """
    stop_words = _all_stopwords()
    kept_tokens = (token for token in word_tokenize(sentence) if token not in stop_words)
    return " ".join(ps.stem(token) for token in kept_tokens)

ps = PorterStemmer()

# Same three passes as a single chain: lower-case, strip punctuation, then
# remove stop words and stem.
df.text = (
    df.text
    .apply(str.lower)
    .apply(removePunctuation)
    .apply(removeStopWordsStemmer, args=(ps,))
)

In [None]:
# Collect every distinct class found in any label tuple, in sorted order.
# Sorting matters: MultiLabelBinarizer orders its output columns by sorted
# class, and the same list is later used both as ktrain's class names and to
# re-encode predictions, so the two orders must agree. Looking at every element
# (not just label[0]) also avoids duplicates and missing classes when a row
# carries more than one label.
class_names = sorted({name for label in df.label for name in label})

class_names

# Labels Preprocessing

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

def labelsPreprocessing(labels):
    """Binarize a collection of label tuples into a multi-hot indicator matrix.

    Args:
        labels: Iterable of label collections, one per sample.

    Returns:
        A pair ``(encoded, mlb)``: the array returned by ``fit_transform`` and
        the fitted ``MultiLabelBinarizer`` that produced it.
    """
    mlb = MultiLabelBinarizer()
    # Encode the argument that was passed in, not the global `df`.
    encoded = mlb.fit_transform(labels)
    return encoded, mlb

# Binarize the label column; `encoder` is kept so class order can be recovered later.
labels, encoder = labelsPreprocessing(df.label.tolist())

In [None]:
# Replace each row's label with its binarized row, stored as a tuple.
# Note: this overwrites the original labels, so the cell is not safe to re-run
# after the binarizer step without reloading `df`.
df.label = [tuple(row) for row in labels]
df.sample(5)

# Training Model

In [None]:
!pip install ktrain

In [None]:
import ktrain
from ktrain import text
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split (and every metric computed from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

x_train, x_test, y_train, y_test = train_test_split(
    df['text'], df['label'],
    shuffle=True, test_size=0.2, random_state=SPLIT_SEED,
)
# Convert the split Series to plain Python lists before handing them to ktrain.
x_train, y_train = list(x_train), list(y_train)
x_test, y_test = list(x_test), list(y_test)

# Build ktrain's training/validation datasets and the matching preprocessor from
# the in-memory split. `class_names` must be in the same order as the columns of
# the binarized labels in y_train / y_test. maxlen=200 is passed as the sequence
# length limit — presumably counted in tokens; confirm against the ktrain docs.
trn, val, preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
                                          x_test=x_test, y_test=y_test,
                                          class_names = class_names,
                                          preprocess_mode='distilbert',
                                          maxlen=200)

# Identifiers noted by the author:
# distilbert, distilbert-base-cased, distilbert-base-multilingual-cased
MODEL_NAME = 'distilbert'
model = text.text_classifier(MODEL_NAME, train_data=trn, preproc=preproc, multilabel=True)

# Optional layer freezing, left disabled:
#model.distilbert.trainable = False
#model.pre_classifier.trainable = False
#model.classifier.trainable = False

# The model is deliberately NOT recompiled here. The previous override used
# loss='categorical_crossentropy', which contradicts multilabel=True above.
# NOTE(review): ktrain is expected to return the classifier already compiled
# with a loss matching the `multilabel` flag — confirm for the installed ktrain
# version before relying on it.
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=3)

# Hyper-parameters of the one-cycle schedule.
MAX_LEARNING_RATE = 1e-5
EPOCHS = 10

model.summary()
learner.fit_onecycle(MAX_LEARNING_RATE, EPOCHS)

# Wrap the trained model with its preprocessor so raw strings can be scored.
predictor = ktrain.get_predictor(learner.model, preproc=preproc)
y_pred = predictor.predict(x_test)
y_pred[:2]

In [None]:
def transformPredictions(pred, class_names, threshold=0.5):
  """Convert predictor output into multi-hot tuples ordered like `class_names`.

  Each element of `pred` may be:
    * a single label string (matched exactly, not as a substring),
    * an iterable of labels, or
    * an iterable of ``(label, score)`` pairs, where a label counts as
      predicted when ``score >= threshold``.

  NOTE(review): ktrain multilabel predictors appear to return
  ``(label, probability)`` pairs — confirm against the installed version.

  Args:
    pred: Sequence of predictions, one per sample.
    class_names: Class names defining the order of the output positions.
    threshold: Minimum score for a ``(label, score)`` pair to count as predicted.

  Returns:
    A list with one tuple of 0/1 flags per prediction.
  """
  rows = []
  for p in pred:
    if isinstance(p, str):
      # A bare string is one label; `c in p` would test substrings instead.
      predicted = {p}
    else:
      predicted = set()
      for item in p:
        is_scored_pair = (
          isinstance(item, (tuple, list))
          and len(item) == 2
          and not isinstance(item[1], str)
        )
        if is_scored_pair:
          label, score = item
          if score >= threshold:
            predicted.add(label)
        else:
          predicted.add(item)
    rows.append(tuple(1 if c in predicted else 0 for c in class_names))
  return rows
        
# Re-encode the predictions in the same multi-hot layout as y_test, then show
# the first two rows of each side by side.
y_pred_t = transformPredictions(y_pred, class_names)
(y_pred_t[:2], y_test[:2])

In [None]:
from sklearn.metrics import hamming_loss, accuracy_score, multilabel_confusion_matrix, f1_score, plot_confusion_matrix, precision_score, recall_score
import numpy as np

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857

    For every sample the score is |true ∩ pred| / |true ∪ pred| over the sets of
    active (non-zero) labels; a sample where both sets are empty scores 1. The
    function returns the mean of the per-sample scores.

    Args:
        y_true: 2-D array-like (samples x labels) of ground-truth indicators.
        y_pred: 2-D array-like of predicted indicators with the same shape.
        normalize: Accepted for signature compatibility; currently unused.
        sample_weight: Accepted for signature compatibility; currently unused.

    Returns:
        The mean per-sample score, or NaN when there are no samples.
    '''
    # Accept lists/tuples as well as arrays; any non-zero entry counts as active.
    true_mask = np.asarray(y_true).astype(bool)
    pred_mask = np.asarray(y_pred).astype(bool)
    if true_mask.shape[0] == 0:
        return float("nan")

    overlap = np.logical_and(true_mask, pred_mask).sum(axis=1)
    combined = np.logical_or(true_mask, pred_mask).sum(axis=1)

    # Samples with no active label on either side are perfect matches.
    scores = np.ones(combined.shape[0], dtype=float)
    has_labels = combined > 0
    scores[has_labels] = overlap[has_labels] / combined[has_labels]
    return scores.mean()

def falsePositive(y_true, y_pred):
    """Return the fraction of samples that contain at least one false positive.

    A sample counts when some position has ``y_pred == 1`` while ``y_true == 0``.

    Args:
        y_true: Sequence of ground-truth indicator rows.
        y_pred: Sequence of predicted indicator rows, aligned with `y_true`.

    Returns:
        Samples with a false positive divided by ``len(y_pred)``; 0.0 when
        `y_pred` is empty (previously a ZeroDivisionError).
    """
    total = len(y_pred)
    if total == 0:
        return 0.0

    flagged = 0
    for i in range(total):
        # any() stops at the first offending label, like the original `break`.
        if any(p == 1 and t == 0 for t, p in zip(y_true[i], y_pred[i])):
            flagged += 1
    return flagged / float(total)

# NOTE(review): this cell's import line also pulls `plot_confusion_matrix`,
# which nothing here uses and which scikit-learn removed in 1.2 — drop it from
# the import if it fails on a newer scikit-learn.

# Each entry pairs the printed caption with a zero-argument callable, so the
# metrics are still computed and printed one at a time in the original order.
metric_report = [
    ("Accuracy Score:", lambda: accuracy_score(y_test, y_pred_t)),
    ("Precision Micro:", lambda: precision_score(y_test, y_pred_t, average='micro')),
    ("Precision Macro:", lambda: precision_score(y_test, y_pred_t, average='macro')),
    ("Recall Micro:", lambda: recall_score(y_test, y_pred_t, average='micro')),
    ("Recall Macro:", lambda: recall_score(y_test, y_pred_t, average='macro')),
    ("Hamming Score:", lambda: hamming_score(np.array(y_test), y_pred_t)),
    ("Hamming Loss:", lambda: hamming_loss(y_test, y_pred_t)),
    ("F1-Score Micro:", lambda: f1_score(y_test, y_pred_t, average='micro')),
    ("F1-Score Macro:", lambda: f1_score(y_test, y_pred_t, average='macro')),
    ("False Positives:", lambda: falsePositive(y_test, y_pred_t)),
]
for caption, compute in metric_report:
    print(caption, compute())

# One 2x2 confusion matrix per label column, consumed by the plotting cell.
cms = multilabel_confusion_matrix(y_test, y_pred_t)

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

def print_confusion_matrix(confusion_matrix, axes, class_label, class_names, fontsize=20):
    """Draw one confusion matrix as an annotated heatmap on the given axes.

    Args:
        confusion_matrix: Square matrix of counts.
        axes: Matplotlib axes to draw on.
        class_label: Name of the class, appended to the plot title.
        class_names: Tick labels used for both rows and columns.
        fontsize: Font size of the tick labels.

    Raises:
        ValueError: If seaborn raises ValueError while drawing the heatmap.
    """
    matrix_frame = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)

    # Global seaborn setting: also affects figures drawn after this call.
    sns.set(font_scale=1.4)

    try:
        heatmap = sns.heatmap(matrix_frame, annot=True, fmt="d", cbar=False, ax=axes, cmap='Blues')
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")

    # y tick labels stay horizontal, x tick labels are tilted by 45 degrees.
    for axis, angle in ((heatmap.yaxis, 0), (heatmap.xaxis, 45)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right', fontsize=fontsize)

    axes.set_ylabel('True label')
    axes.set_xlabel('Predicted label')
    axes.set_title("Confusion Matrix for the class - " + class_label)

def plotMultiLabelConfusionMatrices(cms, labels):
    """Plot one confusion-matrix heatmap per label, side by side.

    Args:
        cms: Sequence of per-label confusion matrices.
        labels: Class names, in the same order as `cms`.

    The number of panels follows the input rather than being fixed at two;
    with two labels the figure is the same 22x10 layout as before. Nothing is
    drawn when either input is empty.
    """
    n_panels = min(len(cms), len(labels))
    if n_panels == 0:
        return

    # squeeze=False keeps `ax` two-dimensional even for a single panel, so
    # flatten() works for any panel count.
    fig, ax = plt.subplots(1, n_panels, figsize=(11 * n_panels, 10), squeeze=False)
    for axes, conf_matrix, label in zip(ax.flatten(), cms, labels):
        print_confusion_matrix(conf_matrix, axes, label, ["N", "Y"])
        
# Take the panel titles from the fitted binarizer instead of hard-coding them:
# `encoder.classes_` lists the classes in the same (sorted) order as the label
# columns the confusion matrices were computed from.
# Note: this rebinds `labels`, which earlier held the binarized label matrix.
labels = [str(name) for name in encoder.classes_]
plotMultiLabelConfusionMatrices(cms, labels)

# Save Model

In [None]:
# Save Model
# Uncomment the next line to write the trained predictor to ./distilbert.

#predictor.save("./distilbert")

# Load Model
# The snippet below sits inside a string literal, so it is not executed; as the
# last expression of the cell the string itself is echoed as the cell output.
# Copy it into a code cell to reload the saved predictor and run a prediction.

'''
import ktrain
from ktrain import text
path = "./distilbert"
loaded_predictor = ktrain.load_predictor(path)
loaded_predictor.predict(['we can close the chat'])
'''