# **CNN Model**

# **Data Format:**
# **Columns: Text, Label**
## **Label: tuple of labels - representing the multi-label classification targets**

|  TEXT  |         LABEL        | 
|--------|----------------------|
| TEXT_1 |  (LABEL_1, LABEL_3)  |
| TEXT_2 |  (LABEL_1)           |
| TEXT_3 |  (LABEL_2, LABEL_4)  |

# **GPU Configuration**

In [None]:
import tensorflow as tf
import os

# Restrict this process to a single CUDA device.
# NOTE(review): these variables are set after `import tensorflow`; confirm
# they are still picked up before TensorFlow initialises its GPU devices.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0";
# The returned device list is discarded here; only the last expression of
# the cell (the CUDA build check) is echoed as the cell output.
tf.config.list_physical_devices('GPU')
tf.test.is_built_with_cuda()

True

# **Load Data**

In [None]:
# Google Colab helper for uploading files into the runtime.
# Presumably used to upload the CSV that the next cell reads - confirm.
from google.colab import files
files.upload()

In [None]:
import pandas as pd

# Path of the dataset CSV. It is left empty here and must be filled in
# before this cell can run.
csv_path = ""
data = pd.read_csv(csv_path)
# Preview the first five rows.
data.head(5)

# **Label Analysis**

In [None]:
# Work on a copy so `data` keeps the rows exactly as they were loaded.
df = data.copy()

In [None]:
import plotly.express as px
# Convert every label to its string form before plotting
# (presumably so each distinct label is treated as one category - confirm).
df_test = df.label.apply(lambda x: str(x))
# Histogram of how often each label occurs.
fig = px.histogram(df_test, x="label")
fig.show()

In [None]:
import random

def reduceData(df, n=10000000):
    """Cap every label group at ``n`` rows, sampling at random when it is larger.

    Args:
        df: DataFrame with a ``text`` and a ``label`` column.
        n: Maximum number of rows kept for each distinct label.

    Returns:
        A new DataFrame with the columns ``text`` and ``label``. Rows are
        grouped by label, in the order ``groupby`` yields its groups, and the
        result gets a fresh default index.
    """
    text_column = df["text"]
    selected_texts = []
    selected_labels = []
    for label, row_ids in df.groupby(by="label").groups.items():
        # Copy into a plain list so the groupby mapping itself is never modified.
        row_ids = list(row_ids)
        if len(row_ids) > n:
            # Oversized group: draw n row labels without replacement.
            row_ids = random.sample(row_ids, n)
        for row_id in row_ids:
            selected_labels.append(label)
            selected_texts.append(text_column[row_id])
    return pd.DataFrame(data={"text": selected_texts, "label": selected_labels})

# Keep at most 500 rows per distinct label (randomly sampled when a label has more).
df = reduceData(df, 500)

In [None]:
# Label histogram again, this time on the reduced frame.
df_test = df.label.apply(lambda x: str(x))
fig = px.histogram(df_test, x="label")
fig.show()

# **Text Preprocessing**

In [None]:
import nltk
# Fetch the NLTK resources needed by the preprocessing below:
# 'punkt' (presumably required by word_tokenize - confirm) and the stop-word corpus.
nltk.download('punkt')
nltk.download('stopwords')

import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Translation table that deletes every character of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def removePunctuation(t):
    """Return ``t`` with every character of ``string.punctuation`` removed.

    Args:
        t: The input string.

    Returns:
        A copy of ``t`` without ASCII punctuation; all other characters,
        including whitespace, are kept unchanged.
    """
    # One pass over the string instead of one full-string replace per
    # punctuation character encountered.
    return t.translate(_PUNCTUATION_TABLE)

# Lazily filled cache of NLTK's stop words, shared by every call.
_STOP_WORDS_CACHE = None


def _allStopWords():
    """Load the NLTK stop-word list once and return it as a frozenset."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # NOTE(review): words() is called without a language argument, exactly
        # as before, so the list is not restricted to one language - confirm
        # this is intended.
        _STOP_WORDS_CACHE = frozenset(stopwords.words())
    return _STOP_WORDS_CACHE


def removeStopWordsStemmer(sentence, ps):
    """Tokenize ``sentence``, drop stop words and stem the remaining tokens.

    Args:
        sentence: Text to process.
        ps: Stemmer object exposing ``stem(word)`` (a ``PorterStemmer`` in
            this notebook).

    Returns:
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    # The stop-word list used to be reloaded and scanned linearly for every
    # token; a cached set makes each membership test O(1).
    stop_words = _allStopWords()
    kept = [ps.stem(word) for word in word_tokenize(sentence) if word not in stop_words]
    return " ".join(kept)

ps = PorterStemmer()
# Clean the text column in three element-wise passes, in this order:
# lowercase -> strip punctuation -> remove stop words and stem.
df.text = (
    df.text
    .apply(lambda raw: raw.lower())
    .apply(removePunctuation)
    .apply(lambda cleaned: removeStopWordsStemmer(cleaned, ps))
)

In [None]:
import functools
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def textPreprocessing(texts):
    """Turn raw texts into equally long integer sequences.

    A fresh ``Tokenizer`` is fitted on ``texts``, every text is mapped to its
    sequence of word indices, and all sequences are padded at the front
    (``padding="pre"``) to the length of the longest one.

    Args:
        texts: List of strings to encode.

    Returns:
        A tuple ``(sequences, vocab_size, max_len)``: the padded sequences,
        the number of entries in the tokenizer's word index, and the length
        every sequence was padded to.
    """
    # NOTE(review): the fitted tokenizer is not returned, so new text cannot
    # be encoded with the same vocabulary later - confirm this is acceptable.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    vocab_size = len(tokenizer.word_index)
    # default=0 keeps an empty corpus from raising here, and the local name
    # no longer shadows the builtin max().
    max_len = max((len(sequence) for sequence in sequences), default=0)
    padded = pad_sequences(sequences, maxlen=max_len, padding="pre")
    return padded, vocab_size, max_len

# Encode the cleaned texts; max_ is the common (padded) sequence length.
texts, vocab_size, max_ = textPreprocessing(list(df.text.values))

In [None]:
# Replace each text with its padded integer sequence (one entry per row).
df.text = list(texts)
df.head(5)

# **Labels Preprocessing**

In [None]:
# Keep only the third dot-separated component of each label string and wrap
# it in a 1-tuple.
# NOTE(review): a label with fewer than two dots raises IndexError here -
# confirm every label has at least three components.
df.label = df.label.apply(lambda x: (x.split(".")[2],))
df.head(5)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def labelsPreprocessing(labels):
    """Binarize label collections with a newly fitted ``MultiLabelBinarizer``.

    Args:
        labels: Sequence of label collections (tuples in this notebook).

    Returns:
        A tuple ``(encoded, binarizer)``: the output of ``fit_transform`` and
        the fitted binarizer that produced it.
    """
    binarizer = MultiLabelBinarizer()
    return binarizer.fit_transform(labels), binarizer

# Binarize the label tuples; `encoder` is the fitted MultiLabelBinarizer.
labels, encoder = labelsPreprocessing(list(df.label.values))

In [None]:
# Write the binarized rows back into the frame, one tuple of 0/1 flags per row.
df.label = list(labels)
df.label = df.label.apply(lambda x: tuple(x))
df.head(5)

# **Treating Unbalanced Data**

In [None]:
#from imblearn.under_sampling import NearMiss
#undersample = NearMiss(version=1, n_neighbors=1)
#texts_tr, labels_tr = undersample.fit_resample(texts, labels)
#len(texts_tr), len(labels_tr)

In [None]:
#from imblearn.under_sampling import TomekLinks
#tl = TomekLinks(sampling_strategy='majority')
#texts_tr, labels_tr = tl.fit_resample(texts, labels)
#len(texts_tr), len(labels_tr)

In [None]:
#from imblearn.under_sampling import NeighbourhoodCleaningRule
#undersample = NeighbourhoodCleaningRule(n_neighbors=500, threshold_cleaning=0.5)
#texts_tr, labels_tr = undersample.fit_resample(texts, labels)
#len(texts_tr), len(labels_tr)

In [None]:
#from imblearn.under_sampling import OneSidedSelection
#undersample = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
#texts_tr, labels_tr = undersample.fit_resample(texts, labels)
#len(texts_tr), len(labels_tr)

In [None]:
#from imblearn.under_sampling import EditedNearestNeighbours
#undersample = EditedNearestNeighbours(n_neighbors=4)
#texts_tr, labels_tr = undersample.fit_resample(texts, labels)
#len(texts_tr), len(labels_tr)

In [None]:
#from imblearn.under_sampling import CondensedNearestNeighbour
#undersample = CondensedNearestNeighbour(n_neighbors=1)
#texts_tr, labels_tr = undersample.fit_resample(texts, labels)
#len(texts_tr), len(labels_tr)

# **Training Model**

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from sklearn.metrics import classification_report
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import matthews_corrcoef
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

def print_confusion_matrix(confusion_matrix, axes, class_label, class_names, fontsize=20):
    """Draw one confusion matrix as an annotated heatmap on ``axes``.

    Args:
        confusion_matrix: Square matrix of integer counts.
        axes: Matplotlib axes to draw on.
        class_label: Name of the class, shown in the plot title.
        class_names: Tick names used for both rows (true) and columns (predicted).
        fontsize: Font size of the tick labels.

    Raises:
        ValueError: If the heatmap cannot be drawn with integer formatting;
            the underlying error is chained as the cause.
    """
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)

    # Changes seaborn's global font scale, so later plots are affected too.
    sn.set(font_scale=1.4)

    try:
        heatmap = sn.heatmap(df_cm, annot=True, fmt="d", cbar=False, ax=axes, cmap='Blues')
    except ValueError as err:
        # Keep the original error attached so its details are not lost.
        raise ValueError("Confusion matrix values must be integers.") from err

    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    axes.set_ylabel('True label')
    axes.set_xlabel('Predicted label')
    # The f-string also accepts class labels that are not plain strings.
    axes.set_title(f"Confusion Matrix for the class - {class_label}")

def plotMultiLabelConfusionMatrices(cms, labels):
    """Plot one binary ("N"/"Y") confusion matrix per class on a two-column grid.

    The grid now has as many rows as needed, so every matrix is drawn. It was
    previously fixed at one row of two plots, which silently dropped every
    class after the second.

    Args:
        cms: Sequence of per-class confusion matrices.
        labels: Class names, paired with ``cms`` by position; the shorter of
            the two decides how many plots are drawn.
    """
    pairs = list(zip(cms, labels))
    n_cols = 2
    # Ceiling division, with at least one row so an empty input still works.
    n_rows = max(1, (len(pairs) + n_cols - 1) // n_cols)
    # squeeze=False always returns a 2-D array of axes, whatever the grid shape.
    _, ax = plt.subplots(n_rows, n_cols, figsize=(20, 10 * n_rows), squeeze=False)
    flat_axes = ax.flatten()
    for axes, (conf_matrix, label) in zip(flat_axes, pairs):
        print_confusion_matrix(conf_matrix, axes, label, ["N", "Y"])
    # Hide grid cells that received no matrix (e.g. an odd number of classes).
    for unused_axes in flat_axes[len(pairs):]:
        unused_axes.set_visible(False)

def generateModel(vocab_size, input_length, output_length):
    """Build, compile and summarise the 1-D CNN text classifier.

    Layer stack: embedding (100 dimensions) -> Conv1D (128 filters, kernel
    size 5, ReLU) -> global max pooling -> Dense(10, ReLU) ->
    Dense(output_length, sigmoid). Compiled with Adam and binary
    cross-entropy, tracking accuracy.

    Args:
        vocab_size: Number of words in the tokenizer's word index.
        input_length: Length of every input sequence.
        output_length: Number of output units, one per label.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    embedding_dim = 100
    network = Sequential([
        # vocab_size + 1 rows: presumably one extra for the padding index 0 - confirm.
        layers.Embedding(vocab_size + 1, embedding_dim, input_length=input_length),
        layers.Conv1D(128, 5, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(10, activation='relu'),
        layers.Dense(output_length, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()
    return network
    
def defineLabel(prediction, threshold):
    """Convert per-label scores into a tuple of 0/1 flags.

    A score strictly below ``threshold`` becomes 0; every other score becomes 1.
    """
    return tuple(0 if score < threshold else 1 for score in prediction)

def runModel(texts, labels, vocab_size, input_length, output_length, encoder, epochs, class_names,
             *, threshold=0.5, test_size=0.2, batch_size=10, random_state=3):
    """Train the CNN on a train/test split and report its test-set quality.

    Prints a classification report and plots the per-class confusion
    matrices for the held-out split.

    Args:
        texts: Padded integer sequences, one per sample.
        labels: Binarized label rows, one per sample.
        vocab_size: Size of the tokenizer vocabulary, passed to ``generateModel``.
        input_length: Length of every sequence, passed to ``generateModel``.
        output_length: Number of output labels, passed to ``generateModel``.
        encoder: Not used by this function; kept so existing calls keep working.
        epochs: Number of training epochs.
        class_names: Display names of the classes, used for the report and plot titles.
        threshold: Scores below this value are predicted as 0, all others as 1.
        test_size: Fraction of the data held out for evaluation.
        batch_size: Mini-batch size used for training.
        random_state: Seed for the train/test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, random_state=random_state
    )
    X_train, X_test = np.array(X_train), np.array(X_test)
    y_train, y_test = np.array(y_train), np.array(y_test)

    model = generateModel(vocab_size, input_length, output_length)
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    # Threshold every row of predicted scores into a tuple of 0/1 flags.
    y_pred = [defineLabel(list(scores), threshold) for scores in model.predict(X_test)]

    print(classification_report(y_test, y_pred, target_names=class_names))

    cms = multilabel_confusion_matrix(y_test, y_pred)
    # class_names is passed directly rather than rebinding the `labels` parameter.
    plotMultiLabelConfusionMatrices(cms, class_names)

In [None]:
# Width of the output layer: one unit per column of the binarized labels.
output_size = len(labels[0])
# NOTE(review): hard-coded display names; this assumes exactly six classes in
# the same order as the binarizer's columns - confirm against encoder.classes_.
class_names = ['a', 'b', 'c', 'd', 'e', 'f']
# Train for 2 epochs, then print the report and plot the confusion matrices.
runModel(list(df.text.values), list(df.label.values), vocab_size, max_, output_size, encoder, 2, class_names)