In [1]:
import pandas as pd
import numpy as np

# Load the dataset. Later cells read its 'TEXT' column and treat every column
# from position 3 onward as a label column.
data = pd.read_csv('data.csv')

In [2]:
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
def text_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)
    
def remove_numbers(text):
    """Return ``text`` with every run of digits removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def remove_bracket(text):
    """Return ``text`` with every ``[...]`` span (brackets included) removed.

    A span runs from ``[`` to the next ``]`` and may contain newlines.
    """
    # Raw string: in a plain string literal "\[" and "\]" are invalid escape
    # sequences, which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)
    
def remove_special_characters(text):
    """Keep only ASCII letters, digits and single spaces.

    Hyphens become spaces (so hyphenated words split into separate words),
    every other character that is not an ASCII letter, digit or whitespace is
    dropped, and runs of whitespace (newlines and tabs included) collapse to a
    single space. Leading and trailing whitespace is stripped.
    """
    text = text.replace('-', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # `\s+` already covers newlines, so no separate newline pass is needed.
    return re.sub(r'\s+', ' ', text).strip()

def remove_placeholders(text):
    """Return ``text`` with every ``[** ... **]`` placeholder span removed.

    Matching is non-greedy and does not cross line breaks.
    """
    placeholder = re.compile(r"\[\*\*.*?\*\*\]")
    return placeholder.sub("", text)

def remove_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    Args:
        text (str): Input document.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # The stop-word set is identical for every document, so build it once and
    # cache it on the function instead of rebuilding it on every call (this
    # function is applied once per row of the dataset).
    stop_words = getattr(remove_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words("english"))
        remove_stopwords._stop_words = stop_words

    kept_tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(kept_tokens)

In [4]:
def clean_text(text):
    """Run the full cleaning pipeline on one document and return the result.

    Steps, in order: lower-casing, ``[** ... **]`` placeholder removal,
    bracketed-span removal, digit removal, special-character removal,
    punctuation removal, stop-word removal.

    NOTE(review): remove_special_characters already drops every character that
    is not an ASCII letter, digit or whitespace, so the punctuation step that
    follows it looks redundant — confirm before removing.
    """
    pipeline = (
        text_lowercase,
        remove_placeholders,
        remove_bracket,
        remove_numbers,
        remove_special_characters,
        remove_punctuation,
        remove_stopwords,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [5]:
# Clean every document. The 'TEXT' column is overwritten with the output of
# clean_text, so the raw text is no longer available in `data` afterwards.
data['TEXT'] = data['TEXT'].apply(clean_text)

In [6]:
def multi_label_accuracy(y_true, y_pred):
    """
    Calculate multi-label accuracy (mean per-sample Jaccard index).

    For each sample the score is ``|true ∩ pred| / |true ∪ pred|``, where the
    sets contain the indices of the non-zero labels. A sample with no true and
    no predicted labels scores 1.0. The result is the mean over all samples.

    Args:
        y_true (array-like): True binary labels, shape (n_samples, n_labels).
            DataFrames, Series, ndarrays and nested lists are accepted; a 1-D
            input is treated as a single label column.
        y_pred (array-like): Predicted binary labels, same shape as ``y_true``.

    Returns:
        float: Multi-label accuracy in [0, 1].

    Raises:
        ValueError: If the inputs have different shapes or contain no samples.
    """
    # Any non-zero entry counts as "label present", matching np.where semantics.
    true_mask = np.asarray(y_true).astype(bool)
    pred_mask = np.asarray(y_pred).astype(bool)

    if true_mask.shape != pred_mask.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                true_mask.shape, pred_mask.shape
            )
        )
    if true_mask.ndim == 0 or true_mask.shape[0] == 0:
        # The mean over zero samples is undefined; fail with a clear message
        # instead of a ZeroDivisionError.
        raise ValueError("multi_label_accuracy requires at least one sample")

    n_samples = true_mask.shape[0]
    if true_mask.ndim != 2:
        true_mask = true_mask.reshape(n_samples, -1)
        pred_mask = pred_mask.reshape(n_samples, -1)

    intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
    union = np.logical_or(true_mask, pred_mask).sum(axis=1)

    # Rows with an empty union keep the default score of 1.0.
    scores = np.ones(n_samples, dtype=float)
    np.divide(intersection, union, out=scores, where=union > 0)

    return float(scores.mean())

In [7]:
# Features are the cleaned documents; targets are all columns from position 3
# onward.
# NOTE(review): assumes the first three columns of data.csv are non-label
# columns — confirm against the file.
X = data['TEXT']
y = data.iloc[:, 3:]

In [8]:
from sklearn.model_selection import train_test_split

# Shuffled 70/30 hold-out split with a fixed seed. Every part gets a fresh
# 0..n-1 index so features and labels can be addressed by position.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)
)

print("Number of Observations in the Data set:", len(X))
print("Number of Observations in Train set:", len(X_train))
print("Number of Observations in Test set:", len(X_test))

Number of Observations in the Data set: 48335
Number of Observations in Train set: 33834
Number of Observations in Test set: 14501


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vocabulary capped at 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000)

# Fit on the training documents only; the test documents are transformed with
# the already-fitted vectorizer so nothing is learned from the test set.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [10]:
# Densify the TF-IDF matrices for Keras. Cast to float32 while the data is
# still sparse: Keras computes in float32 by default, and a dense float64
# matrix of this size would take twice the memory for no benefit.
X_train_dense = X_train_vectorized.astype(np.float32).toarray()
X_test_dense = X_test_vectorized.astype(np.float32).toarray()

In [11]:
# Sanity check: (number of training documents, number of TF-IDF features).
X_train_dense.shape

(33834, 10000)

## ANN Chain Model

In [12]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from time import time

In [13]:
def create_c1node(X_feed,y_now):
    '''
    Build and compile the feed-forward network used for one chain node.

    Architecture: input -> 512 -> 256 -> 128 -> 64 -> output.
    Hidden layers use ReLU and are each followed by Dropout(0.4); the output
    layer is sigmoid with one unit per column of ``y_now``.
    Loss: binary cross-entropy. Optimizer: Adam. Metric: accuracy.

    Args:
        X_feed: 2-D feature array; only ``X_feed.shape[1]`` (the number of
            input features) is used.
        y_now: 2-D target array/DataFrame; only ``y_now.shape[1]`` (the number
            of output units) is used.

    Returns:
        A compiled, untrained ``Sequential`` model.
    '''
    model = Sequential()
    # Declare the input with an explicit Input object; passing `input_shape=`
    # to the first Dense layer triggers a warning on Keras 3.
    model.add(keras.Input(shape=(X_feed.shape[1],)))

    hidden_layers = ((512, 'first'), (256, 'nretrain1'), (128, 'nretrain2'), (64, 'last'))
    for units, layer_name in hidden_layers:
        model.add(Dense(units, activation='relu', kernel_initializer='glorot_uniform', name=layer_name))
        model.add(Dropout(0.4))

    model.add(Dense(y_now.shape[1], activation='sigmoid', kernel_initializer='glorot_uniform', name='output'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [14]:
# Classifier chain: one network per label column, trained in column order.
chain = []

# Training features. No defensive copy is needed: X_feed is only ever rebound
# to a new, wider array below; X_train_dense itself is never modified.
X_feed = X_train_dense
t1 = time()
for i in range(y_train.shape[1]):
    print("Training chain node ", i)
    # Keep the label as a single-column 2-D frame so its shape is (n, 1).
    y_now = y_train.iloc[:, [i]]
    print("Shapes:\n X = {} \n Y = {}".format(X_feed.shape, y_now.shape))

    node = create_c1node(X_feed, y_now)
    node.fit(X_feed, y_now, epochs=8, batch_size=64)
    print("Training of node {} complete\n\n".format(i))
    chain.append(node)

    # The next node sees the TRUE label of this node as an extra feature.
    # Append it in the feature dtype so the matrix is not silently upcast.
    X_feed = np.hstack([X_feed, y_now.to_numpy(dtype=X_feed.dtype)])

t2 = time()

print("Time taken: ",(t2-t1))

Training chain node  0
Shapes:
 X = (33834, 10000) 
 Y = (33834, 1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 75ms/step - acc: 0.8443 - loss: 0.3945
Epoch 2/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 75ms/step - acc: 0.8820 - loss: 0.2706
Epoch 3/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 73ms/step - acc: 0.9041 - loss: 0.2246
Epoch 4/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 76ms/step - acc: 0.9274 - loss: 0.1768
Epoch 5/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 82ms/step - acc: 0.9532 - loss: 0.1199
Epoch 6/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 73ms/step - acc: 0.9718 - loss: 0.0776
Epoch 7/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 74ms/step - acc: 0.9796 - loss: 0.0577
Epoch 8/8
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 74ms/step - acc: 0.9859 - loss: 0.0402
Training of node 0 complete


Training chain node  1
Shapes:
 X = (33834

In [15]:
# One (n_test, 1) array of 0/1 predictions per chain node, in chain order.
y_pred = []

# No defensive copy is needed: X_feed is only ever rebound to a new, wider
# array below; X_test_dense itself is never modified.
X_feed = X_test_dense

for i, node in enumerate(chain):
    print("Getting op from chain node ",i)
    print("Shapes:\n X = {}".format(X_feed.shape))
    # Turn sigmoid outputs into hard 0/1 labels by rounding.
    output = node.predict(X_feed).round().astype(int)
    y_pred.append(output)
    # At inference the next node receives this node's PREDICTED label as an
    # extra feature (the true labels are not available here).
    X_feed = np.hstack([X_feed, output.astype(X_feed.dtype)])

Getting op from chain node  0
Shapes:
 X = (14501, 10000)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step
Getting op from chain node  1
Shapes:
 X = (14501, 10001)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step
Getting op from chain node  2
Shapes:
 X = (14501, 10002)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step
Getting op from chain node  3
Shapes:
 X = (14501, 10003)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step
Getting op from chain node  4
Shapes:
 X = (14501, 10004)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step
Getting op from chain node  5
Shapes:
 X = (14501, 10005)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step
Getting op from chain node  6
Shapes:
 X = (14501, 10006)
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step
Getting op from chain node  7
Shapes:
 X = (14501, 10007)
[1m45

In [16]:
# Stack the per-node (n_test, 1) prediction columns side by side into one
# (n_test, n_labels) matrix, in chain order. Raises ValueError if `y_pred` is
# empty, i.e. if the chain produced no predictions.
predictions = np.concatenate(y_pred, axis=1)

In [17]:
from sklearn.metrics import hamming_loss, log_loss, f1_score, accuracy_score, jaccard_score, classification_report

# For multilabel targets scikit-learn's accuracy_score is subset accuracy: a
# sample counts as correct only when every one of its labels is predicted
# correctly, so it is much stricter than the per-label metrics below.
print("Accuracy = ", accuracy_score(y_test, predictions))
print("\nJaccard Score (micro)= ", jaccard_score(y_test, predictions, average='micro'))
print("\nJaccard Score (macro)= ", jaccard_score(y_test, predictions, average='macro'))
print("\nHamming loss = ", hamming_loss(y_test, predictions))
# NOTE(review): presumably disabled because `predictions` holds rounded 0/1
# labels rather than probabilities — confirm before re-enabling.
#print("Log loss = ", log_loss(y_test, predictions))
print("\nF1 score (micro) = ", f1_score(y_test, predictions, average='micro'))
print("\nF1 score (macro) = ", f1_score(y_test, predictions, average='macro'))

# Mean per-sample Jaccard index, computed by the notebook's own helper.
ann_cc_ml_accuracy = multi_label_accuracy(y_test, predictions)
print(f'Multi Label Accuracy: {ann_cc_ml_accuracy:.4f}')

Accuracy =  0.1230259982070202

Jaccard Score (micro)=  0.49687713353731006

Jaccard Score (macro)=  0.49505745145788216

Hamming loss =  0.137211226811944

F1 score (micro) =  0.663884994172002

F1 score (macro) =  0.6469201625002781
Multi Label Accuracy: 0.5151
