In [1]:
import pandas as pd
import numpy as np

# Load the dataset from a path relative to the notebook's working directory.
# Later cells read its 'TEXT' column and use the columns from position 3 onward as targets.
df = pd.read_csv('data.csv')

In [2]:
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
def text_lowercase(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # A translation table that maps a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)
    
def remove_numbers(text):
    """Return ``text`` with every run of digits removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def remove_bracket(text):
    """Return ``text`` with every ``[...]`` span removed.

    A span starts at ``[`` and ends at the next ``]``; the brackets themselves
    are removed together with whatever lies between them.
    """
    # Raw string: in a normal string literal ``\[`` and ``\]`` are invalid
    # escape sequences, which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)
    
def remove_special_characters(text):
    """Return ``text`` reduced to ASCII letters, digits and single spaces.

    Hyphens become spaces (so hyphenated words split rather than fuse), every
    other character that is not an ASCII letter, a digit or whitespace is
    dropped, and all whitespace runs are collapsed to one space with leading
    and trailing whitespace removed.
    """
    text = text.replace('-', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # ``\s+`` also matches newlines, so no separate newline pass is needed.
    return re.sub(r'\s+', ' ', text).strip()

def remove_placeholders(text):
    """Return ``text`` with every ``[**...**]`` placeholder removed.

    Matching is non-greedy, so each placeholder ends at the first ``**]`` that
    follows its opening ``[**``.
    """
    placeholder = re.compile(r"\[\*\*.*?\*\*\]")
    return placeholder.sub("", text)


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``word_tokenize``; tokens found in NLTK's English
    stop-word list are dropped and the remaining tokens are re-joined with
    single spaces. Matching is case-sensitive, exactly as the tokens appear.
    """
    # This function is applied once per document, so build the stop-word set
    # only on the first call and keep it on the function object afterwards.
    stop_words = getattr(remove_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words("english"))
        remove_stopwords._stop_words = stop_words

    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

In [4]:
def clean_text(text):
    """Run the complete text-normalisation pipeline on ``text``.

    The steps run in a fixed order, each receiving the previous step's output:
    lowercase, strip ``[**...**]`` placeholders, strip remaining ``[...]``
    spans, strip digits, strip special characters, strip punctuation, and
    finally drop stop words.
    """
    pipeline = (
        text_lowercase,
        remove_placeholders,
        remove_bracket,
        remove_numbers,
        remove_special_characters,
        remove_punctuation,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text

In [5]:
# Clean every document with the clean_text pipeline.
# NOTE(review): this overwrites the raw 'TEXT' column in place, so the original text is only
# recoverable by re-running the cell that loads the CSV.
# NOTE(review): clean_text starts by calling .lower(), so a non-string entry (e.g. NaN) would
# raise here — assumes the column has no missing values; confirm.
df['TEXT'] = df['TEXT'].apply(clean_text)

In [6]:
def multi_label_accuracy(y_true, y_pred):
    """
    Calculate multi-label accuracy (the mean per-sample Jaccard index).

    For each sample the score is ``|true ∩ pred| / |true ∪ pred|``, where the
    sets contain the positions of the non-zero labels. A sample with no true
    and no predicted labels scores 1.0 (perfect agreement on "nothing").

    Args:
        y_true (pd.DataFrame, np.ndarray or array-like): True binary labels,
            shaped (n_samples, n_labels). A 1-D input is treated as a single
            label column.
        y_pred (pd.DataFrame, np.ndarray or array-like): Predicted binary
            labels with the same shape as ``y_true``.

    Returns:
        float: Multi-label accuracy in [0, 1]. Returns 0.0 when there are no
        samples, instead of dividing by zero.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # np.asarray accepts DataFrames, Series, arrays and nested lists alike;
    # any non-zero entry counts as "label present".
    true_mask = np.asarray(y_true).astype(bool)
    pred_mask = np.asarray(y_pred).astype(bool)

    if true_mask.shape != pred_mask.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{true_mask.shape} and {pred_mask.shape}"
        )

    n_samples = true_mask.shape[0]
    if n_samples == 0:
        return 0.0

    if true_mask.ndim == 1:
        true_mask = true_mask[:, np.newaxis]
        pred_mask = pred_mask[:, np.newaxis]

    intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
    union = np.logical_or(true_mask, pred_mask).sum(axis=1)

    # Pre-fill with 1.0 so samples with an empty union keep the perfect score;
    # only rows with a non-empty union are overwritten by the division.
    scores = np.ones(n_samples, dtype=float)
    np.divide(intersection, union, out=scores, where=union > 0)

    return float(scores.mean())

In [7]:
# Features: the cleaned text column.
X = df['TEXT']
# Targets: every column from position 3 onward.
# NOTE(review): presumably these are the binary label columns — verify the offset against the CSV schema.
y = df.iloc[:, 3:]

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

# Drop the shuffled original row labels so each split is indexed 0..n-1.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Report the split sizes as a quick sanity check.
print("Number of Observations in the Data set:" , X.values.shape[0]) 
print("Number of Observations in Train set:", X_train.shape[0])
print("Number of Observations in Test set:" , X_test.shape[0])   

Number of Observations in the Data set: 48335
Number of Observations in Train set: 33834
Number of Observations in Test set: 14501


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vocabulary capped at 20000 terms.
vectorizer = TfidfVectorizer(max_features=20000)

# Fit on the training text only, then apply that fitted vectorizer to the test text,
# so nothing about the test set influences the learned features.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [10]:
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression

# Base estimator: logistic regression with a raised iteration cap and a fixed seed.
lr = LogisticRegression(max_iter=1000, random_state=42)
# Wrap the base estimator in a classifier chain over the label columns of y_train.
# NOTE(review): no `order` is passed, so random_state=11 presumably has no effect here —
# confirm against the scikit-learn ClassifierChain documentation.
chain = ClassifierChain(lr, random_state=11) 
chain.fit(X_train_vectorized, y_train)

In [11]:
# Predict labels for the held-out test documents with the fitted chain.
y_pred = chain.predict(X_test_vectorized)

In [12]:
from sklearn.metrics import hamming_loss, f1_score, accuracy_score, jaccard_score

# Jaccard index of the predictions, micro- and macro-averaged.
lr_jaccard_micro = jaccard_score(y_test, y_pred, average="micro")
lr_jaccard_macro = jaccard_score(y_test, y_pred, average="macro")

# Hamming loss, plus accuracy_score. Per the scikit-learn docs, accuracy_score on
# multilabel targets is subset accuracy: a sample counts only if every label matches.
lr_hamming_loss = hamming_loss(y_test, y_pred)
lr_accuracy = accuracy_score(y_test, y_pred)

# F1, micro- and macro-averaged.
lr_f1_micro = f1_score(y_test, y_pred, average='micro')
lr_f1_macro = f1_score(y_test, y_pred, average='macro')

print("Jaccard Score (micro):", lr_jaccard_micro)
print("\nJaccard Score (macro):", lr_jaccard_macro)
print("\nAccuracy:", lr_accuracy)
print("\nHamming Loss:", lr_hamming_loss)
print("\nF1 (micro):", lr_f1_micro)
print("\nF1 (macro):", lr_f1_macro)

Jaccard Score (micro): 0.46771669087268575

Jaccard Score (macro): 0.44351173209112604

Accuracy: 0.13219777946348527

Hammilg Loss: 0.1301599889662782

F1 (micro): 0.6373392001076003

F1 (macro): 0.5896081454855897


In [13]:
# Score the predictions with the multi_label_accuracy helper defined earlier in the notebook
# (mean per-sample intersection-over-union of the label sets) and print it to 4 decimals.
ml_accuracy = multi_label_accuracy(y_test, y_pred)
print(f'Multi Label Accuracy: {ml_accuracy:.4f}')  

Multi Label Accuracy: 0.4782
