# Dataset 1

In [276]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, jaccard_score
import re

In [277]:
df= pd.read_csv("./dataset-1.csv")

In [278]:
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [279]:
df.isna().sum()

ID                      0
TITLE                   0
ABSTRACT                0
Computer Science        0
Physics                 0
Mathematics             0
Statistics              0
Quantitative Biology    0
Quantitative Finance    0
dtype: int64

In [280]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [281]:
stopWords = stopwords.words('english')

In [282]:
def process(text):
    text = re.sub(r'\d+', '', text) 
    words = word_tokenize(text.lower())  
    words = [word for word in words if word not in stopWords]  
    return " ".join(words)

In [283]:
X= df[['TITLE','ABSTRACT']]
X.fillna("")
X.head()


Unnamed: 0,TITLE,ABSTRACT
0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...
1,Rotation Invariance Neural Network,Rotation invariance and translation invarian...
2,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...
3,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...
4,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...


In [284]:
y=df.iloc[:, 3:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
X_train=X_train.map(process)

In [285]:
X.head()

Unnamed: 0,TITLE,ABSTRACT
0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...
1,Rotation Invariance Neural Network,Rotation invariance and translation invarian...
2,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...
3,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...
4,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...


### Using Tf-IDF and combining it with Random Forest Classifier along with OneVsRestClassifier for multi label calssification

In [286]:
titletf = TfidfVectorizer(max_features=1000)
abstracttf= TfidfVectorizer(max_features=1000)
title_tfidf = titletf.fit_transform(X_train["TITLE"])
abstract_tfidf = abstracttf.fit_transform(X_train["ABSTRACT"])
X_train_tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))

In [287]:
rf = OneVsRestClassifier(RandomForestClassifier(
    n_estimators=200,   
    max_depth=14,       
    min_samples_split=5,  
    min_samples_leaf=3,  
    max_features="sqrt", 
    class_weight="balanced",  
    n_jobs=-1,
    random_state=42,
    verbose=1
))

In [288]:
rf.fit(X_train_tfidf, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   10.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   10.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   11.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 1

In [289]:
def predict_df(df,model):
    df=df.map(process)  
    title_tfidf = titletf.transform(df["TITLE"])
    abstract_tfidf = abstracttf.transform(df["ABSTRACT"])
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    return prediction

In [290]:
def predict(title,abstract,model):
    categories = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]
    title=process(title)
    abstract=process(abstract)
    title_tfidf = titletf.transform([title])  
    abstract_tfidf = abstracttf.transform([abstract])  
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    predicted_labels = [categories[i] for i in range(len(categories)) if prediction[0, i] == 1]
    return predicted_labels

In [291]:
y_train_pred= rf.predict(X_train_tfidf)
y_pred=predict_df(X_test,rf)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      |

In [292]:
# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
hamming = hamming_loss(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
jaccard = jaccard_score(y_test, y_pred, average='samples')
train_accuracy= accuracy_score(y_train_pred,y_train)
# Print results
print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Subset Train Accuracy: {train_accuracy:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Jaccard Similarity Score: {jaccard:.4f}")

Subset Accuracy: 0.5426
Subset Train Accuracy: 0.6231
Hamming Loss: 0.1059
F1 Score (Micro): 0.7570
F1 Score (Macro): 0.5545
Jaccard Similarity Score: 0.6953


### Using TF-IDF with Neural Network for Classification

In [294]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping

In [295]:
early_stopping = EarlyStopping(
    monitor="val_loss",  
    patience=5,           
    restore_best_weights=True  
)
model = Sequential([
    Input(shape=(X_train_tfidf.shape[1],)),
    Dense(256, activation="relu"),
    Dropout(0.8),
    Dense(512, activation="relu"),  
    Dropout(0.8),
    Dense(y.shape[1], activation="sigmoid") 
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.fit(X_train_tfidf, y_train, epochs=50, batch_size=32, validation_split=0.3,callbacks=[early_stopping])

Epoch 1/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.4723 - loss: 0.4479 - val_accuracy: 0.7650 - val_loss: 0.2231
Epoch 2/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.7461 - loss: 0.2408 - val_accuracy: 0.7569 - val_loss: 0.2021
Epoch 3/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.7554 - loss: 0.2112 - val_accuracy: 0.7499 - val_loss: 0.1996
Epoch 4/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.7646 - loss: 0.1982 - val_accuracy: 0.7545 - val_loss: 0.2017
Epoch 5/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.7752 - loss: 0.1832 - val_accuracy: 0.7559 - val_loss: 0.2039
Epoch 6/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.7816 - loss: 0.1697 - val_accuracy: 0.7390 - val_loss: 0.2044
Epoch 7/50
[1m367/367

<keras.src.callbacks.history.History at 0x274b61c2ae0>

In [296]:
def predict_df(df,model):
    df=df.map(process)  
    title_tfidf = titletf.transform(df["TITLE"])
    abstract_tfidf = abstracttf.transform(df["ABSTRACT"])
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    prediction = (prediction > 0.5).astype(int)
    return prediction
def predict(title,abstract,model):
    categories = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]
    title=process(title)
    abstract=process(abstract)
    title_tfidf = titletf.transform([title])  
    abstract_tfidf = abstracttf.transform([abstract])  
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    prediction = (prediction > 0.5).astype(int)
    predicted_labels = [categories[i] for i in range(len(categories)) if prediction[0, i] == 1]
    return predicted_labels

In [297]:
y_pred=predict_df(X_test,model)

[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [298]:
# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
hamming = hamming_loss(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
jaccard = jaccard_score(y_test, y_pred, average='samples')

# Print results
print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Jaccard Similarity Score: {jaccard:.4f}")

Subset Accuracy: 0.6451
Hamming Loss: 0.0849
F1 Score (Micro): 0.7941
F1 Score (Macro): 0.5374
Jaccard Similarity Score: 0.7591


### Using LSTM for Classification

In [300]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,BatchNormalization,GlobalAveragePooling1D,Bidirectional
from sklearn.preprocessing import MultiLabelBinarizer

In [301]:
df["processedText"] = df["TITLE"] + " " + df["ABSTRACT"]
df["processedText"] = df["processedText"].map(process)

In [302]:
X = df["processedText"]
y = df.iloc[:, 3:-1]
y=y.values

In [303]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [304]:
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [305]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [306]:
max_length = 148
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

In [307]:
early_stopping = EarlyStopping(
    monitor="val_loss",  
    patience=5,           
    restore_best_weights=True  
)

In [326]:
model = Sequential([
    # Word Embeddings
    Embedding(input_dim=max_words, output_dim=128),

    # First LSTM Layer with Batch Normalization
    BatchNormalization(),
    LSTM(128, return_sequences=True),
    LSTM(64, return_sequences=True),
    GlobalAveragePooling1D(),
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(64, activation="relu"),  
    Dropout(0.5),
    # Output Layer for Multi-label Classification
    Dense(y.shape[1], activation='sigmoid')
])

In [327]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(X_train_padded, y_train, validation_split=0.3, epochs=10, batch_size=32,callbacks=[early_stopping])

Epoch 1/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 238ms/step - accuracy: 0.4825 - loss: 0.4368 - val_accuracy: 0.7414 - val_loss: 0.2369
Epoch 2/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 241ms/step - accuracy: 0.7647 - loss: 0.2330 - val_accuracy: 0.7356 - val_loss: 0.1996
Epoch 3/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 220ms/step - accuracy: 0.7839 - loss: 0.1781 - val_accuracy: 0.7410 - val_loss: 0.2024
Epoch 4/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 264ms/step - accuracy: 0.7960 - loss: 0.1496 - val_accuracy: 0.7187 - val_loss: 0.2099
Epoch 5/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 252ms/step - accuracy: 0.8151 - loss: 0.1253 - val_accuracy: 0.7322 - val_loss: 0.2485
Epoch 6/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 253ms/step - accuracy: 0.8232 - loss: 0.1107 - val_accuracy: 0.7267 - val_loss: 0.2587
Epoch 7/10

In [None]:
threshold = 0.5  
y_pred_prob = model.predict(X_test_padded)

y_pred = (y_pred_prob > threshold).astype(int)


# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
hamming = hamming_loss(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
jaccard = jaccard_score(y_test, y_pred, average='samples')

# Print results
print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Jaccard Similarity Score: {jaccard:.4f}")

### Using GRUs for Classification

In [322]:
from tensorflow.keras.layers import GRU
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    BatchNormalization(),
    GRU(128, return_sequences=True),
    GRU(64, return_sequences=True),
    GlobalAveragePooling1D(),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(64, activation="relu"),  
    Dropout(0.3),
    # Output Layer for Multi-label Classification
    Dense(y.shape[1], activation='sigmoid')
])

In [323]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [324]:
model.fit(X_train_padded, y_train, validation_split=0.3, epochs=10, batch_size=32,callbacks=[early_stopping])

Epoch 1/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 201ms/step - accuracy: 0.5657 - loss: 0.3799 - val_accuracy: 0.7569 - val_loss: 0.2065
Epoch 2/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 186ms/step - accuracy: 0.7801 - loss: 0.1799 - val_accuracy: 0.7499 - val_loss: 0.1909
Epoch 3/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 181ms/step - accuracy: 0.8081 - loss: 0.1362 - val_accuracy: 0.7346 - val_loss: 0.2136
Epoch 4/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 205ms/step - accuracy: 0.8218 - loss: 0.1109 - val_accuracy: 0.7356 - val_loss: 0.2351
Epoch 5/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 241ms/step - accuracy: 0.8378 - loss: 0.0865 - val_accuracy: 0.7171 - val_loss: 0.2739
Epoch 6/10
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 218ms/step - accuracy: 0.8336 - loss: 0.0665 - val_accuracy: 0.7261 - val_loss: 0.3027
Epoch 7/10

<keras.src.callbacks.history.History at 0x274b7124e00>

In [325]:
threshold = 0.5  
y_pred_prob = model.predict(X_test_padded)

y_pred = (y_pred_prob > threshold).astype(int)


# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
hamming = hamming_loss(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
jaccard = jaccard_score(y_test, y_pred, average='samples')

# Print results
print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Jaccard Similarity Score: {jaccard:.4f}")

[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 71ms/step
Subset Accuracy: 0.6551
Hamming Loss: 0.0799
F1 Score (Micro): 0.8055
F1 Score (Macro): 0.5592
Jaccard Similarity Score: 0.7665


# Dataset 2

In [169]:
df2= pd.read_csv("./dataset-2.csv")

In [170]:
df2.head()

Unnamed: 0,Title,Content,Domain
0,A Few Best Men (2011) - IMDb,A Few Best Men 2011 Not Rated 1h 37m IMDb RATI...,Entertainment
1,A Good Day to Die Hard (2013) - IMDb,A Good Day to Die Hard 2013 R 1h 38m IMDb RATI...,Entertainment
2,A Knight's Tale (2001) - IMDb,A Knights Tale 2001 PG13 2h 12m IMDb RATING 70...,Entertainment
3,Abraham Lincoln: Vampire Hunter (2012) - IMDb,Abraham Lincoln Vampire Hunter 2012 R 1h 45m I...,Entertainment
4,After the Sunset (2004) - IMDb,After the Sunset 2004 PG13 1h 37m IMDb RATING ...,Entertainment


In [171]:
df2.isna().sum()

Title      0
Content    0
Domain     0
dtype: int64

In [172]:
X2= df2[['Title','Content']]
X2.fillna("")
X2.head()

Unnamed: 0,Title,Content
0,A Few Best Men (2011) - IMDb,A Few Best Men 2011 Not Rated 1h 37m IMDb RATI...
1,A Good Day to Die Hard (2013) - IMDb,A Good Day to Die Hard 2013 R 1h 38m IMDb RATI...
2,A Knight's Tale (2001) - IMDb,A Knights Tale 2001 PG13 2h 12m IMDb RATING 70...
3,Abraham Lincoln: Vampire Hunter (2012) - IMDb,Abraham Lincoln Vampire Hunter 2012 R 1h 45m I...
4,After the Sunset (2004) - IMDb,After the Sunset 2004 PG13 1h 37m IMDb RATING ...


In [173]:
y2=df2['Domain']
y2.head()


0    Entertainment
1    Entertainment
2    Entertainment
3    Entertainment
4    Entertainment
Name: Domain, dtype: object

In [174]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state=30)
X_train=X_train.map(process)

### Using Tf-IDF and combining it with Random Forest Classifier along with OneVsRestClassifier for multi label calssification

In [175]:
titletf = TfidfVectorizer(max_features=1000)
content= TfidfVectorizer(max_features=1000)
title_tfidf = titletf.fit_transform(X_train["Title"])
content_tfidf = abstracttf.fit_transform(X_train["Content"])
X_train_tfidf = np.column_stack((title_tfidf.toarray(), content_tfidf.toarray()))

In [184]:
rf = RandomForestClassifier(
    n_estimators=200,   
    max_depth=14,       
    min_samples_split=5,  
    min_samples_leaf=3,  
    max_features="sqrt", 
    class_weight="balanced",  
    n_jobs=-1,
    random_state=42,
    verbose=1
)

In [185]:
rf.fit(X_train_tfidf, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.2s finished


In [180]:
def predict_df(df,model):
    df=df.map(process)  
    title_tfidf = titletf.transform(df["Title"])
    abstract_tfidf = abstracttf.transform(df["Content"])
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    return prediction

In [186]:
y_pred=predict_df(X_test,rf)
y_train_pred= rf.predict(X_train_tfidf)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.0s finished


In [188]:
# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
hamming = hamming_loss(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
train_accuracy= accuracy_score(y_train_pred,y_train)
# Print results
print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Subset Train Accuracy: {train_accuracy:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Jaccard Similarity Score: {jaccard:.4f}")

Subset Accuracy: 0.9987
Subset Train Accuracy: 0.9962
Hamming Loss: 0.0013
F1 Score (Micro): 0.9987
F1 Score (Macro): 0.9984
Jaccard Similarity Score: 0.6930


### Using TF-IDF with Neural Network for Classification

In [191]:
print(y.nunique(),y_train.nunique())

5 5


In [198]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
y_train_enc = encoder.fit_transform(y_train.values.reshape(-1, 1))
num_classes= y_train.nunique()
early_stopping = EarlyStopping(
    monitor="val_loss",  
    patience=5,           
    restore_best_weights=True  
)
model = Sequential([
    Input(shape=(X_train_tfidf.shape[1],)),
    Dense(256, activation="relu"),
    Dropout(0.8),
    Dense(512, activation="relu"),  
    Dropout(0.8),
    Dense(num_classes, activation="softmax") 
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.fit(X_train_tfidf, y_train_enc, epochs=50, batch_size=32, validation_split=0.3,callbacks=[early_stopping])

Epoch 1/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.3450 - loss: 1.4616 - val_accuracy: 0.8685 - val_loss: 0.6487
Epoch 2/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.8756 - loss: 0.5022 - val_accuracy: 0.9979 - val_loss: 0.0457
Epoch 3/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9864 - loss: 0.0962 - val_accuracy: 0.9989 - val_loss: 0.0118
Epoch 4/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9938 - loss: 0.0360 - val_accuracy: 0.9989 - val_loss: 0.0077
Epoch 5/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9959 - loss: 0.0222 - val_accuracy: 0.9989 - val_loss: 0.0063
Epoch 6/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9962 - loss: 0.0202 - val_accuracy: 0.9989 - val_loss: 0.0064
Epoch 7/50
[1m69/69[0m [32m━━━━

<keras.src.callbacks.history.History at 0x2747ba57020>

In [219]:
def predict_df(df,model):
    df=df.map(process)  
    title_tfidf = titletf.transform(df["Title"])
    abstract_tfidf = abstracttf.transform(df["Content"])
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    predicted_indices = prediction.argmax(axis=1)
    prediction_one_hot = np.zeros((predicted_indices.shape[0], num_classes)) 
    prediction_one_hot[np.arange(predicted_indices.shape[0]), predicted_indices] = 1
    
    y_pred_labels = encoder.inverse_transform(prediction_one_hot)
    
    return y_pred_labels

In [220]:
y_pred=predict_df(X_test,model)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


In [223]:
accuracy = accuracy_score(y_test, y_pred)
hamming = hamming_loss(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')

# Print results
print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")

Subset Accuracy: 0.9987
Hamming Loss: 0.0013
F1 Score (Micro): 0.9987
F1 Score (Macro): 0.9991


### Using LSTM for Classification

In [241]:
df2["processedText"] = df2["Title"] + " " + df2["Content"]
df2["processedText"] = df2["processedText"].map(process)

In [242]:
X = df2["processedText"]
y=df2['Domain']
y=y

In [243]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [244]:
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [245]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [246]:
max_length = 148
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

In [263]:
num_classes= y.nunique()
early_stopping = EarlyStopping(
    monitor="val_loss",  
    patience=3,           
    restore_best_weights=True  
)
model = Sequential([
    # Word Embeddings
    Embedding(input_dim=max_words, output_dim=128),

    # First LSTM Layer with Batch Normalization
    BatchNormalization(),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalAveragePooling1D(),
    Dense(32, activation="relu"),
    Dropout(0.5),
    Dense(16, activation="relu"),  
    Dropout(0.5),
    # Output Layer for Multi-label Classification
    Dense(num_classes, activation='softmax')
])

In [264]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [265]:
encoder = OneHotEncoder(sparse_output=False)
y_train_enc = encoder.fit_transform(y_train.values.reshape(-1, 1))

In [266]:
model.fit(X_train_padded, y_train_enc, validation_split=0.3, epochs=50, batch_size=32,callbacks=[early_stopping])

Epoch 1/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 123ms/step - accuracy: 0.4056 - loss: 1.3644 - val_accuracy: 0.5027 - val_loss: 1.3043
Epoch 2/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 124ms/step - accuracy: 0.7454 - loss: 0.6510 - val_accuracy: 0.8112 - val_loss: 0.5644
Epoch 3/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 117ms/step - accuracy: 0.8431 - loss: 0.4020 - val_accuracy: 0.8643 - val_loss: 0.4121
Epoch 4/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 128ms/step - accuracy: 0.8412 - loss: 0.3631 - val_accuracy: 0.9629 - val_loss: 0.2072
Epoch 5/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 150ms/step - accuracy: 0.8717 - loss: 0.3063 - val_accuracy: 0.9703 - val_loss: 0.1721
Epoch 6/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 200ms/step - accuracy: 0.8927 - loss: 0.2436 - val_accuracy: 0.8791 - val_loss: 0.3955
Epoch 7/50
[1m69/69[0m 

<keras.src.callbacks.history.History at 0x2749ecda120>

In [267]:
y_pred= model.predict(X_test_padded)
predicted_indices = y_pred.argmax(axis=1)
prediction_one_hot = np.zeros((predicted_indices.shape[0], num_classes)) 
prediction_one_hot[np.arange(predicted_indices.shape[0]), predicted_indices] = 1
y_pred_labels = encoder.inverse_transform(prediction_one_hot)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step  


In [269]:
accuracy = accuracy_score(y_test, y_pred_labels)
hamming = hamming_loss(y_test, y_pred_labels)
f1_micro = f1_score(y_test, y_pred_labels, average='micro')
f1_macro = f1_score(y_test, y_pred_labels, average='macro')

print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")

Subset Accuracy: 0.9911
Hamming Loss: 0.0089
F1 Score (Micro): 0.9911
F1 Score (Macro): 0.9918


### Using GRUs for Classification

In [270]:
from tensorflow.keras.layers import GRU
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    BatchNormalization(),
    Bidirectional(GRU(64, return_sequences=True)),
    GlobalAveragePooling1D(),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(32, activation="relu"),  
    Dropout(0.3),
    # Output Layer for Multi-label Classification
    Dense(num_classes, activation='softmax')
])

In [271]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [272]:
model.fit(X_train_padded, y_train_enc, validation_split=0.3, epochs=50, batch_size=32,callbacks=[early_stopping])

Epoch 1/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 118ms/step - accuracy: 0.6495 - loss: 1.0394 - val_accuracy: 0.8685 - val_loss: 0.7538
Epoch 2/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 111ms/step - accuracy: 0.9771 - loss: 0.0951 - val_accuracy: 0.9883 - val_loss: 0.2489
Epoch 3/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 112ms/step - accuracy: 0.9882 - loss: 0.0379 - val_accuracy: 0.9756 - val_loss: 0.1107
Epoch 4/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 102ms/step - accuracy: 0.9994 - loss: 0.0134 - val_accuracy: 0.9958 - val_loss: 0.0307
Epoch 5/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 122ms/step - accuracy: 0.9980 - loss: 0.0077 - val_accuracy: 0.9958 - val_loss: 0.0198
Epoch 6/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 85ms/step - accuracy: 0.9994 - loss: 0.0059 - val_accuracy: 0.9968 - val_loss: 0.0070
Epoch 7/50
[1m69/69[0m [

<keras.src.callbacks.history.History at 0x274a6880950>

In [273]:
y_pred= model.predict(X_test_padded)
predicted_indices = y_pred.argmax(axis=1)
prediction_one_hot = np.zeros((predicted_indices.shape[0], num_classes)) 
prediction_one_hot[np.arange(predicted_indices.shape[0]), predicted_indices] = 1
y_pred_labels = encoder.inverse_transform(prediction_one_hot)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step 


In [274]:
accuracy = accuracy_score(y_test, y_pred_labels)
hamming = hamming_loss(y_test, y_pred_labels)
f1_micro = f1_score(y_test, y_pred_labels, average='micro')
f1_macro = f1_score(y_test, y_pred_labels, average='macro')

print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")

Subset Accuracy: 0.9936
Hamming Loss: 0.0064
F1 Score (Micro): 0.9936
F1 Score (Macro): 0.9940
