In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,BatchNormalization
from sklearn.preprocessing import MultiLabelBinarizer


In [47]:
df= pd.read_csv("../dataset-1.csv")

In [48]:
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [49]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
stopWords = stopwords.words('english')

In [51]:
def process(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word not in stopWords]
    return " ".join(words)

In [52]:
df["processedText"] = df["TITLE"] + " " + df["ABSTRACT"]
df["processedText"] = df["processedText"].map(process)

In [53]:
X = df["processedText"]
y = df.iloc[:, 3:-1]
print(X)
# print(y)
y=y.values
print(y)

0        reconstructing subject-specific effect maps pr...
1        rotation invariance neural network rotation in...
2        spherical polyharmonics poisson kernels polyha...
3        finite element approximation stochastic maxwel...
4        comparative study discrete wavelet transforms ...
                               ...                        
20967    contemporary machine learning : guide practiti...
20968    uniform diamond coatings wc-co hard alloy cutt...
20969    analysing soccer games clustering conceptors p...
20970    efficient simulation left-tail sum correlated ...
20971    optional stopping problem bayesians recently ,...
Name: processedText, Length: 20972, dtype: object
[[1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 1 0 0 0]
 ...
 [1 0 0 0 0 0]
 [0 0 1 1 0 0]
 [0 0 1 1 0 0]]


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [None]:
# print(X_train)
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
print(X_train)

11226    relative merits phononics vs. plasmonics : ene...
9388     2d metamaterial auxetic out-of-plane behavior ...
19085    fast matrix inversion determinant computation ...
20450    neuronal models neuronal dynamics image proces...
20722    memory augmented control networks planning pro...
                               ...                        
500      compact design velocity-map imaging energetic ...
12077    life `` matrix '' : human mobility patterns cy...
15277    inverse antiplane problem $ n $ uniformly stre...
4517     floquet analysis kuznetsov -- breathers : path...
5925     numerical study f-model domain-wall boundaries...
Name: processedText, Length: 16777, dtype: object


In [56]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)


In [57]:
max_length = 148
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')
print(X_train_padded)


[[ 629 5181    1 ...  316 9431  159]
 [ 855 4920    1 ...    0    0    0]
 [ 409  120 2222 ...    0    0    0]
 ...
 [ 704    1   16 ...    0    0    0]
 [4650   28    1 ...    0    0    0]
 [ 139   26  109 ...    0    0    0]]


In [73]:
model = Sequential([
    # Word Embeddings
    Embedding(input_dim=max_words, output_dim=128, input_length=max_length),

    # First LSTM Layer with Batch Normalization
    LSTM(128, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),

    # Second LSTM Layer
    LSTM(64, return_sequences=False),
    BatchNormalization(),
    Dropout(0.3),

    # Fully Connected Layers
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    Dropout(0.3),

    # Output Layer for Multi-label Classification
    Dense(y.shape[1], activation='sigmoid')
])


In [74]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [75]:
model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=10, batch_size=32)


Epoch 1/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 175ms/step - accuracy: 0.2881 - loss: 0.5603 - val_accuracy: 0.2112 - val_loss: 0.5021
Epoch 2/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 214ms/step - accuracy: 0.4586 - loss: 0.4156 - val_accuracy: 0.2741 - val_loss: 0.6150
Epoch 3/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 213ms/step - accuracy: 0.7577 - loss: 0.2709 - val_accuracy: 0.5216 - val_loss: 0.4541
Epoch 4/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 222ms/step - accuracy: 0.7681 - loss: 0.2220 - val_accuracy: 0.6255 - val_loss: 0.9932
Epoch 5/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 214ms/step - accuracy: 0.7764 - loss: 0.1868 - val_accuracy: 0.4467 - val_loss: 0.6404
Epoch 6/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 203ms/step - accuracy: 0.7880 - loss: 0.1671 - val_accuracy: 0.3044 - val_loss: 0.4460
Epoch

<keras.src.callbacks.history.History at 0x19e87b63890>

In [None]:
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, jaccard_score

threshold = 0.5  
y_pred_prob = model.predict(X_test_padded)

y_pred = (y_pred_prob > threshold).astype(int)


# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
hamming = hamming_loss(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
jaccard = jaccard_score(y_test, y_pred, average='samples')

# Print results
print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Jaccard Similarity Score: {jaccard:.4f}")


[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step
F1 Score at threshold 0.4: 0.7864
F1 Score at threshold 0.5: 0.7837
F1 Score at threshold 0.6: 0.7795
Subset Accuracy: 0.6155
Hamming Loss: 0.0903
F1 Score (Micro): 0.7795
F1 Score (Macro): 0.5368
Jaccard Similarity Score: 0.7407


In [62]:
import pandas as pd

# Make predictions
y_pred_prob = model.predict(X_test_padded)
print(y_pred_prob)
# Convert probabilities to binary labels using a threshold (e.g., 0.5)
threshold = 0.5
y_pred = (y_pred_prob >= threshold).astype(int)

# Get topic names from DataFrame columns
topic_names = df.columns[3:-1]  # Fix: Use original column names

# Convert predicted labels into human-readable topic names
predicted_topics = []
for pred in y_pred:
    topics = [topic_names[i] for i in range(len(pred)) if pred[i] == 1]
    predicted_topics.append(topics)

# Display example predictions
for i in range(5):
    print(f"Paper {i+1} Predicted Topics: {predicted_topics[i]}")


[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 85ms/step
[[9.9965912e-01 9.6541150e-03 7.2122458e-03 9.1990991e-04 1.6118425e-04
  3.8834518e-05]
 [5.0098682e-04 9.9956191e-01 3.6089562e-04 1.1473326e-03 3.5452086e-04
  3.7887019e-06]
 [1.5766552e-04 9.9978453e-01 1.2740152e-03 4.7753623e-04 7.4151576e-05
  3.0351089e-06]
 ...
 [9.9264395e-01 2.9766096e-02 5.6273453e-03 2.2353631e-02 4.6986556e-03
  5.1594124e-04]
 [9.1548592e-02 9.9566036e-01 1.6158741e-04 2.5023371e-03 2.4060970e-03
  1.0314655e-05]
 [7.1497113e-02 6.5226930e-01 3.2345885e-01 7.0801094e-02 6.8223417e-02
  1.2342438e-01]]
Paper 1 Predicted Topics: ['Computer Science']
Paper 2 Predicted Topics: ['Physics']
Paper 3 Predicted Topics: ['Physics']
Paper 4 Predicted Topics: ['Mathematics']
Paper 5 Predicted Topics: ['Computer Science']


In [63]:
print(X_test_padded)

[[ 147 3419 1422 ...    0    0    0]
 [1740  645 2115 ...    0    0    0]
 [1472  308  527 ...    0    0    0]
 ...
 [ 977   28  724 ...    0    0    0]
 [3499 2454  990 ...    0    0    0]
 [ 571   43  295 ...    0    0    0]]
