In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df= pd.read_csv("./dataset-1.csv")

In [3]:
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
stopWords = stopwords.words('english')

In [6]:
def process(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word not in stopWords]
    return " ".join(words)

In [7]:
X= df[['TITLE','ABSTRACT']]
X.head()


Unnamed: 0,TITLE,ABSTRACT
0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...
1,Rotation Invariance Neural Network,Rotation invariance and translation invarian...
2,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...
3,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...
4,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...


In [8]:
X=X.map(process)
y=df.iloc[:, 3:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [9]:
X.head()

Unnamed: 0,TITLE,ABSTRACT
0,reconstructing subject-specific effect maps,predictive models allow subject-specific infer...
1,rotation invariance neural network,rotation invariance translation invariance gre...
2,spherical polyharmonics poisson kernels polyha...,introduce develop notion spherical polyharmoni...
3,finite element approximation stochastic maxwel...,stochastic landau -- lifshitz -- gilbert ( llg...
4,comparative study discrete wavelet transforms ...,fourier-transform infra-red ( ftir ) spectra s...


### Using Tf-IDF and combining it with Random Forest Classifier along with OneVsRestClassifier for multi label calssification

In [10]:
titletf = TfidfVectorizer(max_features=1000)
abstracttf= TfidfVectorizer(max_features=1000)
title_tfidf = titletf.fit_transform(X_train["TITLE"])
abstract_tfidf = abstracttf.fit_transform(X_train["ABSTRACT"])
X_train_tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))

In [34]:
rf = OneVsRestClassifier(RandomForestClassifier(
    n_estimators=100,   
    max_depth=10,       
    min_samples_split=5,  
    min_samples_leaf=3,  
    max_features="sqrt", 
    class_weight="balanced",  
    n_jobs=-1,
    random_state=42,
    verbose=1
))



In [35]:
rf.fit(X_train_tfidf, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.9s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[

In [36]:
def predict_df(df,model):
    df=df.map(process)  
    title_tfidf = titletf.transform(df["TITLE"])
    abstract_tfidf = abstracttf.transform(df["ABSTRACT"])
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    return prediction

In [37]:
def predict(title,abstract,model):
    categories = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]
    title=process(title)
    abstract=process(abstract)
    title_tfidf = titletf.transform([title])  
    abstract_tfidf = abstracttf.transform([abstract])  
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    predicted_labels = [categories[i] for i in range(len(categories)) if prediction[0, i] == 1]
    return predicted_labels

In [38]:
y_train_pred= predict_df(X_train,rf)
y_pred=predict_df(X_test,rf)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_job

In [39]:
from sklearn.metrics import f1_score,accuracy_score
f1 = f1_score(y_test, y_pred, average='samples')
f1_train = f1_score(y_train, y_train_pred, average='samples')
print("F1 Score:", f1 , f1_train)

F1 Score: 0.7352244735796583 0.7794222248713517


In [40]:
subset_acc = accuracy_score(y_test, y_pred)
subset_acc_train = accuracy_score(y_train, y_train_pred)
print("Subset Accuracy:", subset_acc ,subset_acc_train)

Subset Accuracy: 0.5187127532777116 0.5726291947308816


In [20]:
title = "Deep Learning for Image Classification"
abstract="This paper explores deep learning techniques applied to image recognition tasks, focusing on advancements in convolutional neural networks (CNNs), transfer learning, and self-supervised learning. We analyze the impact of model architectures such as ResNet, Vision Transformers (ViTs), and EfficientNet in improving classification accuracy across large-scale datasets like ImageNet. Additionally, we discuss the role of data augmentation, hyperparameter tuning, and computational efficiency in optimizing performance. The paper also reviews recent breakthroughs in multimodal learning, integrating textual and visual information to enhance recognition capabilities. Finally, we highlight key challenges, such as interpretability, adversarial robustness, and real-time deployment in edge devices"
predicted_label=predict(title,abstract,rf)
print("Predicted Label:", predicted_label)

(1, 6)
Predicted Label: ['Computer Science', 'Statistics']


### Using TF-IDF with Neural Network for Classification

In [70]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping

In [117]:
early_stopping = EarlyStopping(
    monitor="val_loss",  
    patience=5,           
    restore_best_weights=True  
)
model = Sequential([
    Input(shape=(X_train_tfidf.shape[1],)),
    Dense(256, activation="relu"),
    Dropout(0.8),
    Dense(512, activation="relu"),  
    Dropout(0.8),
    Dense(y.shape[1], activation="sigmoid") 
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.fit(X_train_tfidf, y_train, epochs=50, batch_size=32, validation_split=0.2,callbacks=[early_stopping])

Epoch 1/50
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 13ms/step - accuracy: 0.4837 - loss: 0.4381 - val_accuracy: 0.7682 - val_loss: 0.2142
Epoch 2/50
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.7431 - loss: 0.2337 - val_accuracy: 0.7452 - val_loss: 0.2011
Epoch 3/50
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.7582 - loss: 0.2064 - val_accuracy: 0.7318 - val_loss: 0.1974
Epoch 4/50
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.7658 - loss: 0.1905 - val_accuracy: 0.7470 - val_loss: 0.1963
Epoch 5/50
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.7661 - loss: 0.1850 - val_accuracy: 0.7491 - val_loss: 0.1979
Epoch 6/50
[1m420/420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.7744 - loss: 0.1763 - val_accuracy: 0.7384 - val_loss: 0.2006
Epoch 7/50
[1m420/420[

<keras.src.callbacks.history.History at 0x229557e2f90>

In [118]:
def predict_df(df,model):
    df=df.map(process)  
    title_tfidf = titletf.transform(df["TITLE"])
    abstract_tfidf = abstracttf.transform(df["ABSTRACT"])
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    prediction = (prediction > 0.5).astype(int)
    return prediction
def predict(title,abstract,model):
    categories = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]
    title=process(title)
    abstract=process(abstract)
    title_tfidf = titletf.transform([title])  
    abstract_tfidf = abstracttf.transform([abstract])  
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    prediction = (prediction > 0.5).astype(int)
    predicted_labels = [categories[i] for i in range(len(categories)) if prediction[0, i] == 1]
    return predicted_labels

In [119]:
y_train_pred= predict_df(X_train,model)
y_pred=predict_df(X_test,model)

[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step  
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [120]:
f1 = f1_score(y_test, y_pred, average='samples')
f1_train = f1_score(y_train, y_train_pred, average='samples')
print("F1 Score:", f1 , f1_train)

F1 Score: 0.7987604290822407 0.8517255766823627


In [121]:
subset_acc = accuracy_score(y_test, y_pred)
subset_acc_train = accuracy_score(y_train, y_train_pred)
print("Subset Accuracy:", subset_acc ,subset_acc_train)

Subset Accuracy: 0.6483909415971395 0.7205698277403588


In [122]:
title = "Deep Learning for Image Classification"
abstract="This paper explores deep learning techniques applied to image recognition tasks, focusing on advancements in convolutional neural networks (CNNs), transfer learning, and self-supervised learning. We analyze the impact of model architectures such as ResNet, Vision Transformers (ViTs), and EfficientNet in improving classification accuracy across large-scale datasets like ImageNet. Additionally, we discuss the role of data augmentation, hyperparameter tuning, and computational efficiency in optimizing performance. The paper also reviews recent breakthroughs in multimodal learning, integrating textual and visual information to enhance recognition capabilities. Finally, we highlight key challenges, such as interpretability, adversarial robustness, and real-time deployment in edge devices"
predicted_label=predict(title,abstract,model)
print("Predicted Label:", predicted_label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Predicted Label: ['Computer Science', 'Statistics']
