In [163]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
df= pd.read_csv("./dataset-1.csv")

In [165]:
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [166]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [167]:
stopWords = stopwords.words('english')

In [168]:
def process(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word not in stopWords]
    return " ".join(words)

In [169]:
X= df[['TITLE','ABSTRACT']]
X.head()


Unnamed: 0,TITLE,ABSTRACT
0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...
1,Rotation Invariance Neural Network,Rotation invariance and translation invarian...
2,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...
3,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...
4,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...


In [170]:
X=X.map(process)
y=df.iloc[:, 3:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [171]:
X.head()

Unnamed: 0,TITLE,ABSTRACT
0,reconstructing subject-specific effect maps,predictive models allow subject-specific infer...
1,rotation invariance neural network,rotation invariance translation invariance gre...
2,spherical polyharmonics poisson kernels polyha...,introduce develop notion spherical polyharmoni...
3,finite element approximation stochastic maxwel...,stochastic landau -- lifshitz -- gilbert ( llg...
4,comparative study discrete wavelet transforms ...,fourier-transform infra-red ( ftir ) spectra s...


### Using Tf-IDF and combining it with Random Forest Classifier along with OneVsRestClassifier for multi label calssification

In [172]:
titletf = TfidfVectorizer(max_features=1000)
abstracttf= TfidfVectorizer(max_features=1000)
title_tfidf = titletf.fit_transform(X_train["TITLE"])
abstract_tfidf = abstracttf.fit_transform(X_train["ABSTRACT"])
X_train_tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))

In [173]:
rf = OneVsRestClassifier(RandomForestClassifier(
    n_estimators=200,   
    max_depth=10,       
    min_samples_split=5,  
    min_samples_leaf=3,  
    max_features="sqrt", 
    class_weight="balanced",  
    n_jobs=-1,
    random_state=42
))



In [174]:
rf.fit(X_train_tfidf, y_train)

In [175]:
def predict_df(df,model):
    df=df.map(process)  
    title_tfidf = titletf.transform(df["TITLE"])
    abstract_tfidf = abstracttf.transform(df["ABSTRACT"])
    tfidf = np.column_stack((title_tfidf.toarray(), abstract_tfidf.toarray()))
    prediction = model.predict(tfidf)
    return prediction

In [176]:
def predict(title,abstract,model):
    categories = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]
    title=process(title)
    abstract=process(abstract)
    title_tfidf = titletf.transform([title])  
    abstract_tfidf = abstracttf.transform([abstract])  
    tfidf = np.column_stack((title_tfidf, abstract_tfidf))
    prediction = model.predict(tfidf)
    print(prediction.shape)
    predicted_labels = [categories[i] for i in range(len(categories)) if prediction[0, i] == 1]
    return predicted_labels

In [177]:
y_train_pred= predict_df(X_train,rf)
y_pred=predict_df(X_test,rf)

In [178]:
from sklearn.metrics import f1_score,accuracy_score
f1 = f1_score(y_test, y_pred, average='samples')
f1_train = f1_score(y_train, y_train_pred, average='samples')
print("F1 Score:", f1 , f1_train)

F1 Score: 0.736797775129122 0.7823164933852184


In [179]:
subset_acc = accuracy_score(y_test, y_pred)
subset_acc_train = accuracy_score(y_train, y_train_pred)
print("Subset Accuracy:", subset_acc ,subset_acc_train)

Subset Accuracy: 0.5215733015494637 0.5765035465220242


In [180]:
title = "Deep Learning for Image Classification"
abstract="This paper explores deep learning techniques applied to image recognition tasks, focusing on advancements in convolutional neural networks (CNNs), transfer learning, and self-supervised learning. We analyze the impact of model architectures such as ResNet, Vision Transformers (ViTs), and EfficientNet in improving classification accuracy across large-scale datasets like ImageNet. Additionally, we discuss the role of data augmentation, hyperparameter tuning, and computational efficiency in optimizing performance. The paper also reviews recent breakthroughs in multimodal learning, integrating textual and visual information to enhance recognition capabilities. Finally, we highlight key challenges, such as interpretability, adversarial robustness, and real-time deployment in edge devices"
predicted_label=predict(title,abstract,rf)
print("Predicted Label:", predicted_label)

ValueError: setting an array element with a sequence.