In [83]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string

In [84]:
dataset1= pd.read_excel('./Dataset-1.xlsx')
print(dataset1.isnull().sum())
dataset1.head()


ID                      0
TITLE                   0
ABSTRACT                0
Computer Science        0
Physics                 0
Mathematics             0
Statistics              0
Quantitative Biology    0
Quantitative Finance    0
dtype: int64


Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [85]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [86]:
stopWords = set(stopwords.words('english'))

In [87]:
def preprocess(text):
    text=text.lower()
    text = re.sub(r'\d+', '', text) #removing digits/numbers
    text = re.sub(r'\s+', ' ', text).strip() #removing extra space
    text = text.translate(str.maketrans('', '', string.punctuation))  #removing punctuation
    words = word_tokenize(text)
    words = [word for word in words if word not in stopWords] #removing the stop words
    return " ".join(words)

In [88]:
dataset1["text"] = dataset1["TITLE"].astype(str) + " " + dataset1["ABSTRACT"].astype(str)
# dataset1.head()
dataset1["processedText"]=dataset1["text"].apply(preprocess)
print(dataset1[["text", "processedText"]].head())



                                                text  \
0  Reconstructing Subject-Specific Effect Maps   ...   
1  Rotation Invariance Neural Network   Rotation ...   
2  Spherical polyharmonics and Poisson kernels fo...   
3  A finite element approximation for the stochas...   
4  Comparative study of Discrete Wavelet Transfor...   

                                       processedText  
0  reconstructing subjectspecific effect maps pre...  
1  rotation invariance neural network rotation in...  
2  spherical polyharmonics poisson kernels polyha...  
3  finite element approximation stochastic maxwel...  
4  comparative study discrete wavelet transforms ...  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [90]:
# dataset1.head()

In [91]:
XText=dataset1['processedText']
y = dataset1.iloc[:, 3:-2]
XTrain_text, XTest_text, yTrain, yTest = train_test_split(XText, y, test_size=0.2, random_state=30)

In [92]:
tfidf = TfidfVectorizer(max_features=5000)
XTrain = tfidf.fit_transform(XTrain_text)
XTest=tfidf.transform(XTest_text)

In [93]:
model1_randomForest = OneVsRestClassifier(RandomForestClassifier(n_estimators=100))
model1_randomForest.fit(XTrain, yTrain)


In [None]:
yPred = model1_randomForest.predict(XTest)
print(yPred)
print(classification_report(yTest, yPred))



[[1 0 0 0 0 0]
 [0 1 0 0 0 0]
 [0 1 0 0 0 0]
 ...
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 1 0 0 0 0]]


In [95]:
def predictTopic(text,model):
    categories = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]

    processed_text = preprocess(text)  
    text_tfidf = tfidf.transform([processed_text])
    
    prediction = model.predict(text_tfidf)

    predicted_labels = [categories[i] for i in range(len(categories)) if prediction[0, i] == 1]

    return predicted_labels


In [96]:
new_text = "We introduce and develop the notion of spherical polyharmonics, which are a natural generalisation of spherical harmonics. In particular we study the theory of zonal polyharmonics, which allows us, analogously to zonal harmonics, to construct Poisson kernels for polyharmonic functions on the union of rotated balls. We find the representation of Poisson kernels and zonal polyharmonics in terms of the Gegenbauer polynomials."
predicted_label=predictTopic(new_text,model1_randomForest)
print("Predicted Label:", predicted_label)



Predicted Label: ['Mathematics']
