In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import re
import nltk
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on (a no-op when already cached).
# 'punkt' serves word_tokenize on older NLTK releases; NLTK >= 3.9 loads
# 'punkt_tab' instead, so request both to keep a fresh environment working.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NEERAJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NEERAJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the training articles from the CSV next to the notebook and preview them.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


# Preprocessing

In [3]:
# Load stopwords, common words such as "a," "the," "it," etc.
# Stored as a set so the per-token membership test in clean_text is O(1)
# instead of a linear scan over the whole list.
stop_words = set(stopwords.words('english'))

# Initialize stemmer, which will take words and convert words to their "stem," e.g. Playing -> Play
ps = PorterStemmer()

def clean_text(txt, stem=False):
    """Normalise raw text into a space-separated string of lowercase tokens.

    Steps: lowercase the text, replace every non-alphabetical character with
    a space, tokenise with ``word_tokenize``, drop tokens found in
    ``stop_words`` and, optionally, reduce each remaining token to its
    Porter stem.

    Parameters
    ----------
    txt : str
        Raw text. Any non-string value (e.g. the NaN pandas uses for a
        missing CSV field) is treated as empty text.
    stem : bool, default False
        When True, each kept token is passed through ``ps.stem``. The
        default keeps tokens unstemmed.

    Returns
    -------
    str
        The kept tokens joined by single spaces, with no leading or
        trailing whitespace; ``""`` when nothing survives the filtering.
    """
    # Missing values arrive as float NaN, which has no .lower(); map them to empty text.
    if not isinstance(txt, str):
        return ""

    txt = txt.lower()                      # lowercase
    txt = re.sub("[^a-zA-Z]", " ", txt)    # remove everything except alphabetical characters
    tokens = word_tokenize(txt)            # tokenize (split into list and remove whitespace)

    # remove stopwords
    kept = (w for w in tokens if w not in stop_words)
    if stem:
        kept = (ps.stem(w) for w in kept)

    # A single join avoids rebuilding the string once per token.
    return " ".join(kept)


# Run the cleaner over both free-text columns, overwriting them in place.
for text_column in ('TITLE', 'ABSTRACT'):
    train[text_column] = train[text_column].apply(clean_text)

In [4]:
# Preview the frame again: TITLE and ABSTRACT now hold the output of clean_text.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,reconstructing subject specific effect maps,predictive models allow subject specific infer...,1,0,0,0,0,0
1,2,rotation invariance neural network,rotation invariance translation invariance gre...,1,0,0,0,0,0
2,3,spherical polyharmonics poisson kernels polyha...,introduce develop notion spherical polyharmoni...,0,0,1,0,0,0
3,4,finite element approximation stochastic maxwel...,stochastic landau lifshitz gilbert llg equatio...,0,0,1,0,0,0
4,5,comparative study discrete wavelet transforms ...,fourier transform infra red ftir spectra sampl...,1,0,0,1,0,0


# Visualise the target labels

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

# The six indicator columns, one per subject area.
binary_labels=train[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']]
categories = list(binary_labels.columns.values)
# Column-wise sums: how many articles carry each label.
labels = binary_labels.sum().values

# x/y must be passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
ax = sns.barplot(x=labels, y=categories)

plt.title("Article", fontsize=24)
plt.ylabel('Article Belongs To', fontsize=18)
plt.xlabel('Number of articles', fontsize=18)

# adding the text labels: write each count just past the end of its horizontal bar
rects = ax.patches
for rect, label in zip(rects, labels):
    ax.annotate(
        str(label),
        xy=(rect.get_width(), rect.get_y() + rect.get_height() / 2),
        xytext=(4, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )
plt.show()

<Figure size 640x480 with 1 Axes>

# Train and Validation split

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned abstracts (and their label rows) for validation;
# the fixed random_state makes the split repeatable.
xtrain, xval, ytrain, yval = train_test_split(
    train['ABSTRACT'],
    binary_labels,
    test_size=0.2,
    random_state=9,
)

# TF-IDF (term frequency - inverse document frequency) weights a word by how
# characteristic it is of a document relative to the whole corpus.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse that fitted vectorizer to encode the validation split.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

# Model Building

Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report

# Run Logistic Regression: OneVsRestClassifier fits one binary model per label column
logreg = LogisticRegression()
logreg_classifier = OneVsRestClassifier(logreg)

# fit model on train data
logreg_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for validation set
predictions = logreg_classifier.predict(xval_tfidf)

# evaluate performance
print("Accuracy score for Logistic Regression:")
print(accuracy_score(yval, predictions))

print('f1-Score: ',f1_score(y_true=yval, y_pred=predictions, average='micro'))

# zero_division=0 reports 0 for undefined precision/recall/F1 (the same value the
# default produces) without emitting UndefinedMetricWarning.
print("Classification report:")
print(classification_report(yval, predictions, target_names=binary_labels.columns, zero_division=0))

Accuracy score for Logistic Regression:
0.6584028605482718
f1-Score:  0.8094270621698496
Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.83      0.85      0.84      1704
             Physics       0.96      0.80      0.87      1211
         Mathematics       0.87      0.77      0.82      1089
          Statistics       0.83      0.67      0.74      1069
Quantitative Biology       0.62      0.07      0.12       116
Quantitative Finance       0.60      0.07      0.12        44

           micro avg       0.86      0.76      0.81      5233
           macro avg       0.78      0.54      0.59      5233
        weighted avg       0.86      0.76      0.80      5233
         samples avg       0.82      0.80      0.79      5233



  _warn_prf(average, modifier, msg_start, len(result))


Gaussian Naive Bayes

In [9]:
# Using Gaussian Naive Bayes
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB, MultinomialNB


# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())
# train
classifier.fit(xtrain_tfidf, ytrain)
# predict
predictions = classifier.predict(xval_tfidf)

print("Accuracy score for Gaussian Naive Bayes:")
print(accuracy_score(yval, predictions))
print('f1-Score: ',f1_score(y_true=yval, y_pred=predictions, average='micro'))

# zero_division=0 reports 0 for undefined precision/recall/F1 (the same value the
# default produces) without emitting UndefinedMetricWarning.
print("Classification report:")
print(classification_report(yval, predictions, target_names=binary_labels.columns, zero_division=0))

Accuracy score for Gaussian Naive Bayes:
0.4307508939213349
f1-Score:  0.7034911525585844
Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.65      0.92      0.76      1704
             Physics       0.71      0.85      0.77      1211
         Mathematics       0.61      0.85      0.71      1089
          Statistics       0.48      0.82      0.61      1069
Quantitative Biology       0.08      0.06      0.07       116
Quantitative Finance       0.12      0.02      0.04        44

           micro avg       0.60      0.84      0.70      5233
           macro avg       0.44      0.59      0.49      5233
        weighted avg       0.60      0.84      0.70      5233
         samples avg       0.67      0.86      0.72      5233



  _warn_prf(average, modifier, msg_start, len(result))


Multinomial Naive Bayes

In [10]:
# Using Multinomial Naive Bayes
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB, MultinomialNB


# initialize binary relevance multi-label classifier
# with a multinomial naive bayes base classifier
classifier = BinaryRelevance(MultinomialNB())
# train
classifier.fit(xtrain_tfidf, ytrain)
# predict
predictions = classifier.predict(xval_tfidf)

print("Accuracy score for Multinomial Naive Bayes:")
print(accuracy_score(yval, predictions))
print('f1-Score: ',f1_score(y_true=yval, y_pred=predictions, average='micro'))

# zero_division=0 reports 0 for undefined precision/recall/F1 (the same value the
# default produces) without emitting UndefinedMetricWarning.
print("Classification report:")
print(classification_report(yval, predictions, target_names=binary_labels.columns, zero_division=0))

Accuracy score for Gaussian Naive Bayes:
0.634564958283671
f1-Score:  0.8028813394334664
Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.77      0.89      0.83      1704
             Physics       0.95      0.79      0.86      1211
         Mathematics       0.84      0.78      0.81      1089
          Statistics       0.75      0.76      0.75      1069
Quantitative Biology       0.67      0.02      0.03       116
Quantitative Finance       0.00      0.00      0.00        44

           micro avg       0.82      0.79      0.80      5233
           macro avg       0.66      0.54      0.55      5233
        weighted avg       0.82      0.79      0.79      5233
         samples avg       0.81      0.82      0.79      5233



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SGDClassifier

In [12]:
from sklearn.linear_model import SGDClassifier

# Fixed random_state keeps the stochastic optimiser repeatable between runs
sgd = SGDClassifier(random_state=9)

# Run SGDClassifier, wrapped in skmultilearn's BinaryRelevance
sgd_classifier = BinaryRelevance(sgd)

# fit model on train data
sgd_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for validation set
predictions = sgd_classifier.predict(xval_tfidf)

# evaluate performance
print("Accuracy score for SGDClassifier (BinaryRelevance):")
print(accuracy_score(yval, predictions))
print('f1-Score: ',f1_score(y_true=yval, y_pred=predictions, average='micro'))

# zero_division=0 reports 0 for undefined precision/recall/F1 (the same value the
# default produces) without emitting UndefinedMetricWarning.
print("Classification report:")
print(classification_report(yval, predictions, target_names=binary_labels.columns, zero_division=0))

Accuracy score for Logistic Regression:
0.6717520858164482
f1-Score:  0.8179635927185438
Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.82      0.86      0.84      1704
             Physics       0.95      0.81      0.87      1211
         Mathematics       0.88      0.78      0.83      1089
          Statistics       0.82      0.71      0.76      1069
Quantitative Biology       0.47      0.14      0.21       116
Quantitative Finance       0.72      0.30      0.42        44

           micro avg       0.86      0.78      0.82      5233
           macro avg       0.78      0.60      0.66      5233
        weighted avg       0.85      0.78      0.81      5233
         samples avg       0.83      0.82      0.81      5233



  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from sklearn.linear_model import SGDClassifier

# Fixed random_state keeps the stochastic optimiser repeatable between runs
sgd = SGDClassifier(random_state=9)

# Run SGDClassifier, wrapped in scikit-learn's OneVsRestClassifier
sgd_classifier = OneVsRestClassifier(sgd)

# fit model on train data
sgd_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for validation set
predictions = sgd_classifier.predict(xval_tfidf)

# evaluate performance
print("Accuracy score for SGDClassifier (OneVsRest):")
print(accuracy_score(yval, predictions))
print('f1-Score: ',f1_score(y_true=yval, y_pred=predictions, average='micro'))

# zero_division=0 reports 0 for undefined precision/recall/F1 (the same value the
# default produces) without emitting UndefinedMetricWarning.
print("Classification report:")
print(classification_report(yval, predictions, target_names=binary_labels.columns, zero_division=0))

Accuracy score for Logistic Regression:
0.6707985697258642
f1-Score:  0.8200378448361717
Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.82      0.88      0.85      1704
             Physics       0.95      0.81      0.88      1211
         Mathematics       0.88      0.78      0.83      1089
          Statistics       0.81      0.72      0.76      1069
Quantitative Biology       0.55      0.10      0.17       116
Quantitative Finance       0.72      0.30      0.42        44

           micro avg       0.86      0.79      0.82      5233
           macro avg       0.79      0.60      0.65      5233
        weighted avg       0.85      0.79      0.81      5233
         samples avg       0.83      0.82      0.81      5233



  _warn_prf(average, modifier, msg_start, len(result))


KNN Classifier

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

# K-nearest-neighbours with default settings, one binary model per label column
knnClf = KNeighborsClassifier()
classifier = OneVsRestClassifier(knnClf)
# fit model on train data
classifier.fit(xtrain_tfidf, ytrain)

# make predictions for validation set
predictions = classifier.predict(xval_tfidf)

# evaluate performance
print("Accuracy score for KNN Classifier:")
print(accuracy_score(yval, predictions))

print('f1-Score: ',f1_score(y_true=yval, y_pred=predictions, average='micro'))

# zero_division=0 reports 0 for undefined precision/recall/F1 (the same value the
# default produces) without emitting UndefinedMetricWarning.
print("Classification report:")
print(classification_report(yval, predictions, target_names=binary_labels.columns, zero_division=0))

Accuracy score for Logistic Regression:
0.5880810488676996
f1-Score:  0.7474048442906575
Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.76      0.80      0.78      1704
             Physics       0.90      0.73      0.81      1211
         Mathematics       0.75      0.71      0.73      1089
          Statistics       0.73      0.66      0.69      1069
Quantitative Biology       0.46      0.23      0.31       116
Quantitative Finance       0.70      0.48      0.57        44

           micro avg       0.77      0.72      0.75      5233
           macro avg       0.72      0.60      0.65      5233
        weighted avg       0.77      0.72      0.75      5233
         samples avg       0.76      0.75      0.74      5233



  _warn_prf(average, modifier, msg_start, len(result))
