In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import re
import nltk
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NEERAJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NEERAJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the training data from train.csv (path is relative to the working directory).
train=pd.read_csv('train.csv')
# Preview the first rows of the freshly loaded frame.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


# Preprocessing

In [3]:
# Load stopwords, common words such as  "a," "the," "it," etc.
# Stored as a set so the per-token `w not in stop_words` test in clean_text is a
# hash lookup instead of a linear scan over the whole stop-word list.
stop_words = set(stopwords.words('english'))

# Initialize stemmer, which will take words and convert words to their "stem," e.g. Playing-> Play
# NOTE: the stemming call in clean_text is currently commented out, so `ps` is unused there.
ps = PorterStemmer()

# Lowercases, removes non-alphabetical characters and extra whitespace, and drops stopwords
def clean_text(txt):
    """Normalise one document for bag-of-words vectorisation.

    Steps: lowercase, replace every non-letter with a space, tokenize with
    NLTK's ``word_tokenize``, then drop tokens found in ``stop_words``.

    Parameters
    ----------
    txt : str
        Raw text. Any non-string value (e.g. ``None`` or a float NaN, which is
        presumably how a missing cell would arrive from pandas) yields ``""``
        instead of raising on ``.lower()``.

    Returns
    -------
    str
        The kept tokens, each followed by a single space (so a non-empty
        result ends with a trailing space); ``""`` when nothing is kept.
    """
    # Guard against missing / non-text cells so Series.apply does not crash.
    if not isinstance(txt, str):
        return ""

    txt = txt.lower()                    # lowercase
    txt = re.sub("[^a-zA-Z]", " ", txt)  # keep alphabetical characters only
    tokens = word_tokenize(txt)          # split into a list of words

    # Stemming (ps.stem) is deliberately disabled: tokens are kept verbatim.
    # join() avoids quadratic string concatenation and no longer shadows the
    # function's own name with a local variable.
    return "".join(w + " " for w in tokens if w not in stop_words)


# Normalise both free-text columns with clean_text, overwriting them in place.
for text_column in ('TITLE', 'ABSTRACT'):
    train[text_column] = train[text_column].apply(clean_text)

In [4]:
# Re-display the first rows to inspect TITLE and ABSTRACT after clean_text was applied.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,reconstructing subject specific effect maps,predictive models allow subject specific infer...,1,0,0,0,0,0
1,2,rotation invariance neural network,rotation invariance translation invariance gre...,1,0,0,0,0,0
2,3,spherical polyharmonics poisson kernels polyha...,introduce develop notion spherical polyharmoni...,0,0,1,0,0,0
3,4,finite element approximation stochastic maxwel...,stochastic landau lifshitz gilbert llg equatio...,0,0,1,0,0,0
4,5,comparative study discrete wavelet transforms ...,fourier transform infra red ftir spectra sampl...,1,0,0,1,0,0


# Visualise the target labels

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt
# The six binary target columns; binary_labels is reused later as the multi-label target.
binary_labels=train[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']]
categories = list(binary_labels.columns.values)

# Horizontal bar chart: one bar per category, length = number of articles carrying that label.
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them positionally.
ax= sns.barplot(x=binary_labels.sum().values, y=categories)

plt.title("Article", fontsize=24)
plt.ylabel('Article Belongs To', fontsize=18)
plt.xlabel('Number of articles', fontsize=18)
#adding the text labels
rects = ax.patches
labels = binary_labels.sum().values
# Write each count just past the end of its bar, vertically centred on the bar.
for rect, label in zip(rects, labels):
    ax.text(rect.get_width(), rect.get_y() + rect.get_height() / 2,
            " %d" % label, ha='left', va='center')
plt.show()

<Figure size 640x480 with 1 Axes>

# Train and Validation split

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the abstracts (with their label rows) for validation;
# random_state pins the shuffle so the split is reproducible.
xtrain, xval, ytrain, yval = train_test_split(
    train['ABSTRACT'],
    binary_labels,
    test_size=0.2,
    random_state=9,
)

# TF-IDF = term frequency - inverse document frequency, a weight for how
# important a word is to a document. max_df=0.8 and max_features=10000 limit
# which terms make it into the vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, max_df=0.8)

# Fit the vectorizer on the training split only, then reuse it unchanged
# to transform the validation split.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

# Model Building

Logistic Regression

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report

# Run Logistic Regression: one class-balanced binary model per label (one-vs-rest)
logreg = LogisticRegression(class_weight='balanced',n_jobs=-1,random_state=9)
logreg_classifier = OneVsRestClassifier(logreg)

# fit model on train data
logreg_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred=logreg_classifier.predict(xtrain_tfidf)
yval_pred = logreg_classifier.predict(xval_tfidf)

# evaluate performance
# (accuracy_score, f1_score and classification_report are imported at the top of this cell)
print()
print("Accuracy score for Logistic Regression train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for Logistic Regression validation:",accuracy_score(yval, yval_pred))

print('\n')

# micro-averaged F1 pools the decisions of all six labels
print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))


Accuracy score for Logistic Regression train:  0.718364427490016
Accuracy score for Logistic Regression validation: 0.6417163289630512


f1-Score Train:  0.8750166970924795
f1-Score validation:  0.821957385337667


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.81      0.90      0.85      1704
             Physics       0.89      0.87      0.88      1211
         Mathematics       0.76      0.87      0.82      1089
          Statistics       0.72      0.85      0.78      1069
Quantitative Biology       0.38      0.64      0.48       116
Quantitative Finance       0.54      0.82      0.65        44

           micro avg       0.78      0.87      0.82      5233
           macro avg       0.68      0.82      0.74      5233
        weighted avg       0.79      0.87      0.83      5233
         samples avg       0.82      0.90      0.84      5233



Hyperparameter tuning Logistic Regression

In [94]:

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report

# Run Logistic Regression with C=0.07 (a smaller C means stronger regularisation)
logreg1 = LogisticRegression(class_weight='balanced',n_jobs=-1,random_state=9,C=0.07)
logreg1_classifier = OneVsRestClassifier(logreg1)

# fit model on train data
logreg1_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred=logreg1_classifier.predict(xtrain_tfidf)
yval_pred = logreg1_classifier.predict(xval_tfidf)

# evaluate performance
# (accuracy_score, f1_score and classification_report are imported at the top of this cell)
print()
print("Accuracy score for Logistic Regression train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for Logistic Regression validation:",accuracy_score(yval, yval_pred))

print('\n')

# micro-averaged F1 pools the decisions of all six labels
print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))



Accuracy score for Logistic Regression train:  0.6141145616021935
Accuracy score for Logistic Regression validation: 0.60119189511323


f1-Score Train:  0.8153880168030069
f1-Score validation:  0.8018851147074515


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.78      0.89      0.83      1704
             Physics       0.90      0.84      0.87      1211
         Mathematics       0.76      0.87      0.81      1089
          Statistics       0.68      0.85      0.76      1069
Quantitative Biology       0.29      0.72      0.41       116
Quantitative Finance       0.51      0.82      0.63        44

           micro avg       0.75      0.86      0.80      5233
           macro avg       0.65      0.83      0.72      5233
        weighted avg       0.77      0.86      0.81      5233
         samples avg       0.80      0.89      0.82      5233



In [97]:
from sklearn.model_selection import KFold,cross_val_score
 
# 10-fold shuffled cross-validation of the tuned logistic regression on the training split.
# NOTE(review): xtrain_tfidf was produced by fit_transform on the whole training split
# before CV, so each held-out fold has already contributed to the vocabulary/IDF weights.
fold = KFold(n_splits=10, shuffle=True, random_state=9)

logreg1 = LogisticRegression(
    C=0.07,
    class_weight='balanced',
    n_jobs=-1,
    random_state=9,
)
logreg1_classifier = OneVsRestClassifier(logreg1)

score = cross_val_score(
    estimator=logreg1_classifier,
    X=xtrain_tfidf,
    y=ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

In [98]:
# Show the per-fold micro-F1 values returned by cross_val_score (one entry per fold).
score

array([0.79928476, 0.80212249, 0.80363073, 0.80286097, 0.79304193,
       0.78756937, 0.77809545, 0.80322004, 0.80408254, 0.79820022])

In [101]:
# Summarise the folds as mean micro-F1 with a spread of two standard deviations.
print("f1_score: %0.2f (+/- %0.2f)" % (np.mean(score), 2 * np.std(score)))

f1_score: 0.80 (+/- 0.02)


Multinomial Naive Bayes

In [64]:
# Using Multinomial Naive Bayes 
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB, MultinomialNB


# initialize a one-vs-rest multi-label classifier
# with a Multinomial Naive Bayes base classifier (one model per label column)
classifier = OneVsRestClassifier(MultinomialNB())
# train
classifier.fit(xtrain_tfidf, ytrain)
# predict on both the training and the validation set
ytrain_pred=classifier.predict(xtrain_tfidf)
yval_pred = classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for Multinomial NB train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for Multinomial NB validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for Gussian NB train:  0.6609048101567623
Accuracy score for Gussian NB validation: 0.634564958283671


f1-Score Train:  0.8210241676525266
f1-Score validation:  0.8028813394334664


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.77      0.89      0.83      1704
             Physics       0.95      0.79      0.86      1211
         Mathematics       0.84      0.78      0.81      1089
          Statistics       0.75      0.76      0.75      1069
Quantitative Biology       0.67      0.02      0.03       116
Quantitative Finance       0.00      0.00      0.00        44

           micro avg       0.82      0.79      0.80      5233
           macro avg       0.66      0.54      0.55      5233
        weighted avg       0.82      0.79      0.79      5233
         samples avg       0.81      0.82      0.79      5233



In [105]:
# 10-fold shuffled cross-validation of one-vs-rest Multinomial NB on the training split.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
classifier = OneVsRestClassifier(MultinomialNB())

scores = cross_val_score(
    estimator=classifier,
    X=xtrain_tfidf,
    y=ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

In [106]:
# Show the per-fold micro-F1 values returned by cross_val_score (one entry per fold).
scores

array([0.79539892, 0.79484686, 0.80674322, 0.81271228, 0.79873909,
       0.7861605 , 0.78578384, 0.80774818, 0.80494905, 0.80265748])

In [107]:
# Summarise the folds as mean micro-F1 with a spread of two standard deviations.
print("f1_score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

f1_score: 0.80 (+/- 0.02)


SGDClassifier

In [72]:
from sklearn.linear_model import SGDClassifier
# Baseline SGDClassifier with library defaults; only the seed is pinned.
sgd = SGDClassifier(random_state=9)


#Run SGDClassifier, one binary model per label (one-vs-rest)
sgd_classifier = OneVsRestClassifier(sgd)

# fit model on train data
sgd_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred=sgd_classifier.predict(xtrain_tfidf)
yval_pred = sgd_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for SGDClassifier train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for SGDClassifier validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for Gussian NB train:  0.7593133456517852
Accuracy score for Gussian NB validation: 0.6707985697258642


f1-Score Train:  0.8800039294660836
f1-Score validation:  0.8200378448361717


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.82      0.88      0.85      1704
             Physics       0.95      0.81      0.88      1211
         Mathematics       0.88      0.78      0.83      1089
          Statistics       0.81      0.72      0.76      1069
Quantitative Biology       0.55      0.10      0.17       116
Quantitative Finance       0.72      0.30      0.42        44

           micro avg       0.86      0.79      0.82      5233
           macro avg       0.79      0.60      0.65      5233
        weighted avg       0.85      0.79      0.81      5233
         samples avg       0.83      0.82      0.81      5233



In [109]:
from sklearn.linear_model import SGDClassifier
# Tuned SGDClassifier: class-balanced, alpha raised to 0.0010, seed pinned.
sgd1 = SGDClassifier(
    alpha=0.0010,
    class_weight='balanced',
    n_jobs=-1,
    random_state=9,
)

# Wrap one-vs-rest and fit on the training features (fit returns the wrapper itself).
sgd1_classifier = OneVsRestClassifier(sgd1)
sgd1_classifier.fit(xtrain_tfidf, ytrain)

# Predict on both splits.
ytrain_pred = sgd1_classifier.predict(xtrain_tfidf)
yval_pred = sgd1_classifier.predict(xval_tfidf)

# Compute the metrics first, then report them.
train_accuracy = accuracy_score(ytrain, ytrain_pred)
val_accuracy = accuracy_score(yval, yval_pred)
train_f1 = f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro')
val_f1 = f1_score(y_true=yval, y_pred=yval_pred, average='micro')
val_report = classification_report(yval, yval_pred, target_names=binary_labels.columns)

print("Accuracy score for SGDClassifier train: ", train_accuracy)
print("Accuracy score for SGDClassifier validation:", val_accuracy)
print('\n')
print('f1-Score Train: ', train_f1)
print('f1-Score validation: ', val_f1)
print('\n')
print("Classification report:")
print(val_report)

Accuracy score for SGDClassifier train:  0.6367050128151636
Accuracy score for SGDClassifier validation: 0.6169249106078665


f1-Score Train:  0.8282823810683373
f1-Score validation:  0.8103617686467173


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.78      0.91      0.84      1704
             Physics       0.92      0.83      0.87      1211
         Mathematics       0.77      0.87      0.82      1089
          Statistics       0.69      0.87      0.77      1069
Quantitative Biology       0.33      0.72      0.45       116
Quantitative Finance       0.48      0.82      0.61        44

           micro avg       0.76      0.87      0.81      5233
           macro avg       0.66      0.83      0.72      5233
        weighted avg       0.78      0.87      0.82      5233
         samples avg       0.81      0.89      0.82      5233



In [110]:
# 10-fold shuffled cross-validation of the tuned SGD model on the training split.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
sgd1_classifier = OneVsRestClassifier(sgd1)  # fresh, unfitted wrapper

scores = cross_val_score(
    estimator=sgd1_classifier,
    X=xtrain_tfidf,
    y=ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with a spread of two standard deviations.
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[0.81012091 0.80823738 0.81124143 0.81438463 0.80375084 0.79564638
 0.79117971 0.80846325 0.81745325 0.80802163]


f1_score: 0.81 (+/- 0.02)


KNN Classifier

In [90]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

# K-nearest-neighbours with library defaults, one binary model per label (one-vs-rest).
knnClf = KNeighborsClassifier()
knn_classifier = OneVsRestClassifier(knnClf)
# fit model on train data
knn_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred=knn_classifier.predict(xtrain_tfidf)
yval_pred = knn_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for KNN train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for KNN validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for SGDClassifier train:  0.6913035703641891
Accuracy score for SGDClassifier validation: 0.5880810488676996


f1-Score Train:  0.8278073810933492
f1-Score validation:  0.7474048442906575


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.76      0.80      0.78      1704
             Physics       0.90      0.73      0.81      1211
         Mathematics       0.75      0.71      0.73      1089
          Statistics       0.73      0.66      0.69      1069
Quantitative Biology       0.46      0.23      0.31       116
Quantitative Finance       0.70      0.48      0.57        44

           micro avg       0.77      0.72      0.75      5233
           macro avg       0.72      0.60      0.65      5233
        weighted avg       0.77      0.72      0.75      5233
         samples avg       0.76      0.75      0.74      5233



In [112]:
# 10-fold shuffled cross-validation of the KNN model on the training split.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
knn_classifier = OneVsRestClassifier(knnClf)  # fresh, unfitted wrapper

scores = cross_val_score(
    estimator=knn_classifier,
    X=xtrain_tfidf,
    y=ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with a spread of two standard deviations.
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[0.75536695 0.74881988 0.76523031 0.76176325 0.75548902 0.74648235
 0.74041079 0.7541553  0.75987246 0.74949799]


f1_score: 0.75 (+/- 0.01)


# Decision Tree

In [152]:
from sklearn.tree import DecisionTreeClassifier

# Class-balanced decision tree with no depth limit, one binary model per label (one-vs-rest).
dt=DecisionTreeClassifier(random_state=9,class_weight='balanced')
dt_classifier = OneVsRestClassifier(dt)
# fit model on train data
dt_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred=dt_classifier.predict(xtrain_tfidf)
yval_pred = dt_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for Decision Tree train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for Decision Tree validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for SGDClassifier train:  1.0
Accuracy score for SGDClassifier validation: 0.4154946364719905


f1-Score Train:  1.0
f1-Score validation:  0.6648321408915795


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.67      0.73      0.69      1704
             Physics       0.73      0.74      0.74      1211
         Mathematics       0.64      0.70      0.67      1089
          Statistics       0.60      0.63      0.61      1069
Quantitative Biology       0.21      0.34      0.26       116
Quantitative Finance       0.23      0.50      0.32        44

           micro avg       0.64      0.69      0.66      5233
           macro avg       0.51      0.60      0.55      5233
        weighted avg       0.65      0.69      0.67      5233
         samples avg       0.63      0.72      0.65      5233



In [113]:
# 10-fold shuffled cross-validation of the current `dt` estimator on the training split.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
dt_classifier = OneVsRestClassifier(dt)  # fresh, unfitted wrapper

scores = cross_val_score(
    estimator=dt_classifier,
    X=xtrain_tfidf,
    y=ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with a spread of two standard deviations.
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[0.67749529 0.67510549 0.67334765 0.67692308 0.66682386 0.65061379
 0.65978414 0.67754137 0.68122066 0.67964353]


f1_score: 0.67 (+/- 0.02)


Decision Tree Hyperparameter tuning

In [124]:
# Tuned decision tree: entropy criterion and depth capped at 7.
dt=DecisionTreeClassifier( class_weight='balanced', criterion='entropy', max_depth=7, random_state=9)
dt_classifier = OneVsRestClassifier(dt)
# fit model on train data
dt_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred=dt_classifier.predict(xtrain_tfidf)
yval_pred = dt_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for Decision Tree train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for Decision Tree validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for SGDClassifier train:  0.3681826309828932
Accuracy score for SGDClassifier validation: 0.33039332538736593


f1-Score Train:  0.6385234832191268
f1-Score validation:  0.6053076234966703


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.68      0.68      0.68      1704
             Physics       0.85      0.46      0.60      1211
         Mathematics       0.72      0.41      0.52      1089
          Statistics       0.57      0.76      0.65      1069
Quantitative Biology       0.17      0.46      0.24       116
Quantitative Finance       0.24      0.55      0.33        44

           micro avg       0.63      0.58      0.61      5233
           macro avg       0.54      0.55      0.50      5233
        weighted avg       0.69      0.58      0.61      5233
         samples avg       0.51      0.59      0.53      5233



In [125]:
# 10-fold shuffled cross-validation of the current `dt` estimator on the training split.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
dt_classifier = OneVsRestClassifier(dt)  # fresh, unfitted wrapper

scores = cross_val_score(
    estimator=dt_classifier,
    X=xtrain_tfidf,
    y=ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with a spread of two standard deviations.
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[0.59883586 0.60016287 0.59055441 0.59114359 0.600085   0.57711681
 0.59846421 0.6031746  0.58825919 0.58619824]


f1_score: 0.59 (+/- 0.02)


# Random Forest Classifier

In [126]:
from sklearn.ensemble import RandomForestClassifier

In [128]:
# Class-balanced random forest with library defaults, one binary model per label (one-vs-rest).
rf=RandomForestClassifier(random_state=9,class_weight='balanced')
rf_classifier = OneVsRestClassifier(rf)
# fit model on train data
rf_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred=rf_classifier.predict(xtrain_tfidf)
yval_pred = rf_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for Random Forest train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for Random Forest validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for SGDClassifier train:  1.0
Accuracy score for SGDClassifier validation: 0.6317044100119189


f1-Score Train:  1.0
f1-Score validation:  0.7929852852247531


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.78      0.89      0.83      1704
             Physics       0.94      0.77      0.85      1211
         Mathematics       0.92      0.70      0.79      1089
          Statistics       0.79      0.67      0.73      1069
Quantitative Biology       0.00      0.00      0.00       116
Quantitative Finance       1.00      0.05      0.09        44

           micro avg       0.84      0.75      0.79      5233
           macro avg       0.74      0.51      0.55      5233
        weighted avg       0.83      0.75      0.78      5233
         samples avg       0.80      0.79      0.77      5233



In [129]:
# 10-fold shuffled cross-validation of the current `rf` estimator on the training split.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
rf_classifier = OneVsRestClassifier(rf)  # fresh, unfitted wrapper

scores = cross_val_score(
    estimator=rf_classifier,
    X=xtrain_tfidf,
    y=ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with a spread of two standard deviations.
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[0.77983278 0.78648853 0.79283371 0.78638083 0.79056795 0.76992936
 0.77031093 0.78061096 0.78778703 0.77749491]


f1_score: 0.78 (+/- 0.02)


Random forest classifier hyperparameter tuning

In [130]:
# Random forest variant with 50 trees, trained in parallel (n_jobs=-1).
rf=RandomForestClassifier(random_state=9,class_weight='balanced',n_jobs=-1,n_estimators=50)
rf_classifier = OneVsRestClassifier(rf)
# fit model on train data
rf_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred=rf_classifier.predict(xtrain_tfidf)
yval_pred = rf_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for Random Forest train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for Random Forest validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for SGDClassifier train:  0.9992251296417715
Accuracy score for SGDClassifier validation: 0.6193087008343265


f1-Score Train:  0.9996908809891809
f1-Score validation:  0.7839675291730087


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.77      0.87      0.82      1704
             Physics       0.94      0.77      0.85      1211
         Mathematics       0.92      0.69      0.79      1089
          Statistics       0.78      0.67      0.72      1069
Quantitative Biology       0.00      0.00      0.00       116
Quantitative Finance       0.50      0.02      0.04        44

           micro avg       0.84      0.74      0.78      5233
           macro avg       0.65      0.50      0.54      5233
        weighted avg       0.82      0.74      0.77      5233
         samples avg       0.79      0.77      0.76      5233



In [131]:
# 10-fold shuffled cross-validation of the current `rf` estimator on the training split.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
rf_classifier = OneVsRestClassifier(rf)  # fresh, unfitted wrapper

scores = cross_val_score(
    estimator=rf_classifier,
    X=xtrain_tfidf,
    y=ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with a spread of two standard deviations.
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[0.775884   0.77859126 0.78738555 0.78007663 0.78457244 0.75830586
 0.7609904  0.76891823 0.78892558 0.76871623]


f1_score: 0.78 (+/- 0.02)


# Bagging Classifier

In [134]:
from sklearn.ensemble import BaggingClassifier

Bagged Logistic Regression

In [135]:
# Bagging ensemble whose members are the tuned (C=0.07) logistic regression.
lr=LogisticRegression(class_weight='balanced',n_jobs=-1,random_state=9,C=0.07)
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name later removed),
# while the first positional parameter is the base model under either name.
lr_bag=BaggingClassifier(lr,random_state=9,n_jobs=-1)
lr_bag_classifier = OneVsRestClassifier(lr_bag)
# fit model on train data
lr_bag_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred = lr_bag_classifier.predict(xtrain_tfidf)
yval_pred = lr_bag_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for Bagged Logistic Regression train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for Bagged Logistic Regression validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for SGDClassifier train:  0.6204327352923645
Accuracy score for SGDClassifier validation: 0.6097735399284863


f1-Score Train:  0.8170808008580622
f1-Score validation:  0.8059674665228722


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.78      0.88      0.83      1704
             Physics       0.90      0.84      0.87      1211
         Mathematics       0.76      0.87      0.81      1089
          Statistics       0.69      0.85      0.76      1069
Quantitative Biology       0.33      0.66      0.44       116
Quantitative Finance       0.55      0.80      0.65        44

           micro avg       0.76      0.86      0.81      5233
           macro avg       0.67      0.81      0.73      5233
        weighted avg       0.78      0.86      0.81      5233
         samples avg       0.80      0.88      0.82      5233



# Boosting Classifier

XGBoost and Boosted Logistic Regression

In [145]:
from xgboost import XGBClassifier

In [148]:
# Gradient-boosted trees (XGBoost) with library defaults; only the seed is pinned.
xgb=XGBClassifier(random_state=9) 

xgb_classifier = OneVsRestClassifier(xgb)
# fit model on train data
xgb_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred = xgb_classifier.predict(xtrain_tfidf)
yval_pred = xgb_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for XGBoost train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for XGBoost validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for SGDClassifier train:  0.8478273827263515
Accuracy score for SGDClassifier validation: 0.6193087008343265


f1-Score Train:  0.9302984418502418
f1-Score validation:  0.7858150312311102


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.80      0.82      0.81      1704
             Physics       0.93      0.77      0.84      1211
         Mathematics       0.85      0.75      0.80      1089
          Statistics       0.78      0.67      0.72      1069
Quantitative Biology       0.61      0.24      0.35       116
Quantitative Finance       0.52      0.27      0.36        44

           micro avg       0.83      0.75      0.79      5233
           macro avg       0.75      0.59      0.65      5233
        weighted avg       0.83      0.75      0.78      5233
         samples avg       0.79      0.78      0.77      5233



In [149]:
from sklearn.ensemble import AdaBoostClassifier

In [151]:
# AdaBoost ensemble whose weak learner is a class-balanced logistic regression.
lr=LogisticRegression(class_weight='balanced',n_jobs=-1,random_state=9)
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name later removed),
# while the first positional parameter is the base model under either name.
lr_boost=AdaBoostClassifier(lr,random_state=9)

lr_boost_classifier = OneVsRestClassifier(lr_boost)
# fit model on train data
lr_boost_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred = lr_boost_classifier.predict(xtrain_tfidf)
yval_pred = lr_boost_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for AdaBoost Logistic Regression train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for AdaBoost Logistic Regression validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for SGDClassifier train:  0.564284437026882
Accuracy score for SGDClassifier validation: 0.564719904648391


f1-Score Train:  0.7807272808050293
f1-Score validation:  0.7787532145074044


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.75      0.85      0.80      1704
             Physics       0.89      0.82      0.86      1211
         Mathematics       0.76      0.85      0.80      1089
          Statistics       0.65      0.84      0.74      1069
Quantitative Biology       0.24      0.72      0.36       116
Quantitative Finance       0.51      0.75      0.61        44

           micro avg       0.73      0.84      0.78      5233
           macro avg       0.64      0.81      0.69      5233
        weighted avg       0.75      0.84      0.79      5233
         samples avg       0.77      0.87      0.79      5233



In [None]:
# AdaBoost ensemble whose weak learner is the depth-7 entropy decision tree.
dt=DecisionTreeClassifier( class_weight='balanced', criterion='entropy', max_depth=7, random_state=9)
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name later removed),
# while the first positional parameter is the base model under either name.
dt_boost=AdaBoostClassifier(dt,random_state=9)

dt_boost_classifier = OneVsRestClassifier(dt_boost)
# fit model on train data
dt_boost_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for both the training and the validation set
ytrain_pred = dt_boost_classifier.predict(xtrain_tfidf)
yval_pred = dt_boost_classifier.predict(xval_tfidf)

# The printed labels now name the model that is actually evaluated here.
print("Accuracy score for AdaBoost Decision Tree train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for AdaBoost Decision Tree validation:",accuracy_score(yval, yval_pred))

print('\n')

print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))