In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import re
import nltk
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on: the stop-word lists and the
# "punkt" tokenizer data. The second call is the cell's last expression, so its
# return value is what the cell displays.
# NOTE(review): recent NLTK releases may also require the 'punkt_tab' resource for
# word_tokenize -- confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NEERAJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NEERAJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the training and test tables from CSV files located in the working directory.
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [3]:
# Preview the first rows of the test frame (before any text cleaning).
test.head()

Unnamed: 0,ID,TITLE,ABSTRACT
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...
1,20974,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...
2,20975,Case For Static AMSDU Aggregation in WLANs,Frame aggregation is a mechanism by which mu...
3,20976,The $Gaia$-ESO Survey: the inner disk intermed...,Milky Way open clusters are very diverse in ...
4,20977,Witness-Functions versus Interpretation-Functi...,Proving that a cryptographic protocol is cor...


# Preprocessing

In [4]:
# Load the NLTK English stop words (common words such as "a", "the", "it").
# Stored as a set: clean_text tests every token for membership, and a set makes
# that test O(1) instead of a linear scan over the list.
stop_words = set(stopwords.words('english'))

# Initialize the Porter stemmer, which reduces words to their stem (e.g. Playing -> Play).
# NOTE: stemming is currently disabled in clean_text, so `ps` is not used yet.
ps = PorterStemmer()

def clean_text(txt):
    """Normalize raw text into a space-separated string of lowercase word tokens.

    Processing steps:
      1. lowercase the text;
      2. replace every non-alphabetical character with a space;
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop every token that appears in ``stop_words``.

    Stemming is intentionally not applied; tokens are kept unchanged. To enable it,
    map each kept token through ``ps.stem``.

    Parameters
    ----------
    txt : str
        Raw text. Any non-string value (for example the NaN pandas produces for an
        empty CSV cell) is treated as empty text instead of raising.

    Returns
    -------
    str
        The kept tokens joined by single spaces, without a trailing space.
        Returns "" when no token survives.
    """
    # Missing values reach this function as non-strings when used with Series.apply.
    if not isinstance(txt, str):
        return ""

    letters_only = re.sub("[^a-zA-Z]", " ", txt.lower())
    tokens = word_tokenize(letters_only)

    # join() builds the result in one pass (no repeated string concatenation) and
    # avoids shadowing the function name with a local variable.
    return " ".join(token for token in tokens if token not in stop_words)


# Clean both free-text columns of the training frame first, then of the test frame,
# replacing each column with its cleaned version.
for frame in (train, test):
    for text_column in ('TITLE', 'ABSTRACT'):
        frame[text_column] = frame[text_column].apply(clean_text)

In [5]:
# Preview the training frame after TITLE and ABSTRACT have been cleaned.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,reconstructing subject specific effect maps,predictive models allow subject specific infer...,1,0,0,0,0,0
1,2,rotation invariance neural network,rotation invariance translation invariance gre...,1,0,0,0,0,0
2,3,spherical polyharmonics poisson kernels polyha...,introduce develop notion spherical polyharmoni...,0,0,1,0,0,0
3,4,finite element approximation stochastic maxwel...,stochastic landau lifshitz gilbert llg equatio...,0,0,1,0,0,0
4,5,comparative study discrete wavelet transforms ...,fourier transform infra red ftir spectra sampl...,1,0,0,1,0,0


In [6]:
# Preview the test frame after TITLE and ABSTRACT have been cleaned.
test.head()

Unnamed: 0,ID,TITLE,ABSTRACT
0,20973,closed form marginal likelihood gamma poisson ...,present novel understandings gamma poisson gap...
1,20974,laboratory mid ir spectra equilibrated igneous...,meteorites contain minerals solar system aster...
2,20975,case static amsdu aggregation wlans,frame aggregation mechanism multiple frames co...
3,20976,gaia eso survey inner disk intermediate age op...,milky way open clusters diverse terms age chem...
4,20977,witness functions versus interpretation functi...,proving cryptographic protocol correct secrecy...


# Visualise the target labels

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

# The six 0/1 indicator columns are the multi-label targets used by every model below.
binary_labels=train[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']]
categories = list(binary_labels.columns.values)

# Number of articles tagged with each category (column sums of the indicators).
labels = binary_labels.sum().values

# Horizontal bars: counts on x, category names on y. x/y are passed by keyword
# because seaborn >= 0.12 no longer accepts them as positional arguments.
ax = sns.barplot(x=labels, y=categories)

plt.title("Article", fontsize=24)
plt.ylabel('Article Belongs To', fontsize=18)
plt.xlabel('Number of articles', fontsize=18)

# adding the text labels: write each count just past the end of its bar
rects = ax.patches
for rect, label in zip(rects, labels):
    ax.text(
        rect.get_width(),
        rect.get_y() + rect.get_height() / 2,
        " %d" % label,
        ha='left',
        va='center',
    )
plt.show()

<Figure size 640x480 with 1 Axes>

# Train and Validation split

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled abstracts for validation (fixed seed for repeatability).
xtrain, xval, ytrain, yval = train_test_split(
    train['ABSTRACT'],
    binary_labels,
    test_size=0.2,
    random_state=9,
)

# TF-IDF (term frequency - inverse document frequency) features.
# max_df=0.8 ignores terms present in more than 80% of documents and
# max_features=10000 caps the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=10000,
)

# The vectorizer is fitted on the training split only; the validation split is
# merely transformed with the vocabulary learned from training.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

# Model Building

Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report

# Baseline: one class-weighted logistic regression per label, combined one-vs-rest.
logreg = LogisticRegression(
    class_weight='balanced',
    n_jobs=-1,
    random_state=9,
)
logreg_classifier = OneVsRestClassifier(logreg)
logreg_classifier.fit(xtrain_tfidf, ytrain)

# Predict on both splits so train and validation scores can be compared.
ytrain_pred = logreg_classifier.predict(xtrain_tfidf)
yval_pred = logreg_classifier.predict(xval_tfidf)

# Accuracy on each split.
print()
print(
    "Accuracy score for Logistic Regression train: ",
    accuracy_score(ytrain, ytrain_pred),
)
print(
    "Accuracy score for Logistic Regression validation:",
    accuracy_score(yval, yval_pred),
)

print('\n')

# Micro-averaged F1 on each split.
print('f1-Score Train: ', f1_score(ytrain, ytrain_pred, average='micro'))
print('f1-Score validation: ', f1_score(yval, yval_pred, average='micro'))

print('\n')

# Per-label precision / recall / F1 on the validation split.
print("Classification report:")
print(
    classification_report(
        yval,
        yval_pred,
        target_names=binary_labels.columns,
    )
)


Accuracy score for Logistic Regression train:  0.718364427490016
Accuracy score for Logistic Regression validation: 0.6417163289630512


f1-Score Train:  0.8750166970924795
f1-Score validation:  0.821957385337667


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.81      0.90      0.85      1704
             Physics       0.89      0.87      0.88      1211
         Mathematics       0.76      0.87      0.82      1089
          Statistics       0.72      0.85      0.78      1069
Quantitative Biology       0.38      0.64      0.48       116
Quantitative Finance       0.54      0.82      0.65        44

           micro avg       0.78      0.87      0.82      5233
           macro avg       0.68      0.82      0.74      5233
        weighted avg       0.79      0.87      0.83      5233
         samples avg       0.82      0.90      0.84      5233



Hyperparameter tuning Logistic Regression

In [10]:

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report

# Same one-vs-rest logistic regression as the baseline, but with C=0.07.
logreg1 = LogisticRegression(
    class_weight='balanced',
    n_jobs=-1,
    random_state=9,
    C=0.07,
)
logreg1_classifier = OneVsRestClassifier(logreg1)
logreg1_classifier.fit(xtrain_tfidf, ytrain)

# Predict on both splits so train and validation scores can be compared.
ytrain_pred = logreg1_classifier.predict(xtrain_tfidf)
yval_pred = logreg1_classifier.predict(xval_tfidf)

# Accuracy on each split.
print()
print(
    "Accuracy score for Logistic Regression train: ",
    accuracy_score(ytrain, ytrain_pred),
)
print(
    "Accuracy score for Logistic Regression validation:",
    accuracy_score(yval, yval_pred),
)

print('\n')

# Micro-averaged F1 on each split.
print('f1-Score Train: ', f1_score(ytrain, ytrain_pred, average='micro'))
print('f1-Score validation: ', f1_score(yval, yval_pred, average='micro'))

print('\n')

# Per-label precision / recall / F1 on the validation split.
print("Classification report:")
print(
    classification_report(
        yval,
        yval_pred,
        target_names=binary_labels.columns,
    )
)



Accuracy score for Logistic Regression train:  0.6141145616021935
Accuracy score for Logistic Regression validation: 0.60119189511323


f1-Score Train:  0.8153880168030069
f1-Score validation:  0.8018851147074515


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.78      0.89      0.83      1704
             Physics       0.90      0.84      0.87      1211
         Mathematics       0.76      0.87      0.81      1089
          Statistics       0.68      0.85      0.76      1069
Quantitative Biology       0.29      0.72      0.41       116
Quantitative Finance       0.51      0.82      0.63        44

           micro avg       0.75      0.86      0.80      5233
           macro avg       0.65      0.83      0.72      5233
        weighted avg       0.77      0.86      0.81      5233
         samples avg       0.80      0.89      0.82      5233



In [11]:
from sklearn.model_selection import KFold,cross_val_score,StratifiedKFold

# 10-fold shuffled cross-validation on the training split, scored with micro-F1.
fold = KFold(n_splits=10, shuffle=True, random_state=9)

# NOTE(review): this rebinds `logreg1` WITHOUT C=0.07, so the cross-validated model
# uses the default regularization strength rather than the C=0.07 variant fitted
# in the previous cell -- confirm that is intended.
logreg1 = LogisticRegression(
    class_weight='balanced',
    n_jobs=-1,
    random_state=9,
)
logreg1_classifier = OneVsRestClassifier(logreg1)

score = cross_val_score(
    logreg1_classifier,
    xtrain_tfidf,
    ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

In [12]:
# Display the per-fold micro-F1 scores from the logistic regression cross-validation.
score

array([0.82057091, 0.81669284, 0.82466802, 0.82742961, 0.81446328,
       0.80876761, 0.79848754, 0.82181329, 0.82743063, 0.81916113])

In [13]:
# Summarize the folds: mean micro-F1, with twice the standard deviation as the spread.
print("f1_score: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

f1_score: 0.82 (+/- 0.02)


Multinomial Naive Bayes

In [14]:
# Using Multinomial Naive Bayes
# NOTE(review): BinaryRelevance and GaussianNB are imported here but not used in
# this cell; the imports are kept in case another cell relies on them.
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB, MultinomialNB


# One-vs-rest multi-label classifier with a multinomial naive Bayes model per label
classifier = OneVsRestClassifier(MultinomialNB())
# train
classifier.fit(xtrain_tfidf, ytrain)
# predict on both splits so train and validation scores can be compared
ytrain_pred=classifier.predict(xtrain_tfidf)
yval_pred = classifier.predict(xval_tfidf)

# The labels below name the model actually fitted (MultinomialNB); they previously
# said "Gussian NB", which mislabelled the results.
print("Accuracy score for Multinomial NB train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for Multinomial NB validation:",accuracy_score(yval, yval_pred))

print('\n')

# Micro-averaged F1 on each split.
print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

# Per-label precision / recall / F1 on the validation split.
print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for Gussian NB train:  0.6609048101567623
Accuracy score for Gussian NB validation: 0.634564958283671


f1-Score Train:  0.8210241676525266
f1-Score validation:  0.8028813394334664


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.77      0.89      0.83      1704
             Physics       0.95      0.79      0.86      1211
         Mathematics       0.84      0.78      0.81      1089
          Statistics       0.75      0.76      0.75      1069
Quantitative Biology       0.67      0.02      0.03       116
Quantitative Finance       0.00      0.00      0.00        44

           micro avg       0.82      0.79      0.80      5233
           macro avg       0.66      0.54      0.55      5233
        weighted avg       0.82      0.79      0.79      5233
         samples avg       0.81      0.82      0.79      5233



In [15]:
# 10-fold shuffled cross-validation of an unfitted one-vs-rest MultinomialNB model
# on the training split, scored with micro-F1.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
classifier = OneVsRestClassifier(MultinomialNB())

scores = cross_val_score(
    classifier,
    xtrain_tfidf,
    ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

In [16]:
# Display the per-fold micro-F1 scores from the most recent cross-validation run.
scores

array([0.79539892, 0.79484686, 0.80674322, 0.81271228, 0.79873909,
       0.7861605 , 0.78578384, 0.80774818, 0.80494905, 0.80265748])

In [17]:
# Summarize the folds: mean micro-F1, with twice the standard deviation as the spread.
print("f1_score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

f1_score: 0.80 (+/- 0.02)


SGDClassifier

In [61]:
from sklearn.linear_model import SGDClassifier

# Class-weighted linear model trained with SGD using the modified Huber loss,
# wrapped one-vs-rest so that one model is fitted per label.
sgd = SGDClassifier(
    random_state=9,
    class_weight='balanced',
    loss='modified_huber',
)
sgd_classifier = OneVsRestClassifier(sgd)
sgd_classifier.fit(xtrain_tfidf, ytrain)

# Predict on both splits so train and validation scores can be compared.
ytrain_pred = sgd_classifier.predict(xtrain_tfidf)
yval_pred = sgd_classifier.predict(xval_tfidf)

# Accuracy on each split.
print(
    "Accuracy score for SGDClassifier train: ",
    accuracy_score(ytrain, ytrain_pred),
)
print(
    "Accuracy score for SGDClassifier validation:",
    accuracy_score(yval, yval_pred),
)

print('\n')

# Micro-averaged F1 on each split.
print('f1-Score Train: ', f1_score(ytrain, ytrain_pred, average='micro'))
print('f1-Score validation: ', f1_score(yval, yval_pred, average='micro'))

print('\n')

# Per-label precision / recall / F1 on the validation split.
print("Classification report:")
print(
    classification_report(
        yval,
        yval_pred,
        target_names=binary_labels.columns,
    )
)

Accuracy score for SGDClassifier train:  0.8432377659891518
Accuracy score for SGDClassifier validation: 0.6390941597139451


f1-Score Train:  0.9357370671130313
f1-Score validation:  0.817109953386345


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.81      0.88      0.84      1704
             Physics       0.88      0.87      0.87      1211
         Mathematics       0.77      0.87      0.82      1089
          Statistics       0.71      0.83      0.76      1069
Quantitative Biology       0.45      0.53      0.48       116
Quantitative Finance       0.63      0.70      0.67        44

           micro avg       0.78      0.85      0.82      5233
           macro avg       0.71      0.78      0.74      5233
        weighted avg       0.79      0.85      0.82      5233
         samples avg       0.82      0.88      0.83      5233



In [59]:
# 10-fold shuffled cross-validation of a fresh one-vs-rest wrapper around `sgd`
# on the training split, scored with micro-F1.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
sgd_classifier = OneVsRestClassifier(sgd)

scores = cross_val_score(
    sgd_classifier,
    xtrain_tfidf,
    ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with twice the standard deviation as the spread.
mean_f1 = scores.mean()
spread = scores.std() * 2
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (mean_f1, spread))

[0.82435491 0.81803974 0.82226212 0.82371577 0.81834532 0.80870918
 0.80754717 0.82137268 0.82848772 0.82786885]


f1_score: 0.82 (+/- 0.01)


In [21]:
from sklearn.linear_model import SGDClassifier

# SGD variant with alpha=0.001 and the estimator's default loss,
# wrapped one-vs-rest so that one model is fitted per label.
sgd1 = SGDClassifier(
    random_state=9,
    class_weight='balanced',
    n_jobs=-1,
    alpha=0.0010,
)
sgd1_classifier = OneVsRestClassifier(sgd1)
sgd1_classifier.fit(xtrain_tfidf, ytrain)

# Predict on both splits so train and validation scores can be compared.
ytrain_pred = sgd1_classifier.predict(xtrain_tfidf)
yval_pred = sgd1_classifier.predict(xval_tfidf)

# Accuracy on each split.
print(
    "Accuracy score for SGDClassifier train: ",
    accuracy_score(ytrain, ytrain_pred),
)
print(
    "Accuracy score for SGDClassifier validation:",
    accuracy_score(yval, yval_pred),
)

print('\n')

# Micro-averaged F1 on each split.
print('f1-Score Train: ', f1_score(ytrain, ytrain_pred, average='micro'))
print('f1-Score validation: ', f1_score(yval, yval_pred, average='micro'))

print('\n')

# Per-label precision / recall / F1 on the validation split.
print("Classification report:")
print(
    classification_report(
        yval,
        yval_pred,
        target_names=binary_labels.columns,
    )
)

Accuracy score for SGDClassifier train:  0.6367050128151636
Accuracy score for SGDClassifier validation: 0.6169249106078665


f1-Score Train:  0.8282823810683373
f1-Score validation:  0.8103617686467173


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.78      0.91      0.84      1704
             Physics       0.92      0.83      0.87      1211
         Mathematics       0.77      0.87      0.82      1089
          Statistics       0.69      0.87      0.77      1069
Quantitative Biology       0.33      0.72      0.45       116
Quantitative Finance       0.48      0.82      0.61        44

           micro avg       0.76      0.87      0.81      5233
           macro avg       0.66      0.83      0.72      5233
        weighted avg       0.78      0.87      0.82      5233
         samples avg       0.81      0.89      0.82      5233



In [22]:
# 10-fold shuffled cross-validation of a fresh one-vs-rest wrapper around `sgd1`
# on the training split, scored with micro-F1.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
sgd1_classifier = OneVsRestClassifier(sgd1)

scores = cross_val_score(
    sgd1_classifier,
    xtrain_tfidf,
    ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with twice the standard deviation as the spread.
mean_f1 = scores.mean()
spread = scores.std() * 2
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (mean_f1, spread))

[0.81012091 0.80823738 0.81124143 0.81438463 0.80375084 0.79564638
 0.79117971 0.80846325 0.81745325 0.80802163]


f1_score: 0.81 (+/- 0.02)


# Final Submission

In [38]:
# Label matrix for the final model, which is trained on every labelled row.
y = binary_labels

# create TF-IDF features
# TF-IDF = Term frequency - inverse document frequency
#
# Use a fresh, unfitted vectorizer with the same hyper-parameters instead of
# re-fitting the shared `tfidf_vectorizer` in place: re-fitting it would replace
# the vocabulary that xtrain_tfidf / xval_tfidf (and the models fitted on them)
# were built with, so later calls to tfidf_vectorizer.transform would no longer
# match those models.
final_tfidf_vectorizer = TfidfVectorizer(**tfidf_vectorizer.get_params())

# Fit on the full training abstracts; the test abstracts are only transformed.
X = final_tfidf_vectorizer.fit_transform(train['ABSTRACT'])
X_test = final_tfidf_vectorizer.transform(test['ABSTRACT'])

In [77]:
# Voting Classifier

from sklearn.ensemble import VotingClassifier

# Ensemble members: the logistic regression and SGD classifiers.
# NOTE(review): both members are already OneVsRestClassifier wrappers and the
# ensemble is wrapped in another OneVsRestClassifier below -- the bare `logreg` /
# `sgd` estimators may be what was intended; confirm.
vote_est = [
    ('LR', logreg_classifier),
    ('SGD', sgd_classifier)]

# Soft voting with LR weighted 4:1 over SGD (the top-scoring pair in this
# notebook's weight grid search).
voting = VotingClassifier(estimators = vote_est,voting= 'soft',weights=[4,1])
voting_classifier=OneVsRestClassifier(voting)
# fit model on train data
voting_classifier.fit(xtrain_tfidf, ytrain)

# make predictions for the train and validation sets
ytrain_pred=voting_classifier.predict(xtrain_tfidf)
yval_pred = voting_classifier.predict(xval_tfidf)

# The labels below name the model actually evaluated (the voting ensemble); they
# previously said "SGDClassifier".
print("Accuracy score for VotingClassifier train: ",accuracy_score(ytrain, ytrain_pred))
print("Accuracy score for VotingClassifier validation:",accuracy_score(yval, yval_pred))

print('\n')

# Micro-averaged F1 on each split.
print('f1-Score Train: ',f1_score(y_true=ytrain, y_pred=ytrain_pred, average='micro'))
print('f1-Score validation: ',f1_score(y_true=yval, y_pred=yval_pred, average='micro'))

print('\n')

# Per-label precision / recall / F1 on the validation split.
print("Classification report:")
print(classification_report(yval, yval_pred, target_names=binary_labels.columns))

Accuracy score for SGDClassifier train:  0.745127257554986
Accuracy score for SGDClassifier validation: 0.6424314660309892


f1-Score Train:  0.889380037059362
f1-Score validation:  0.8217615642255817


Classification report:
                      precision    recall  f1-score   support

    Computer Science       0.81      0.89      0.85      1704
             Physics       0.89      0.87      0.88      1211
         Mathematics       0.77      0.88      0.82      1089
          Statistics       0.72      0.84      0.78      1069
Quantitative Biology       0.40      0.64      0.49       116
Quantitative Finance       0.56      0.82      0.67        44

           micro avg       0.78      0.87      0.82      5233
           macro avg       0.69      0.82      0.75      5233
        weighted avg       0.79      0.87      0.82      5233
         samples avg       0.82      0.89      0.83      5233



In [63]:
# 10-fold shuffled cross-validation of the voting ensemble on the training split,
# scored with micro-F1.
# NOTE(review): `voting` is not defined in this cell; it is whatever VotingClassifier
# an earlier-run cell last bound to that name -- confirm it carries the intended weights.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
voting_classifier = OneVsRestClassifier(voting)

scores = cross_val_score(
    voting_classifier,
    xtrain_tfidf,
    ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with twice the standard deviation as the spread.
mean_f1 = scores.mean()
spread = scores.std() * 2
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (mean_f1, spread))

[0.82393556 0.81587302 0.82736602 0.82236539 0.81371429 0.8060442
 0.79801399 0.81845035 0.82862352 0.82287567]


f1_score: 0.82 (+/- 0.02)


In [64]:
from sklearn.model_selection import cross_val_score

In [76]:
# Grid search over the soft-voting weights (w1 for LR, w2 for SGD), each in 1..4.
# Every candidate is scored by the mean micro-F1 of a 10-fold shuffled
# cross-validation on the training split.
#
# The loop uses its own names (candidate_*) so that it does not overwrite the
# notebook-level `voting`, `voting_classifier` and `score` objects; otherwise later
# cells would silently pick up the last candidate tried instead of the chosen one.
column_names=['w1','w2', 'score']

# A KFold built with shuffle=True and an integer random_state produces the same
# splits on every use, so one instance can be shared by all candidates.
kfold = KFold(shuffle=True,n_splits=10,random_state=9)

weight_rows = []
for w1 in range(1,5):
    for w2 in range(1,5):
        # Pairs with equal weights are skipped, as in the original search.
        if w1 == w2:
            continue
        candidate_voting = VotingClassifier(
            estimators=[('LR', logreg_classifier),('SGD', sgd_classifier)],
            weights=[w1,w2],
            voting='soft',
        )
        candidate_scores = cross_val_score(
            OneVsRestClassifier(candidate_voting),
            xtrain_tfidf,
            ytrain,
            cv=kfold,
            n_jobs=-1,
            scoring='f1_micro',
        )
        weight_rows.append((w1, w2, np.mean(candidate_scores)))

# Build the frame once from the collected rows (keeps w1/w2 as integers rather than
# growing the frame row by row).
score_frame = pd.DataFrame(weight_rows, columns=column_names)
score_frame.sort_values(by='score',ascending=False)

Unnamed: 0,w1,w2,score
9,4.0,1.0,0.81938
6,3.0,1.0,0.81932
3,2.0,1.0,0.818738
10,4.0,2.0,0.818738
7,3.0,2.0,0.818436
11,4.0,3.0,0.818173
8,3.0,4.0,0.817052
4,2.0,3.0,0.816836
0,1.0,2.0,0.816126
5,2.0,4.0,0.816126


In [78]:
# 10-fold shuffled cross-validation of the voting ensemble on the training split,
# scored with micro-F1.
# NOTE(review): `voting` is not defined in this cell; it is whatever VotingClassifier
# an earlier-run cell last bound to that name -- confirm it carries the intended weights.
fold = KFold(n_splits=10, shuffle=True, random_state=9)
voting_classifier = OneVsRestClassifier(voting)

scores = cross_val_score(
    voting_classifier,
    xtrain_tfidf,
    ytrain,
    scoring='f1_micro',
    cv=fold,
    n_jobs=-1,
)

# Per-fold scores, then mean with twice the standard deviation as the spread.
mean_f1 = scores.mean()
spread = scores.std() * 2
print(scores)
print('\n')
print("f1_score: %0.2f (+/- %0.2f)" % (mean_f1, spread))

[0.82280502 0.82051282 0.82613611 0.82837529 0.8157237  0.80995069
 0.79776286 0.82134781 0.82800816 0.82317774]


f1_score: 0.82 (+/- 0.02)


In [80]:
# Final model: the soft-voting ensemble (LR weighted 4:1 over SGD), one-vs-rest per label.
vote_est = [
    ('LR', logreg_classifier),
    ('SGD', sgd_classifier)]


voting = VotingClassifier(estimators = vote_est,voting= 'soft',weights=[4,1])
voting_classifier=OneVsRestClassifier(voting)
# fit model on full train data
voting_classifier.fit(X,y)

# Predict back on the same full training data. These are resubstitution scores
# (the model has seen these rows), so they are optimistic -- not a validation result.
predict_y=voting_classifier.predict(X)

# The label below previously read "Logistic Regression validation", which named
# both the wrong model and the wrong data.
print("Accuracy score for VotingClassifier full train data:",accuracy_score(y, predict_y))

print('\n')

print('f1-Score full train data: ',f1_score(y_true=y, y_pred=predict_y, average='micro'))

Accuracy score for Logistic Regression validation: 0.7332633988174709


f1-Score full train data:  0.8832336932274472


In [81]:
# Fit the final ensemble on the full training data and predict labels for the test set.
# NOTE: this repeats the fit done when the final model was evaluated; it is kept so
# this cell does not depend on that evaluation cell having been run.
voting_classifier.fit(X,y)
y_pred = voting_classifier.predict(X_test)

# Take the label names from the training targets instead of repeating them by hand,
# so the submission columns cannot drift out of step with the model's outputs.
col = ['ID'] + list(y.columns)

# Build the prediction frame on the test frame's own index so each row lines up
# with its ID even if `test` does not carry a default 0..n-1 index.
submit_test = pd.DataFrame(y_pred, columns=list(y.columns), index=test.index)
submit_test.insert(0, 'ID', test['ID'])

submit_test.to_csv('sample_submission.csv', index=False)
submit_test


Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,20973,0,0,0,1,0,0
1,20974,0,1,0,0,0,0
2,20975,1,0,0,0,0,0
3,20976,0,1,0,0,0,0
4,20977,1,0,1,0,0,0
...,...,...,...,...,...,...,...
8984,29957,1,0,0,0,0,0
8985,29958,1,0,1,0,0,0
8986,29959,0,0,0,1,1,0
8987,29960,0,0,0,1,0,0
