## Required Packages

In [2]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import warnings
warnings.simplefilter('ignore')

## Loading Dataset

In [3]:
train = pd.read_csv('../input/hacklive-3-guided-hackathon-nlp/Train.csv')
test = pd.read_csv('../input/hacklive-3-guided-hackathon-nlp/Test.csv')
ss = pd.read_csv('../input/hacklive-3-guided-hackathon-nlp/SampleSubmission.csv')
tags = pd.read_csv('../input/hacklive-3-guided-hackathon-nlp/Tags.csv')
train.head()

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics,Analysis of PDEs,Applications,Artificial Intelligence,Astrophysics of Galaxies,...,Methodology,Number Theory,Optimization and Control,Representation Theory,Robotics,Social and Information Networks,Statistics Theory,Strongly Correlated Electrons,Superconductivity,Systems and Control
0,1824,a ever-growing datasets inside observational a...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3094,we propose the framework considering optimal $...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8463,nanostructures with open shell transition meta...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2082,stars are self-gravitating fluids inside which...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8687,deep neural perception and control networks ar...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

0
0


## Text Preprocessing

In [5]:
TARGET_COLS = ['Analysis of PDEs', 'Applications',
               'Artificial Intelligence', 'Astrophysics of Galaxies',
               'Computation and Language', 'Computer Vision and Pattern Recognition',
               'Cosmology and Nongalactic Astrophysics',
               'Data Structures and Algorithms', 'Differential Geometry',
               'Earth and Planetary Astrophysics', 'Fluid Dynamics',
               'Information Theory', 'Instrumentation and Methods for Astrophysics',
               'Machine Learning', 'Materials Science', 'Methodology', 'Number Theory',
               'Optimization and Control', 'Representation Theory', 'Robotics',
               'Social and Information Networks', 'Statistics Theory',
               'Strongly Correlated Electrons', 'Superconductivity',
               'Systems and Control']

TOPIC_COLS = ['Computer Science', 'Mathematics', 'Physics', 'Statistics']

In [6]:
sw = stopwords.words('english')
np.array(sw)
def stopwords(text):
    '''a function for removing the stopword'''
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    return " ".join(text)
test['ABSTRACT'] = test['ABSTRACT'].apply(stopwords)
train['ABSTRACT'] = train['ABSTRACT'].apply(stopwords)

In [7]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,irrelevents'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    return text
train['ABSTRACT'] = train['ABSTRACT'].apply(lambda x: clean_text(x))
test['ABSTRACT'] = test['ABSTRACT'].apply(lambda x: clean_text(x))

## Data Preparation

In [8]:
vec = CountVectorizer(max_features=10000,binary=True)
_ = vec.fit(list(train['ABSTRACT']) + list(test['ABSTRACT']))

X_test = vec.transform(test['ABSTRACT'])
X_train = vec.transform(train['ABSTRACT'])
print(X_test.shape, X_test.shape)

X_test = np.hstack((X_test.toarray(), test[TOPIC_COLS]))
X_train =  np.hstack((X_train.toarray(), train[TOPIC_COLS]))

print(X_test.shape, X_test.shape)

(6002, 10000) (6002, 10000)
(6002, 10004) (6002, 10004)


In [9]:
test = csr_matrix(X_test.astype('int16'))
X = csr_matrix(X_train.astype('int16'))
y = train[TARGET_COLS]

## Model Building

In [10]:
X_train, X_test, y_train, y_test  = train_test_split(X,y,test_size=0.2,random_state = 42)

In [11]:
xgb = OneVsRestClassifier(XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1))
cb = OneVsRestClassifier(CatBoostClassifier(iterations=200,random_seed=42))


In [12]:
def get_best_thresholds(true, preds):
    '''Finding best threshold values'''
    thresholds = [i/100 for i in range(100)]
    best_thresholds = []
    for idx in range(25):
        f1_scores = [f1_score(true[:, idx], (preds[:, idx] > thresh) * 1) for thresh in thresholds]
        best_thresh = thresholds[np.argmax(f1_scores)]
        best_thresholds.append(best_thresh)
      
    return best_thresholds

In [13]:
xgb  = xgb.fit(X_train, y_train)

val_preds = xgb.predict_proba(X_test)
best_thresholds = get_best_thresholds(y_test, val_preds)

val_preds_train = xgb.predict_proba(X_train)
best_thresholds_train_XGB = get_best_thresholds(y_train, val_preds_train)

for i, thresh in enumerate(best_thresholds):
    val_preds[:, i] = (val_preds[:, i] > thresh) * 1
    
for i, thresh in enumerate(best_thresholds_train_XGB):
    val_preds_train[:, i] = (val_preds_train[:, i] > thresh) * 1    

print('test:',f1_score(y_test, val_preds, average='micro'))
print('train:',f1_score(y_train, val_preds_train, average='micro'))


test: 0.751251559958598
train: 0.925190731845154


In [14]:
cb = cb.fit(X_train, y_train)

val_preds = cb.predict_proba(X_test)
best_thresholds = get_best_thresholds(y_test, val_preds)

val_preds_train = cb.predict_proba(X_train)
best_thresholds_train_cb = get_best_thresholds(y_train, val_preds_train)

for i, thresh in enumerate(best_thresholds):
    val_preds[:, i] = (val_preds[:, i] > thresh) * 1
    
for i, thresh in enumerate(best_thresholds_train_cb):
    val_preds_train[:, i] = (val_preds_train[:, i] > thresh) * 1    

print('test:',f1_score(y_test, val_preds, average='micro'))
print('train:',f1_score(y_train, val_preds_train, average='micro'))


test: 0.742497006719609
train: 0.9325414499815


## Ensembling

In [None]:
xgb  = xgb.fit(X,y)
cb  = cb.fit(X,y)

In [None]:
pred_cat = cb.predict_proba(test)
for i, thresh in enumerate(best_thresholds_train_cb):
    pred_cat[:, i] = (pred_cat[:, i] > thresh) * 1
    
pred_xgb = xgb.predict_proba(test)
for i, thresh in enumerate(best_thresholds_train_XGB):
    pred_xgb[:, i] = (pred_xgb[:, i] > thresh) * 1

In [None]:
w1 = 0.3
w2 = 0.7
final = (w1*pred_cat) + (w2*pred_xgb)
for i, thresh in enumerate(best_thresholds_train_XGB):
    final[:, i] = (final[:, i] > thresh) * 1

## Submission

In [None]:
ss[TARGET_COLS] = final
ss.to_csv('submission.csv', index=False)