In [1]:
import dill as pickle
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

labels_ = ['Computer Science', 'Physics', 'Mathematics','Statistics', 'Quantitative Biology', 'Quantitative Finance']
voting_model_file = '../output_files/voting_model.sav'

## reading testing file

In [3]:
raw_testing_data = pd.read_csv('../input_data/test.csv')
# y_test = raw_testing_data[labels_]

In [4]:
raw_testing_data.head()

Unnamed: 0,ID,TITLE,ABSTRACT
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...
1,20974,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...
2,20975,Case For Static AMSDU Aggregation in WLANs,Frame aggregation is a mechanism by which mu...
3,20976,The $Gaia$-ESO Survey: the inner disk intermed...,Milky Way open clusters are very diverse in ...
4,20977,Witness-Functions versus Interpretation-Functi...,Proving that a cryptographic protocol is cor...


**Preprocessing test data**

In [5]:
# Defnining function again with transform operation

def clean_transform_title_abstract(dataframe, title_vectorizer, abstarct_vectorizer):
    '''
    Takes dataframe, title TFIDF vectorizer and abstract vectorizer as input.
    Returns tfidf vectorized title and abstract columns.
    '''
    def text_clean(text):
        snowball_stemmer = SnowballStemmer("english")
        pattern = r'[^a-zA-Z0-9\s]'
        cleaned_sent = re.sub(pattern,'',text)
        word_tokens = cleaned_sent.split()
        word_tokens_stemmed = [snowball_stemmer.stem(w) for w in word_tokens if w not in stop_words]
        return ' '.join(word_tokens_stemmed)
    
    title_col = dataframe.TITLE
    cleaned_abstarct_col = dataframe.ABSTRACT.map(text_clean)
    
    title_col_tfidf = title_vectorizer.transform(title_col)
    abstract_col_tfidf = abstarct_vectorizer.transform(cleaned_abstarct_col)
    return title_col_tfidf, abstract_col_tfidf

In [6]:
title_vectorizer = pickle.load(open('../output_files/title_tfidf_vectorizer_6000f.sav','rb'))
abstarct_vectorizer = pickle.load(open('../output_files/abstract_tfidf_vectorizer_6000f.sav','rb'))

In [7]:
title_col_vec_test, abstract_col_vec_test = clean_transform_title_abstract(raw_testing_data, 
                                                                 title_vectorizer, 
                                                                 abstarct_vectorizer)
print(title_col_vec_test.shape)
print(abstract_col_vec_test.shape)

title_features_test = pd.DataFrame(title_col_vec_test.todense(), columns=title_vectorizer.vocabulary_)
abstract_features_test = pd.DataFrame(abstract_col_vec_test.todense(), columns=abstarct_vectorizer.vocabulary_)
X_test = pd.concat([title_features_test, abstract_features_test], axis=1)
X_test.shape

(8989, 1000)
(8989, 5000)


(8989, 6000)

In [8]:
X_test.head()

Unnamed: 0,effect,maps,neural,network,neural network,and,poisson,for,functions,finite,...,volunt,browser,pm25,mathcalm,memristor,ecg,lookahead,richclub,pomdp,starless
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Making predictions using voting classifier model

In [9]:
with open(voting_model_file, 'rb') as fread:
    clf_model_voting = pickle.load(fread)

In [10]:
clf_model_voting.get_params

<bound method BaseEstimator.get_params of OneVsRestClassifier(estimator=VotingClassifier(estimators=[('mnb',
                                                            OneVsRestClassifier(estimator=MultinomialNB(alpha=0.1),
                                                                                n_jobs=-1)),
                                                           ('lr',
                                                            OneVsRestClassifier(estimator=LogisticRegression(C=2),
                                                                                n_jobs=-1))],
                                               n_jobs=-1, voting='soft',
                                               weights=[2, 1]),
                    n_jobs=3)>

In [11]:
y_pred = clf_model_voting.predict(X_test)

In [12]:
y_pred[:10]

array([[0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 1, 0, 0],
       [1, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0]])

In [13]:
predictions = pd.DataFrame(y_pred, columns=labels_)

In [14]:
predictions.head(10)

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,0,0,0,1,0,0
1,0,1,0,0,0,0
2,1,0,0,0,0,0
3,0,1,0,0,0,0
4,1,0,0,0,0,0
5,0,0,0,1,0,0
6,0,0,1,1,0,0
7,1,0,0,1,0,0
8,1,0,0,0,0,0
9,0,1,0,0,0,0


### Creating submission file and saving it

In [15]:
submission_file = raw_testing_data.copy()
submission_file.loc[:,labels_] = predictions

In [16]:
submission_file.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6
0,18001,Separation of the charge density wave and supe...,In layered transition metal dichalcogenides ...,0,1,0,0,0,0
1,18002,Zero distribution for Angelesco Hermite--Padé ...,We consider the problem of zero distribution...,0,0,1,0,0,0
2,18003,Atomic-scale identification of novel planar de...,We have discovered two novel types of planar...,0,1,0,0,0,0
3,18004,Self-consistent assessment of Englert-Schwinge...,Our manuscript investigates a self-consisten...,0,1,0,0,0,0
4,18005,Target volatility option pricing in lognormal ...,We examine in this article the pricing of ta...,0,0,0,0,0,1


In [17]:
submission_file.to_csv('../output_files/predictions.csv')