In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Using TensorFlow backend.


Let's load the datasets

In [2]:
# Read the three competition files: labelled training abstracts, unlabelled
# test abstracts, and the submission template.
train, test, sample_submission = map(pd.read_csv, (
    '../input/independence-data-av/train_new.csv',
    '../input/independence-data-av/test.csv',
    '../input/independence-data-av/sample_submission.csv',
))

A quick look at the data

In [3]:
train.head()

Unnamed: 0,ABSTRACT,stream
0,Predictive models allow subject-specific inf...,100000
1,Rotation invariance and translation invarian...,100000
2,We introduce and develop the notion of spher...,1000
3,The stochastic Landau--Lifshitz--Gilbert (LL...,1000
4,Fourier-transform infra-red (FTIR) spectra o...,100100


In [4]:
test.head()

Unnamed: 0,ID,TITLE,ABSTRACT
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...
1,20974,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...
2,20975,Case For Static AMSDU Aggregation in WLANs,Frame aggregation is a mechanism by which mu...
3,20976,The $Gaia$-ESO Survey: the inner disk intermed...,Milky Way open clusters are very diverse in ...
4,20977,Witness-Functions versus Interpretation-Functi...,Proving that a cryptographic protocol is cor...


In [5]:
sample_submission.head()

Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,20973,0,0,0,0,0,0
1,20974,0,0,0,0,0,0
2,20975,0,0,0,0,0,0
3,20976,0,0,0,0,0,0
4,20977,0,0,0,0,0,0


In [6]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: Either a 1-D array-like of integer class indices (one per
        sample) or a 2-D one-hot/indicator array of shape
        (n_samples, n_classes). Class indices refer to columns of `predicted`.
    :param predicted: 2-D array-like with one row per sample and one
        probability per class.
    :param eps: Probabilities are clipped to [eps, 1 - eps] before taking the
        logarithm so that a probability of exactly 0 or 1 stays finite.
    :return: Mean negative log-likelihood over the samples (float).
    """
    # Accept plain lists as well as numpy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)

    # Convert 'actual' to a binary indicator matrix if it holds class indices.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    n_rows = actual.shape[0]
    total = np.sum(actual * np.log(clipped))
    return -1.0 / n_rows * total

We use the LabelEncoder from scikit-learn to convert the `stream` codes (each one encodes a combination of subject flags) into consecutive integer class ids: 0, 1, 2, …

In [7]:
# Learn the mapping from each distinct `stream` code to an integer class id,
# then encode the training targets with it.
stream_codes = train.stream.values
lbl_enc = preprocessing.LabelEncoder().fit(stream_codes)
y = lbl_enc.transform(stream_codes)

In [8]:
y

array([13, 13,  6, ..., 13,  7,  7])

In [9]:
max(y),min(y)

(23, 0)

Before going further it is important that we split the data into training and validation sets. We can do it using `train_test_split` from the `model_selection` module of scikit-learn.

In [14]:
# Hold out 10% of the abstracts for validation; the fixed seed keeps the
# split identical across runs.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.ABSTRACT.values,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [15]:
# Show the size of the training and validation splits, one per line.
for split in (xtrain, xvalid):
    print(split.shape)

(18874,)
(2098,)


In [17]:
p=lbl_enc.inverse_transform(y)
p

array([100000, 100000,   1000, ..., 100000,   1100,   1100])

## Building Basic Models

Let's start building our very first model. 

Our very first model is a simple TF-IDF (Term Frequency - Inverse Document Frequency) representation, reduced with truncated SVD and followed by a Support Vector Machine (SVM) classifier.

In [18]:
# Always start with these features. They work (almost) every time!
# The boolean options are passed as real booleans: recent scikit-learn
# releases validate parameter types and reject the integer 1.
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')

# The vocabulary and IDF weights are fitted on the training AND validation
# abstracts (text only, no labels). The test set is not part of this fit.
# NOTE(review): the validation text therefore influences the features, so the
# validation scores below may be slightly optimistic.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

In [20]:
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
# The default solver is randomized, so the seed is fixed to make the reduced
# features (and every score computed from them) reproducible.
svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. The scaled copies get new names so the
# unscaled SVD features stay available for reuse.
# The scaler is fitted on the training split only.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [30]:
len(xtrain_svd_scl[0])

120

Now it's time to apply SVM. After running the following cell, feel free to go for a walk or talk to your girlfriend/boyfriend. :P

In [21]:
# Fitting a simple SVM
# probability=True because log loss needs class probabilities; random_state
# fixes the internal shuffling used to calibrate those probabilities.
clf = SVC(C=1.0, probability=True, random_state=42)
clf.fit(xtrain_svd_scl, ytrain)
seen_class_proba = clf.predict_proba(xvalid_svd_scl)

# predict_proba has one column per class in clf.classes_, i.e. only the
# classes present in ytrain. Spread them into a matrix with one column per
# encoded class so that column j always means class j, which is what
# multiclass_logloss assumes. Classes never seen in training keep probability 0.
predictions = np.zeros((seen_class_proba.shape[0], len(lbl_enc.classes_)))
predictions[:, clf.classes_] = seen_class_proba

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.941 


In [22]:
# Micro-averaged F1 of the hard class predictions on the validation split.
# Uses the `metrics` module imported in the first cell, so no import is
# needed in the middle of the notebook.
y_pred = clf.predict(xvalid_svd_scl)
metrics.f1_score(yvalid, y_pred, average='micro')

0.6959008579599618

In [24]:
test_id=test['ID']

In [26]:
test.head(2)

Unnamed: 0,ID,TITLE,ABSTRACT
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...
1,20974,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...


In [27]:
# Keep only the columns used for prediction. Rebinding the result (no inplace)
# together with errors='ignore' makes this cell safe to re-run: a second run
# is a no-op instead of raising KeyError on the already-removed columns.
test = test.drop(columns=['ID', 'TITLE'], errors='ignore')

In [31]:
# Push the test abstracts through the already-fitted pipeline, in the same
# order as the training data: TF-IDF -> SVD -> scaling.
test_tfv = tfv.transform(test.ABSTRACT.values)
test_svd = svd.transform(test_tfv)
test_svd_scl = scl.transform(test_svd)

In [33]:
len(test_svd_scl[0])

120

In [34]:
pred=clf.predict(test_svd_scl)

In [35]:
pred

array([ 3,  9, 13, ..., 13,  3,  6])

In [36]:
pred_inverse=lbl_enc.inverse_transform(pred)

In [38]:
pred_inverse

array([   100,  10000, 100000, ..., 100000,    100,   1000])

In [40]:
pred2=pred_inverse.copy()

In [41]:
pred2

array([   100,  10000, 100000, ..., 100000,    100,   1000])

In [97]:
label=[]

In [98]:
# Turn each predicted stream code back into a 6-character string of flags.
# The codes are stored as integers, so leading zeros were lost (e.g. 100 stands
# for '000100'); zfill restores them.
# The list is rebuilt from scratch, so re-running this cell cannot append
# duplicates, and no value is ever dropped: `label` always has exactly one
# entry per prediction, in the same order as `pred2`.
label = [str(code).zfill(6) for code in pred2]

In [101]:
pred2[:3]

array([   100,  10000, 100000])

In [102]:
label[:3]

['000100', '010000', '100000']

In [103]:
sub1=sample_submission.copy()

In [104]:
sub1.head(2)

Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,20973,0,0,0,0,0,0
1,20974,0,0,0,0,0,0


In [105]:
len(label),len(sub1)

(8989, 8989)

In [83]:
# Expand every 6-character label into one 0/1 flag per subject column;
# character k of a label belongs to the k-th column listed here.
subject_columns = ['Computer Science', 'Physics', 'Mathematics',
                   'Statistics', 'Quantitative Biology', 'Quantitative Finance']

# A length mismatch would pair labels with the wrong IDs, so fail loudly.
if len(label) != len(sub1):
    raise ValueError(
        "label has %d entries but the submission has %d rows"
        % (len(label), len(sub1))
    )

subject_flags = np.array(
    [[int(flag) for flag in code[:len(subject_columns)]] for code in label],
    dtype=int,
)

# Assign all columns in one step on the frame itself. The previous
# `sub1[col].iloc[i] = value` form is chained assignment, which writes to a
# temporary object and does nothing under pandas Copy-on-Write.
sub1[subject_columns] = subject_flags

In [86]:
sub1['Computer Science'].value_counts()

0    8989
Name: Computer Science, dtype: int64

In [84]:
sub1

Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,20973,0,0,0,1,0,0
1,20974,0,1,0,0,0,0
2,20975,0,1,0,0,0,0
3,20976,0,1,0,0,0,0
4,20977,0,1,0,0,0,0
5,20978,0,0,1,1,0,0
6,20979,0,0,1,1,0,0
7,20980,0,0,1,1,0,0
8,20981,0,0,1,1,0,0
9,20982,0,1,0,0,0,0


In [None]:
sub1.to_csv('sub1.csv',index=False)