In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score

This dataset contains 8835 records, 2763 distinct input keywords and 657 distinct output classes.

In [2]:
# read the dataset (read_csv splits on commas by default) and preview the first rows
data = pd.read_csv('Complete_Dataset_2_FL-csv.csv')
data.head(5)

Unnamed: 0,Keywords,Classification
0,"Classification systems,Health law,Health care,...",Life and medical sciences
1,"Information retrieval systems,Web software,Alg...",World Wide Web
2,"American culture,Social information processing...",Information systems applications;Information r...
3,"Data management,Software,Types of databases,Da...",Data management systems;Theory and algorithms ...
4,"Business terms,Temporal rates,Scientific metho...",Information systems applications


In [3]:
# number of rows loaded from the CSV
data.shape[0]

7655

In [4]:
# null values? -> one boolean per column: True if that column contains at least one missing value
data.isnull().any()

Keywords           True
Classification    False
dtype: bool

In [5]:
# how many rows have no keywords at all
data['Keywords'].isnull().sum()

35

In [6]:
# a null example: print the row labels, among the first 100 rows, whose Keywords value is missing.
# isnull() tests for the missing value directly (instead of relying on NaN being a float),
# and the positional slice also works when the frame has fewer than 100 rows.
first_keywords = data.Keywords.iloc[:100]
for i in first_keywords.index[first_keywords.isnull()]:
    # print(...) with a single argument behaves the same on Python 2 and Python 3
    print(i)

40
82


In [7]:
# show the rows around the first missing-keyword example (positions 39, 40 and 41)
data.iloc[39:42]

Unnamed: 0,Keywords,Classification
39,"Information retrieval systems,Web software,Alg...",Information retrieval
40,,Information retrieval
41,"Data management,Software,Databases,Measurement...",Data management systems;Theory and algorithms ...


In [8]:
# discard every row that has a missing value in any column, then report how many rows remain
data = data.dropna(axis=0, how='any')
data.shape[0]

7620

In [82]:
# number of distinct label combinations (each raw Classification string counts as one combination)
data.Classification.nunique()

3411

In [85]:
# number of label combinations that are attached to exactly one record
combination_counts = data.Classification.value_counts()
(combination_counts == 1).sum()

2912

In [10]:
# turn the comma-separated keywords and the semicolon-separated labels into lists of strings
X = data.Keywords.str.split(',')
y = data.Classification.str.split(';')

In [11]:
# sanity check: each entry of X should now be a list of keyword strings
X.head()

0    [Classification systems, Health law, Health ca...
1    [Information retrieval systems, Web software, ...
2    [American culture, Social information processi...
3    [Data management, Software, Types of databases...
4    [Business terms, Temporal rates, Scientific me...
Name: Keywords, dtype: object

In [12]:
# convert keywords and labels into matrices of binary values (one column per distinct keyword / label).
# The fitted binarizers are kept in variables so that a column index can be mapped back to its
# keyword / label name through `classes_`, and predicted label matrices can be decoded with
# `inverse_transform`.
keyword_binarizer = MultiLabelBinarizer()
label_binarizer = MultiLabelBinarizer()
X_binarize = keyword_binarizer.fit_transform(X)
y_binarize = label_binarizer.fit_transform(y)

In [87]:
# inspect the binarized keyword matrix (numpy abbreviates the display of large arrays)
# NOTE(review): the first column is 1 in every displayed row - possibly an empty-string or
# ubiquitous keyword produced by the split; verify before trusting this feature
X_binarize

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [13]:
# (rows, columns) of the feature and label matrices; both must have the same number of rows.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(X_binarize.shape)
print(y_binarize.shape)

(7620, 2641)
(7620, 541)


In [14]:
# hold out 25% of the records for testing; the fixed seed keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X_binarize,
    y_binarize,
    test_size=0.25,
    random_state=42,
)

In [17]:
# linear SVM, wrapped in OneVsRestClassifier so that one binary classifier is trained per label
from sklearn.svm import LinearSVC
# seed the estimator (same seed as the train/test split) so the reported score is reproducible
svm = OneVsRestClassifier(LinearSVC(random_state=42))
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
# micro-averaged F1: pools true/false positives and false negatives over all labels
f1_score(y_test, svm_pred, average='micro')

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


0.083100673345376894

These warnings mean that, on this split of the data, some of the labels we are trying to predict never actually occur in the training data (X_train and y_train). This is going to be the case for (almost) any split of the data, with any algorithm. The algorithm nevertheless still works.

In [16]:
# k-nearest neighbours with scikit-learn's default settings
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier().fit(X_train, y_train)
knn_pred = knn.predict(X_test)
f1_score(y_true=y_test, y_pred=knn_pred, average='micro')

0.083299798792756535

In [18]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
# seed the estimator (same seed as the train/test split) so the reported score is reproducible
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
# micro-averaged F1 over all labels
f1_score(y_test, dt_pred, average='micro')

0.10830769230769231

In [20]:
# random forest
from sklearn.ensemble import RandomForestClassifier
# the forest is randomised; seed it (same seed as the train/test split) so the
# reported score is reproducible
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# micro-averaged F1 over all labels
f1_score(y_test, rf_pred, average='micro')

0.055464256368118324

In [29]:
# AdaBoost does not support multilabel data, so (like the linear SVM) it is wrapped in
# OneVsRestClassifier, which trains one binary classifier per label.
# OneVsRestClassifier is already imported in the first cell.
from sklearn.ensemble import AdaBoostClassifier
# seed the estimator (same seed as the train/test split) so the reported score is reproducible
ab = OneVsRestClassifier(AdaBoostClassifier(random_state=42))
ab.fit(X_train, y_train)
ab_pred = ab.predict(X_test)
# micro-averaged F1 over all labels
f1_score(y_test, ab_pred, average='micro')

0.051962616822429905

In [32]:
# scikit-learn multilayer perceptron: a single hidden layer of 175 ReLU units trained with Adam
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(
    hidden_layer_sizes=(175,),
    activation='relu',
    solver='adam',
    max_iter=200,
    random_state=42,
)
nn.fit(X_train, y_train)
nn_pred = nn.predict(X_test)
f1_score(y_test, nn_pred, average='micro')

0.100877893056664

ToDo
- non-stratified CV, check variance
- label encoder, hashing trick
- tune neural network, try tensorflow?
- proper evaluation metric, predict_proba?
- adaboost, gradientboosting, xgboost don't support multilabel