In [9]:
import pandas as pd
import scipy as sp
import scipy.sparse  # explicit: on older SciPy releases `import scipy` alone does not load `sp.sparse`

from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from skmultilearn.adapt import BRkNNaClassifier, MLkNN
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset


In [2]:
# Load the JSON file at data/augmented/merged_data.json into a DataFrame.
merged_data_path = "data/augmented/merged_data.json"
data = pd.read_json(merged_data_path)

In [3]:
# Partition the column names by prefix: names starting with 'a' are label
# candidates; every other name except "expression_likely" is a feature candidate.
# The first five matches of each group are then dropped -- presumably those are
# metadata / non-model columns; TODO confirm against the dataset schema.
cols = data.columns

a_prefixed_columns, other_columns = [], []
for column in cols:
    if column.startswith('a'):
        a_prefixed_columns.append(column)
    elif column != "expression_likely":
        other_columns.append(column)

labels = a_prefixed_columns[5:]
features = other_columns[5:]

In [4]:
# Rows tagged "train" or "validation" in the `dataset` column are pooled for
# fitting; rows tagged "test" are held out for evaluation.
is_fit_row = data["dataset"].isin(["train", "validation"])
is_test_row = data["dataset"] == "test"
train = data[is_fit_row]
test = data[is_test_row]

# Select the feature / label columns of each split and cast them to int.
X_train = train[features].astype(int)
y_train = train[labels].astype(int)
X_test = test[features].astype(int)
y_test = test[labels].astype(int)

## Binary Relevance

In [5]:
# Binary relevance multi-label classifier with a Gaussian naive Bayes base
# classifier. BinaryRelevance, GaussianNB and accuracy_score come from the
# notebook's imports cell, so this cell no longer imports anything itself
# (previously accuracy_score was imported here, after the model code, and every
# later cell silently depended on that).
classifier = BinaryRelevance(GaussianNB())

# train on the pooled train/validation rows
classifier.fit(X_train, y_train)

# predict the label matrix for the held-out test rows
predictions = classifier.predict(X_test)

# For multi-label targets accuracy_score is *subset accuracy*: a sample only
# counts as correct when its entire label vector is predicted exactly, so low
# values are expected when there are many labels.
accuracy_score(y_test, predictions)

0.012375

## Classifier chains

In [6]:
# Classifier-chain multi-label classifier with a Gaussian naive Bayes base
# classifier. ClassifierChain, GaussianNB and accuracy_score come from the
# notebook's imports cell; the redundant per-cell re-imports were removed.
classifier = ClassifierChain(GaussianNB())

# train on the pooled train/validation rows
classifier.fit(X_train, y_train)

# predict the label matrix for the held-out test rows
predictions = classifier.predict(X_test)

# Subset accuracy (exact match of the whole label vector per sample).
# Original author's remark on this result: absence of label correlation.
accuracy_score(y_test, predictions)

0.041125

## Label powerset

In [7]:
# Label Powerset multi-label classifier with a Gaussian naive Bayes base
# classifier. LabelPowerset, GaussianNB and accuracy_score come from the
# notebook's imports cell; the redundant per-cell re-imports were removed.
classifier = LabelPowerset(GaussianNB())

# train on the pooled train/validation rows
classifier.fit(X_train, y_train)

# predict the label matrix for the held-out test rows
predictions = classifier.predict(X_test)

# Subset accuracy (exact match of the whole label vector per sample).
accuracy_score(y_test, predictions)

0.046

## Adapted algorithm: MLkNN

In [10]:
# MLkNN with k=20 neighbours. MLkNN and accuracy_score come from the notebook's
# imports cell; the duplicate, unused classification_report import that used to
# sit in this cell was dropped (it is still imported in the imports cell).
classifier = MLkNN(k=20)

# The estimator is fed SciPy CSR matrices. Every matrix is built from the
# DataFrame's underlying NumPy array (`.values`) -- previously only X_train was
# converted that way while the other three were passed as raw DataFrames.
X_train_sparse = sp.sparse.csr_matrix(X_train.values)
y_train_sparse = sp.sparse.csr_matrix(y_train.values)
X_test_sparse = sp.sparse.csr_matrix(X_test.values)
y_test_sparse = sp.sparse.csr_matrix(y_test.values)

# train
classifier.fit(X_train_sparse, y_train_sparse)

# predict
predictions = classifier.predict(X_test_sparse)

# Subset accuracy (exact match of the whole label vector per sample).
accuracy_score(y_test_sparse, predictions)



0.28075

## Binary Relevance kNN

In [11]:
# Binary Relevance kNN (variant "a"). BRkNNaClassifier and accuracy_score come
# from the notebook's imports cell.
# NOTE(review): k=7500 is an unusually large neighbourhood -- presumably chosen
# empirically; confirm it does not exceed the number of training rows.
classifier = BRkNNaClassifier(k=7500)

# The estimator is fed SciPy CSR matrices, all built from the DataFrames'
# underlying NumPy arrays (`.values`) for consistency.
X_train_sparse = sp.sparse.csr_matrix(X_train.values)
y_train_sparse = sp.sparse.csr_matrix(y_train.values)
X_test_sparse = sp.sparse.csr_matrix(X_test.values)
y_test_sparse = sp.sparse.csr_matrix(y_test.values)

# train
classifier.fit(X_train_sparse, y_train_sparse)

# predict
predictions = classifier.predict(X_test_sparse)

# Subset accuracy, shown as the cell's result like in the other model cells
# (previously this was the only cell that wrapped the score in print()).
accuracy_score(y_test_sparse, predictions)



0.343125
