# André Fonseca
# Trabalho de conclusão do Data Science - Awari
# Análise derivada do LISH-MoA
## https://www.kaggle.com/c/lish-moa/overview

# 1. Import modules

In [1]:
# Install scikit-multilearn, which provides the `skmultilearn` package imported below.
!pip install scikit-multilearn

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |███▊                            | 10kB 15.9MB/s eta 0:00:01[K     |███████▍                        | 20kB 15.9MB/s eta 0:00:01[K     |███████████                     | 30kB 9.9MB/s eta 0:00:01[K     |██████████████▊                 | 40kB 8.3MB/s eta 0:00:01[K     |██████████████████▍             | 51kB 4.5MB/s eta 0:00:01[K     |██████████████████████          | 61kB 4.8MB/s eta 0:00:01[K     |█████████████████████████▊      | 71kB 5.1MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81kB 5.4MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 3.9MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [15]:
# Basic
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Pre-processing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import IterativeStratification

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.adapt import MLkNN
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

# Optimization
from sklearn.model_selection import GridSearchCV

# Evaluation
from sklearn.metrics import hamming_loss
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score

#### Only if you are using Google Colab

In [3]:
# Mount Google Drive at /content/drive so files stored there can be read by the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Switch to the project folder; the relative ./data/... paths used later resolve against it.
%cd /content/drive/My Drive/Awari

/content/drive/My Drive/Awari


# 2. Loading dataset

In [5]:
# Feature tables (X): read the train and test CSV files in one pass.
train_features, test_features = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train_features.csv", "./data/test_features.csv")
)

In [6]:
# Target tables (Y): scored and non-scored label files.
train_targets_sc, train_targets_ns = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train_targets_scored.csv", "./data/train_targets_nonscored.csv")
)

In [7]:
# Sample submission file (not referenced again in the cells shown here).
sample_submission = pd.read_csv("./data/sample_submission.csv")

## 2.1. Preparing dataset to Machine Learning

In [8]:
# Feature matrix: remove the 'sig_id' identifier column, then let
# get_dummies expand the remaining non-numeric columns into indicator columns.
X = pd.get_dummies(train_features.drop(columns='sig_id'))

In [9]:
# Label matrix: every scored target column except the 'sig_id' identifier.
y = train_targets_sc.drop(columns='sig_id')

## 2.2. Building a validation set

In [10]:
# Multi-label aware split into a training part and a validation part.
# Only the first (train, test) pair produced by the stratifier is used.
stratifier = IterativeStratification(n_splits = 2, order = 2, sample_distribution_per_fold = [0.25, 0.75])
train_indexes, test_indexes = next(stratifier.split(X, y))

# split() yields positional row numbers, so rows are selected with .iloc.
# (.loc would only pick the same rows while X and y keep a default RangeIndex.)
X_train, y_train = X.iloc[train_indexes], y.iloc[train_indexes]
X_val, y_val = X.iloc[test_indexes], y.iloc[test_indexes]

In [11]:
# Sanity check: (rows, columns) of the training split.
X_train.shape

(17860, 877)

In [12]:
# Sanity check: (rows, columns) of the validation split.
X_val.shape

(5954, 877)

## 2.3. Loading pre-calculated models

In [56]:
power = pickle.load(open('./data/models/power_model.pkl', 'rb'))

In [None]:
br = pickle.load(open('./data/models/binary_forest_model.pkl', 'rb'))

In [None]:
knn = pickle.load(open('./data/models/mlknn_model.pkl', 'rb'))

In [None]:
knn_grid = pickle.load(open('./data/models/knn_cv_model.pkl', 'rb'))

In [None]:
lab_space = pickle.load(open('./data/models/lab_space.pkl', 'rb'))

In [None]:
lab_grid = pickle.load(open('./data/models/lab_cv_model.pkl', 'rb'))

# 3. Multi-labels models

### Problem Transformation approaches
* One-vs-Rest - Logistic Regression
* LabelPowerSet - Logistic Regression
* BinaryRelevance - Random Forest

### Algorithm Adaptation approaches
* multi-label adapted kNN
* multi-label adapted kNN + Cross-Validation

### Ensembles of Classifiers
* LabelSpacePartitioningClassifier
* LabelSpacePartitioningClassifier + Cross-Validation

## 3.1. One-vs-Rest

In [None]:
# Label names to iterate over, plus two parallel lists filled by the
# per-label loop (label name and its validation accuracy).
moa_category = y_train.columns
moa_names = []
moa_accuracy = []

In [None]:
# Single-step pipeline: one-vs-rest wrapper around a SAG-solver logistic
# regression, using all available cores (n_jobs=-1).
log_pipeline = Pipeline(
    steps=[
        (
            'classifier',
            OneVsRestClassifier(
                estimator=LogisticRegression(solver='sag'),
                n_jobs=-1,
            ),
        ),
    ]
)

In [None]:
%%time

for category in moa_category:
  try:
    log_pipeline.fit(X_train, y_train[category])
  except:
    print("Error!")
  
  y_pred = log_pipeline.predict(X_val)
  score = accuracy_score(y_val[category], y_pred)

  moa_accuracy.append(score)
  moa_names.append(category)

CPU times: user 1min 12s, sys: 1min 20s, total: 2min 33s
Wall time: 40min 10s


In [None]:
# Tabulate the per-label results: one row per MoA with its validation accuracy.
log_one_vs_rest = pd.DataFrame(
    data={'MoA': moa_names, 'accuracy': moa_accuracy},
)

In [None]:
# Preview the first ten per-label accuracy rows.
log_one_vs_rest.head(10)

Unnamed: 0,MoA,accuracy
0,5-alpha_reductase_inhibitor,0.999328
1,11-beta-hsd1_inhibitor,0.999328
2,acat_inhibitor,0.998992
3,acetylcholine_receptor_agonist,0.992106
4,acetylcholine_receptor_antagonist,0.987235
5,acetylcholinesterase_inhibitor,0.996977
6,adenosine_receptor_agonist,0.997649
7,adenosine_receptor_antagonist,0.995969
8,adenylyl_cyclase_activator,0.999496
9,adrenergic_receptor_agonist,0.988579


In [None]:
# Save the per-label accuracies as CSV, without the row index.
log_one_vs_rest.to_csv("./data/models/log_one_vs_rest.csv", index = False)

## 3.2. LabelPowerSet

In [53]:
power = LabelPowerset(LogisticRegression(), 
                      require_dense = [False, True])

In [None]:
%%time
power.fit(X_train, y_train)

In [57]:
y_pred = power.predict(X_val)

In [None]:
y_prob = power.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
f1_score = f1_score(y_val.values, y_pred, average = 'micro')

In [None]:
print(f'F1-score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 2.3457120387845443


In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
# Persist the fitted Label Powerset model; section 2.3 reloads this file.
with open('./data/models/power_model.pkl', mode='wb') as model_file:
  pickle.dump(power, model_file)

## 3.3. BinaryRelevance

In [None]:
br = BinaryRelevance(
    classifier = RandomForestClassifier(),
    require_dense = [False, True]
)

In [None]:
%%time
br.fit(X_train, y_train)

CPU times: user 3h 17min 49s, sys: 2.84 s, total: 3h 17min 51s
Wall time: 3h 18min 59s


BinaryRelevance(classifier=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  max_samples=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100, n_jobs=None,
                                                  oo

In [None]:
y_pred = br.predict(X_val)

In [None]:
y_prob = br.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
f1_score = f1_score(y_val.values, y_pred, average = 'micro')

In [None]:
print(f'F1-score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 7.001000542703964


In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
# Persist the fitted Binary Relevance model; section 2.3 reloads this file.
with open('./data/models/binary_forest_model.pkl', mode='wb') as model_file:
  pickle.dump(br, model_file)

## 3.4. multi-label adapted kNN

In [None]:
knn = MLkNN(k = 5)

In [None]:
%%time
knn.fit(X_train.values, y_train.values)

CPU times: user 9min 35s, sys: 157 ms, total: 9min 35s
Wall time: 9min 35s


MLkNN(ignore_first_neighbours=0, k=5, s=1.0)

In [None]:
y_pred = knn.predict(X_val)

In [None]:
y_prob = knn.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
f1_score = f1_score(y_val.values, y_pred, average = 'micro')

In [None]:
print(f'F1-score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 8.653149769241583


In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
# Persist the fitted MLkNN model; section 2.3 reloads this file.
with open('./data/models/mlknn_model.pkl', mode='wb') as model_file:
  pickle.dump(knn, model_file)

## 3.5. multi-label adapted kNN + Cross-Validation

In [None]:
parameters = {'k': range(8, 10), 's': [0.5, 1.0]}
knn_grid = GridSearchCV(MLkNN(), parameters, scoring = 'f1_micro')

In [None]:
%%time
knn_grid.fit(X_train, y_train.values)

In [None]:
print(f'Best parameters: {knn_grid.best_params_}.\nBest score: {knn_grid.best_score_}')

Best parameters: {'k': 9, 's': 1.0}.
Best score: 0.28548788186446944


In [None]:
y_pred = knn_grid.predict(X_val)

In [None]:
y_prob = knn_grid.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
f1_score = f1_score(y_val.values, y_pred, average = 'micro')

In [None]:
print(f'F1-score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 3.0512107029335276


In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
# Persist the fitted grid-search object; section 2.3 reloads this file.
with open('./data/models/knn_cv_model.pkl', mode='wb') as model_file:
  pickle.dump(knn_grid, model_file)

## 3.6. LabelSpacePartitioningClassifier

In [13]:
# Load Clusterer object
# Load the pre-built label-space clusterer; the `with` block closes the file afterwards.
# NOTE(review): unpickling can execute arbitrary code - only load files you produced yourself.
with open('./data/clusterer.pkl', 'rb') as infile:
  clusterer = pickle.load(infile)

In [None]:
lab_space = LabelSpacePartitioningClassifier(
    classifier = BinaryRelevance(
    classifier = RandomForestClassifier(),
    require_dense = [False, True]
    ),
    clusterer  = clusterer
)

In [None]:
%%time
lab_space.fit(X_train,y_train)

CPU times: user 3h 13min 46s, sys: 2.83 s, total: 3h 13min 49s
Wall time: 3h 13min 57s


LabelSpacePartitioningClassifier(classifier=BinaryRelevance(classifier=RandomForestClassifier(bootstrap=True,
                                                                                              ccp_alpha=0.0,
                                                                                              class_weight=None,
                                                                                              criterion='gini',
                                                                                              max_depth=None,
                                                                                              max_features='auto',
                                                                                              max_leaf_nodes=None,
                                                                                              max_samples=None,
                                                                                              min_impuri

In [None]:
y_pred = lab_space.predict(X_val)

In [None]:
y_prob = lab_space.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
f1_score = f1_score(y_val.values, y_pred, average = 'micro')

In [None]:
print(f'F1-score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 10.375247354136405


In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
# Persist the fitted label-space partitioning model; section 2.3 reloads this file.
with open('./data/models/lab_space.pkl', mode='wb') as model_file:
  pickle.dump(lab_space, model_file)

## 3.7. LabelSpacePartitioningClassifier + Cross-Validation

In [None]:
parameters = {
    'classifier': [BinaryRelevance()],
    'classifier__classifier': [RandomForestClassifier()],
    'classifier__classifier__n_estimators': [10, 20, 50],
    'clusterer' : [
        clusterer
    ]
}

In [None]:
lab_grid = GridSearchCV(LabelSpacePartitioningClassifier(), parameters, scoring = 'f1_micro')

In [None]:
%%time
lab_grid.fit(X_train, y_train)

In [None]:
print(f'Best parameters: {lab_grid.best_params_}.\nBest score: {lab_grid.best_score_}')

In [None]:
y_pred = lab_grid.predict(X_val)

In [None]:
y_prob = lab_grid.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
f1_score = f1_score(y_val.values, y_pred, average = 'micro')

In [None]:
print(f'F1-score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 8.653149769241583


In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
# Persist the fitted grid-search object; section 2.3 reloads this file.
with open('./data/models/lab_cv_model.pkl', mode='wb') as model_file:
  pickle.dump(lab_grid, model_file)

## 3.8. LabelSpacePartitioningClassifier + GaussianNB

In [59]:
lab_space_nb = LabelSpacePartitioningClassifier(
    classifier = LabelPowerset(classifier = GaussianNB()),
    clusterer = clusterer
)

In [60]:
%%time
lab_space_nb.fit(X_train, y_train)

CPU times: user 39.7 s, sys: 212 ms, total: 39.9 s
Wall time: 40 s


LabelSpacePartitioningClassifier(classifier=LabelPowerset(classifier=GaussianNB(priors=None,
                                                                                var_smoothing=1e-09),
                                                          require_dense=[True,
                                                                         True]),
                                 clusterer=<skmultilearn.cluster.networkx.NetworkXLabelGraphClusterer object at 0x7f53d211d0b8>,
                                 require_dense=[False, False])

In [65]:
# Hard label predictions for the validation split (not used by the later cells shown here).
y_pred = lab_space_nb.predict(X_val)

In [61]:
# Per-label probabilities (sparse matrix).
y_prob = lab_space_nb.predict_proba(X_val)

In [62]:
# Densify so the probabilities can be passed to log_loss.
y_prob_array = y_prob.toarray()

In [63]:
# NOTE(review): log_loss receives a multilabel indicator matrix here - confirm
# this matches the competition's column-wise mean log loss.
logging_score = log_loss(y_val.values, y_prob_array)

In [64]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 10.031977781736668


## 3.9. LabelSpacePartitioningClassifier + Logistic Regression

In [69]:
lab_space_lr = LabelSpacePartitioningClassifier(
    classifier = LabelPowerset(classifier = LogisticRegression(max_iter=200), 
                      require_dense = [False, True]),
    clusterer = clusterer
)

In [70]:
%%time
lab_space_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

CPU times: user 48min 26s, sys: 3min 3s, total: 51min 30s
Wall time: 47min 54s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LabelSpacePartitioningClassifier(classifier=LabelPowerset(classifier=LogisticRegression(C=1.0,
                                                                                        class_weight=None,
                                                                                        dual=False,
                                                                                        fit_intercept=True,
                                                                                        intercept_scaling=1,
                                                                                        l1_ratio=None,
                                                                                        max_iter=200,
                                                                                        multi_class='auto',
                                                                                        n_jobs=None,
                                                                   

In [71]:
# Hard label predictions for the validation split (not used by the later cells shown here).
y_pred = lab_space_lr.predict(X_val)

In [72]:
# Per-label probabilities (sparse matrix).
y_prob = lab_space_lr.predict_proba(X_val)

In [73]:
# Densify so the probabilities can be passed to log_loss.
y_prob_array = y_prob.toarray()

In [74]:
# NOTE(review): log_loss receives a multilabel indicator matrix here - confirm
# this matches the competition's column-wise mean log loss.
logging_score = log_loss(y_val.values, y_prob_array)

In [75]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 4.485144023153525


In [76]:
# Persist the fitted label-space + logistic regression model to disk.
with open('./data/models/lab_space_lr.pkl', mode='wb') as model_file:
  pickle.dump(lab_space_lr, model_file)

# 4. References

In [None]:
# https://skml.readthedocs.io/en/latest/auto_examples/example_lp.html
# https://xang1234.github.io/multi-label/
# https://towardsdatascience.com/journey-to-the-center-of-multi-label-classification-384c40229bff
# http://scikit.ml/modelselection.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.multilabel_confusion_matrix.html