# André Fonseca
# Trabalho de conclusão do Data Science - Awari
# Análise derivado do LISH-MoA
## https://www.kaggle.com/c/lish-moa/overview

# 1. Import modules

In [None]:
!pip install scikit-multilearn

In [2]:
# Basic
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

# Dump
import pickle

# Pre-processing
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_stratification as stratify

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import BinaryRelevance

from skmultilearn.adapt import MLkNN

from skmultilearn.ensemble import LabelSpacePartitioningClassifier


# Optimization
from sklearn.model_selection import GridSearchCV

# Evaluation
from sklearn.metrics import hamming_loss
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [3]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

# 2. Loading dataset

In [4]:
# Loading X data
train_features = pd.read_csv("./data/train_features.csv")
test_features = pd.read_csv("./data/test_features.csv")

In [5]:
# Loading Y data
train_targets_sc = pd.read_csv("./data/train_targets_scored.csv")
train_targets_ns = pd.read_csv("./data/train_targets_nonscored.csv")

In [7]:
sample_submission = pd.read_csv("./data/sample_submission.csv")

## 2.1. Preparing dataset to Machine Learning

In [8]:
X = train_features.drop('sig_id', axis = 1)
X = pd.get_dummies(X)

In [9]:
y = train_targets_sc.drop('sig_id', axis = 1)

## 2.2. Building a validation test

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42) # stratify = y

# 3. Multi-labels models

### Problem Transformation approaches
* One-vs-Rest - Logistic Regression
* LabelPowerSet - Logistic Regression
* BinaryRelevance - Random Forest

### Algorithm Adaptation approaches
* multi-label adapted kNN
* multi-label adapted kNN + Cross-Validation

### Ensembles of Classifiers
* LabelSpacePartitioningClassifier

## 3.1. One-vs-Rest

In [None]:
moa_category, moa_logistic, moa_accuracy = y_train.columns, [], []

In [None]:
%%time
LogReg_pipeline = Pipeline([('classifier', OneVsRestClassifier(LogisticRegression(solver = 'sag'), n_jobs = -1)), ])

In [None]:
for category in moa_category:
  try:
    LogReg_pipeline.fit(X_train, y_train[category])
  except:
    print("Error!")
  
  y_pred = LogReg_pipeline.predict(X_val)
  score = accuracy_score(y_val[category], y_pred)

  moa_accuracy.append(score)
  moa_category.append(category)

In [None]:
log_one_vs_rest = pd.DataFrame({'MoA': moa_category, 'accuracy': moa_accuracy})

In [None]:
log_one_vs_rest.head(10)

In [None]:
log_one_vs_rest.to_csv("./data/log_one_vs_rest.csv", index = False)

## 3.2. LabelPowerSet

Label Powerset is a problem transformation approach to multi-label classification that transforms a multi-label problem to a multi-class problem with 1 multi-class classifier trained on all unique label combinations found in the training data.

In [None]:
power = LabelPowerset(LogisticRegression())

In [None]:
%%time
power.fit(X_train, y_train)

In [None]:
y_pred = power.predict(X_val)

In [None]:
y_prob = power.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
with open('./data/power_model.pkl','wb') as outfile:
  pickle.dump(power, outfile)

## 3.3. BinaryRelevance

In [None]:
br = BinaryRelevance(
    classifier = RandomForestClassifier(),
    require_dense = [False, True]
)

In [None]:
%%time
br.fit(X_train, y_train)

In [None]:
y_pred = br.predict(X_val)

In [None]:
y_prob = br.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
with open('./data/binary_forest_classifier_model.pkl','wb') as outfile:
  pickle.dump(br, outfile)

## 3.4. multi-label adapted kNN

In [None]:
knn = MLkNN(k = 5)

In [None]:
%%time
knn.fit(X_train.values, y_train.values)

In [None]:
y_pred = knn.predict(X_val)

In [None]:
y_prob = knn.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
with open('./data/mlknn_model.pkl','wb') as outfile:
  pickle.dump(knn, outfile)

## 3.5. multi-label adapted kNN + Cross-Validation

In [11]:
parameters = {'k': range(5, 10), 's': [0.5, 0.7, 1.0]}
knn_grid = GridSearchCV(MLkNN(), parameters, scoring = 'log_loss')

In [None]:
%%time
knn_grid.fit(X_train.values, y_train.values)

In [None]:
print(f'best parameters: {knn_grid.best_params_} best score: {knn_grid.best_score_}')

In [None]:
y_pred = knn_grid.predict(X_val)

In [None]:
y_prob = knn_grid.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
with open('./data/knn_classifier_model.pkl','wb') as outfile:
  pickle.dump(classifier, outfile)

## 3.6. LabelSpacePartitioningClassifier

In [None]:
# Load Clusterer object
clusterer = pickle.load()

In [None]:
lab_space = LabelSpacePartitioningClassifier(
    classifier = BinaryRelevance(
    classifier = RandomForestClassifier(),
    require_dense = [False, True]
    ),
    clusterer  = clusterer
)

In [None]:
%%time
lab_space.fit(X_train,y_train)

In [None]:
y_pred = lab_space.predict(X_val)

In [None]:
y_prob = lab_space.predict_proba(X_val)

In [None]:
y_prob_array = y_prob.toarray()

In [None]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [None]:
print(f'Accuracy score: {accuracy_score}')

In [None]:
logging_score = log_loss(y_val.values, y_prob_array)

In [None]:
print(f'Log-loss score: {logging_score}')

In [None]:
hamming_loss = hamming_loss(y_val.values, y_pred)

In [None]:
print(f'Hamming-loss score: {hamming_loss}')

In [None]:
with open('./data/lab_space.pkl','wb') as outfile:
  pickle.dump(lab_space, outfile)

In [None]:
# https://skml.readthedocs.io/en/latest/auto_examples/example_lp.html
# https://xang1234.github.io/multi-label/