# André Fonseca
# Trabalho de conclusão do Data Science - Awari
# Análise derivado do LISH-MOA
## https://www.kaggle.com/c/lish-moa/overview

# 1. Import modules

In [1]:
!pip install scikit-multilearn

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |████████████████████████████████| 92kB 603kB/s eta 0:00:01
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [2]:
# Basic
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

# Dump
import pickle

# Pre-processing
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_stratification as stratify

# Models
from sklearn.neighbors import KNeighborsClassifier
from skmultilearn.adapt import MLkNN
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.metrics import hamming_loss
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 2. Loading dataset

In [3]:
# Loading X data
train_features = pd.read_csv("./data/train_features.csv")
test_features = pd.read_csv("./data/test_features.csv")

In [4]:
# Loading Y data
train_targets_sc = pd.read_csv("./data/train_targets_scored.csv")
train_targets_ns = pd.read_csv("./data/train_targets_nonscored.csv")

In [5]:
sample_submission = pd.read_csv("./data/sample_submission.csv")

## 2.1. Preparing dataset to Machine Learning

In [6]:
X = train_features.drop('sig_id', axis = 1)
X = pd.get_dummies(X)

In [7]:
y = train_targets_sc.drop('sig_id', axis = 1)

## 2.2. Building a validation test

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42) # stratify = y

# 3. Baseline and advance models

## 3.1. Logistic Regression - One-vs-Rest

In [16]:
moa_category, moa_logistic, moa_accuracy = y_train.columns, [], []

In [18]:
LogReg_pipeline = Pipeline([('classifier', OneVsRestClassifier(LogisticRegression(solver = 'sag'), n_jobs = -1)), ])

In [None]:
for category in moa_category:
  try:
    LogReg_pipeline.fit(X_train, y_train[category])
  except:
    print("Error!")
  
  y_pred = LogReg_pipeline.predict(X_val)
  score = accuracy_score(y_val[category], y_pred)

  moa_accuracy.append(score)
  moa_category.append(category)

In [None]:
log_one_vs_rest = pd.DataFrame({'MoA': moa_category, 'accuracy': moa_accuracy})

In [None]:
log_one_vs_rest.head(10)

In [None]:
log_one_vs_rest.to_csv("./data/log_one_vs_rest.csv", index = False)

## 3.2. LabelPowerSet

Label Powerset is a problem transformation approach to multi-label classification that transforms a multi-label problem to a multi-class problem with 1 multi-class classifier trained on all unique label combinations found in the training data.

In [9]:
power = LabelPowerset(LogisticRegression())

In [10]:
%%time
power.fit(X_train, y_train)

CPU times: user 4min 42s, sys: 2min 32s, total: 7min 14s
Wall time: 1min 11s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LabelPowerset(classifier=LogisticRegression(), require_dense=[True, True])

In [11]:
y_pred = power.predict(X_val)

In [12]:
y_prob = power.predict_proba(X_val)

In [13]:
y_prob_array = y_prob.toarray()

In [14]:
accuracy_score = accuracy_score(y_val.values, y_pred)

In [15]:
print(f'Accuracy score: {accuracy_score}')

Accuracy score: 0.4170305676855895


In [16]:
logging_score = log_loss(y_val.values, y_prob_array)

In [17]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 2.8554488823155726


In [18]:
with open('./data/power_model.pkl','wb') as outfile:
  pickle.dump(power, outfile)

## 3.3. Adapted Algorithm

In [19]:
knn = MLkNN(k = 50)

In [20]:
%%time
knn.fit(X_train.values, y_train.values)



CPU times: user 9min 6s, sys: 697 ms, total: 9min 7s
Wall time: 9min 8s


MLkNN(k=50)

In [21]:
y_pred = knn.predict(X_val)

In [22]:
y_prob = knn.predict_proba(X_val)

In [23]:
y_prob_array = y_prob.toarray()

In [24]:
logging_score = log_loss(y_val.values, y_prob_array)

In [25]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 3.8471827277345967


In [34]:
accuracy_score = accuracy_score(y_val.values, y_pred)

TypeError: 'numpy.float64' object is not callable

In [35]:
print(f'Accuracy score: {accuracy_score}')

Accuracy score: 0.4170305676855895


In [36]:
with open('./data/mlknn_model.pkl','wb') as outfile:
  pickle.dump(knn, outfile)

## 3.5. Random Forest

In [38]:
rfc = RandomForestClassifier()

In [39]:
%%time
rfc.fit(X_train, y_train)

CPU times: user 5h 45min 35s, sys: 6.84 s, total: 5h 45min 42s
Wall time: 5h 45min 50s


RandomForestClassifier()

In [40]:
y_pred = rfc.predict(X_val)

In [41]:
y_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [42]:
accuracy_score = accuracy_score(y_val.values, y_pred)

TypeError: 'numpy.float64' object is not callable

In [43]:
with open('./data/rfc_model.pkl','wb') as outfile:
  pickle.dump(rfc, outfile)

## 3.4

In [None]:
# https://skml.readthedocs.io/en/latest/auto_examples/example_lp.html
# https://xang1234.github.io/multi-label/