In [1]:
!pip install scikit-multilearn



In [3]:
# Basic
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Pre-processing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import IterativeStratification

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.adapt import MLkNN
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

# Optimization
from sklearn.model_selection import GridSearchCV

# Evaluation
from sklearn.metrics import hamming_loss
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score

In [9]:
# Loading X data
train_features = pd.read_csv("./data/train_features.csv")
test_features = pd.read_csv("./data/test_features.csv")

In [10]:
# Loading Y data
train_targets_sc = pd.read_csv("./data/train_targets_scored.csv")
train_targets_ns = pd.read_csv("./data/train_targets_nonscored.csv")

In [11]:
sample_submission = pd.read_csv("./data/sample_submission.csv")

In [12]:
X = train_features.drop('sig_id', axis = 1)
X = pd.get_dummies(X)

In [13]:
y = train_targets_sc.drop('sig_id', axis = 1)

In [14]:
stratifier = IterativeStratification(n_splits=2, order=2, sample_distribution_per_fold=[0.25, 0.75])
train_indexes, test_indexes = next(stratifier.split(X, y))

X_train, y_train = X.loc[train_indexes], y.loc[train_indexes]
X_val, y_val = X.loc[test_indexes], y.loc[test_indexes]

In [36]:
knn = MLkNN(k = 5)

In [37]:
%%time
knn.fit(X_train.values, y_train.values)



CPU times: user 7min 43s, sys: 559 ms, total: 7min 44s
Wall time: 7min 44s


MLkNN(k=5)

In [38]:
y_pred = knn.predict(X_val)

In [40]:
y_prob = knn.predict_proba(X_val)

In [41]:
y_prob_array = y_prob.toarray()

In [42]:
logging_score = log_loss(y_val.values, y_prob_array)

In [43]:
print(f'Log-loss score: {logging_score}')

Log-loss score: 3.518411840190982


In [44]:
with open('./data/models/mlknn_model.pkl','wb') as outfile:
  pickle.dump(knn, outfile)

In [48]:
clusterer = pickle.load(open('./data/clusterer.pkl', 'rb'))

In [49]:
lab_space = LabelSpacePartitioningClassifier(
    classifier = BinaryRelevance(
    classifier = RandomForestClassifier(),
    require_dense = [False, True]
    ),
    clusterer  = clusterer
)

In [None]:
%%time
lab_space.fit(X_train,y_train)

In [None]:
y_pred = lab_space.predict(X_val)

In [None]:
with open('./data/models/lab_space.pkl','wb') as outfile:
  pickle.dump(lab_space, outfile)