In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the training table; X holds the image ids, y the per-image label lists.
df = pd.read_csv('/media/1t/protein/data/train.csv')
X = df['Id'].values
y = df['Target'].str.split().values  # each entry is a list of label strings

# Multi-hot encode the label lists for the multilabel stratified splitter.
binarizer = MultiLabelBinarizer()
binarizer.fit(y)
y_bin = binarizer.transform(y).astype(np.float32)

# A single stratified shuffle split with test_size=0.2: the larger part feeds
# the folds, the smaller part is set aside as the holdout.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=4242)
train_index, test_index = next(iter(msss.split(X, y_bin)))
X_folds, y_folds = X[train_index], y[train_index]
X_holdout, y_holdout = X[test_index], y[test_index]

In [3]:
# Label frequencies across the fold (non-holdout) portion, most common first.
Counter(label for labels in y_folds for label in labels).most_common()

[('0', 10308),
 ('25', 6582),
 ('21', 3022),
 ('2', 2897),
 ('23', 2372),
 ('7', 2258),
 ('5', 2010),
 ('4', 1486),
 ('3', 1249),
 ('19', 1186),
 ('1', 1003),
 ('11', 874),
 ('14', 853),
 ('6', 806),
 ('18', 722),
 ('22', 642),
 ('12', 550),
 ('13', 430),
 ('16', 424),
 ('26', 262),
 ('24', 258),
 ('17', 168),
 ('20', 138),
 ('8', 42),
 ('9', 36),
 ('10', 22),
 ('15', 17),
 ('27', 9)]

In [4]:
# Label frequencies across the holdout portion, most common first.
Counter(label for labels in y_holdout for label in labels).most_common()

[('0', 2577),
 ('25', 1646),
 ('21', 755),
 ('2', 724),
 ('23', 593),
 ('7', 564),
 ('5', 503),
 ('4', 372),
 ('3', 312),
 ('19', 296),
 ('1', 251),
 ('11', 219),
 ('14', 213),
 ('6', 202),
 ('18', 180),
 ('22', 160),
 ('12', 138),
 ('13', 107),
 ('16', 106),
 ('26', 66),
 ('24', 64),
 ('17', 42),
 ('20', 34),
 ('8', 11),
 ('9', 9),
 ('10', 6),
 ('15', 4),
 ('27', 2)]

In [5]:
# Persist the holdout ids together with their labels, re-joined into a
# space-separated "Target" string.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists first (no-op when it is already there).
os.makedirs('../folds', exist_ok=True)
pd.DataFrame({
    'Id': X_holdout,
    'Target': pd.Series(y_holdout).str.join(' '),
}).to_csv('../folds/holdout.csv', index=False)

In [6]:
# Extend the fold pool with the external HPA annotations and write five
# stratified train/val splits to ../folds/fold{k}_train.csv and
# ../folds/fold{k}_val.csv (k = 1..5).
external = pd.read_csv('/media/1t/protein/data/HPAv18RBGY_wodpl.csv')

# Keep the enlarged pool under its own names rather than reassigning
# X_folds / y_folds: reassigning would append the external rows again on every
# re-run of this cell and make y_folds disagree with the label counts shown
# for it earlier in the notebook.
X_folds_ext = np.concatenate([X_folds, external.Id.values])
y_folds_ext = np.concatenate([y_folds, external.Target.str.split().values])

# Encoded with the binarizer that was fitted earlier on the train.csv labels.
y_folds_bin = binarizer.transform(y_folds_ext).astype(np.float32)

# NOTE(review): a shuffle split draws every split independently, so the five
# validation sets are not guaranteed to be disjoint, and test_size is left at
# the library default here. If true K-fold partitions are intended, confirm
# and switch to a K-fold stratifier.
msss = MultilabelStratifiedShuffleSplit(n_splits=5, random_state=4242)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs('../folds', exist_ok=True)

# Loop-local index names keep the holdout split's train_index / test_index
# from being overwritten by the per-fold indices.
for fold_no, (fold_train_idx, fold_val_idx) in enumerate(
        msss.split(X_folds_ext, y_folds_bin), start=1):
    X_train, X_test = X_folds_ext[fold_train_idx], X_folds_ext[fold_val_idx]
    y_train, y_test = y_folds_ext[fold_train_idx], y_folds_ext[fold_val_idx]

    pd.DataFrame({
        'Id': X_train,
        'Target': pd.Series(y_train).str.join(' '),
    }).to_csv(f'../folds/fold{fold_no}_train.csv', index=False)

    pd.DataFrame({
        'Id': X_test,
        'Target': pd.Series(y_test).str.join(' '),
    }).to_csv(f'../folds/fold{fold_no}_val.csv', index=False)

In [7]:
# Sanity check: label frequencies in the written training split of fold 1.
fold = pd.read_csv('../folds/fold1_train.csv')
Counter(label for labels in fold['Target'].str.split() for label in labels).most_common()

[('0', 34543),
 ('25', 32148),
 ('21', 11749),
 ('2', 9132),
 ('23', 8777),
 ('7', 7957),
 ('5', 4891),
 ('4', 4282),
 ('6', 3171),
 ('19', 3038),
 ('3', 2715),
 ('1', 2539),
 ('22', 2312),
 ('14', 2231),
 ('12', 1885),
 ('11', 1777),
 ('18', 1542),
 ('13', 1216),
 ('16', 1066),
 ('26', 576),
 ('20', 364),
 ('17', 364),
 ('24', 328),
 ('8', 185),
 ('9', 169),
 ('10', 158),
 ('27', 112),
 ('15', 53)]

In [8]:
# Sanity check: label frequencies in the written validation split of fold 1.
fold = pd.read_csv('../folds/fold1_val.csv')
Counter(label for labels in fold['Target'].str.split() for label in labels).most_common()

[('0', 3838),
 ('25', 3572),
 ('21', 1305),
 ('2', 1015),
 ('23', 975),
 ('7', 884),
 ('5', 544),
 ('4', 476),
 ('6', 352),
 ('19', 338),
 ('3', 302),
 ('1', 282),
 ('22', 257),
 ('14', 248),
 ('12', 210),
 ('11', 198),
 ('18', 171),
 ('13', 135),
 ('16', 118),
 ('26', 64),
 ('17', 40),
 ('20', 40),
 ('24', 36),
 ('8', 21),
 ('9', 19),
 ('10', 18),
 ('27', 13),
 ('15', 6)]