In [1]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold

In [2]:
X, y = make_classification(n_samples=100000, n_features=20, n_informative=10, random_state=123) # creamos los datasamples

In [3]:
X

array([[-0.57119391,  0.26183573,  2.23013121, ..., -0.81820121,
        -3.45493077, -1.51622256],
       [ 0.19835662, -0.04736896, -0.22451846, ...,  4.26853226,
         0.84458305, -0.82457541],
       [-1.49830585,  0.56209058,  1.74125892, ...,  3.54316988,
         0.02096739,  0.49084992],
       ...,
       [-1.55621067,  1.60623044, -1.3471204 , ...,  0.09232235,
        -2.79186036, -0.34130441],
       [-0.12186344, -1.54264823, -0.63079768, ..., -0.41007347,
        -3.64223636,  1.63623005],
       [-1.38758112,  0.61788264,  1.19767415, ...,  4.86230724,
        -0.70890314,  0.49346758]])

In [4]:
y

array([0, 1, 0, ..., 1, 0, 0])

In [5]:
splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=123) #creamos los cachos que van a ir separados
folds = list(splitter.split(X, y))

In [6]:
folds

[(array([    0,     1,     2, ..., 99995, 99996, 99998]),
  array([    8,    10,    11, ..., 99994, 99997, 99999])),
 (array([    0,     2,     3, ..., 99996, 99997, 99999]),
  array([    1,     5,     7, ..., 99986, 99995, 99998])),
 (array([    1,     5,     7, ..., 99997, 99998, 99999]),
  array([    0,     2,     3, ..., 99987, 99989, 99996]))]

In [7]:
len(folds)

3

In [8]:
len(folds[0])

2

In [9]:
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [10]:
def benchmark_models(X, y, split):
    """
    Helper function to benchmark models
    X : array
    y : array
    split : tuple
     Training and test indices (train_idx, test_idx)
    """
    X_train, y_train = X[split[0],:], y[split[0]]
    X_test, y_test   = X[split[1],:], y[split[1]]
    
    
    model_library = {} #dictionary
    # One candidate model
    model_library["logit"] = LogisticRegression(solver='liblinear')
    # Another candidate model
    model_library["rf"] = RandomForestClassifier(n_estimators=100, min_samples_leaf=20)

    results = {} #dictionary
    for model_name, model in model_library.items():
        # Train the model
        model.fit(X_train, y_train)
        # Make predictions on the test data
        pred_test = model.predict_proba(X_test)[:,1]
        # Evaluate the model
        results[model_name] = roc_auc_score(y_test, pred_test)
    
    return pd.DataFrame(results, index = ["ROC-AUC"])

In [11]:
benchmark_models(X,y,split=folds[0])


Unnamed: 0,logit,rf
ROC-AUC,0.939087,0.987736


In [12]:
import multiprocessing as mp

In [13]:
#pool = mp.Pool(2)
# Python can count the available cores for you in most cases: mp.cpu_count()
pool = mp.Pool(mp.cpu_count()-2)

In [14]:
results = []
def log_result(x):
    results.append(x)

In [15]:
for fold in folds:
    pool.apply_async(benchmark_models, args=(X, y, fold), callback = log_result)

In [16]:
# Close the pool for new tasks
pool.close()
# Wait for all tasks to complete at this point
pool.join()

In [17]:
result = pd.concat(results, axis=0, sort=True)

In [18]:
result

Unnamed: 0,logit,rf
ROC-AUC,0.939087,0.987924
ROC-AUC,0.939554,0.989183
ROC-AUC,0.940639,0.989633


In [19]:
result.index.name = "metric"
result.reset_index()
average = result.groupby(['metric']).mean()

In [20]:
average

Unnamed: 0_level_0,logit,rf
metric,Unnamed: 1_level_1,Unnamed: 2_level_1
ROC-AUC,0.93976,0.988914


In [None]:
python3 10x10valid.py first '../../obtaining_seqs/mini_sample_one_hot_seqs.npy' '../../obtaining_peaks_values/mini_sample_cell_type_array.npy' '../../obtaining_link_seqs_peaks/mini_sample_peak_names.npy' 