In [1]:
data = {}

path_to_data = r'all_time-series/ts_cut'

import os
import numpy as np
import re

parser_regex = r'(ihb|china|rmet)_(open|close)_strategy-(\d)_(GSR|noGSR).npy'
extractor = re.compile(parser_regex)

items = {atlas_name: {dataset_name: []
                      for dataset_name in ['china', 'ihb', 'rmet']}
         for atlas_name in ['HCPex', 'Brainnetome', 'AAL', 'Schaefer200']}

for root, dirs, files in os.walk(path_to_data):
    for file in files:
        atlas = root.split(os.sep)[1]
        source = root.split(os.sep)[2]
        items[atlas][source].append(np.load(os.path.join(root, file)))

In [2]:
from random import randint, seed, shuffle

seed(42)

In [3]:
import pandas as pd
import numpy as np
from itertools import combinations, product

at = 'Brainnetome'
db = 'rmet'
q_people = 63
brain_parts = 246
q_pictures = 120
train_percentage = 0.7

In [4]:
i = 0
data_dif = np.empty((338100, brain_parts + 1))
for person1, person2 in combinations(range(int(q_people * train_percentage)), r=2):
    for teg1, teg2 in combinations(range(len(items[at][db])), r=2):
        a = items[at][db][teg1][person1][randint(0, q_pictures - 1)]
        b = items[at][db][teg2][person2][randint(0, q_pictures - 1)]
        data_dif[i] = np.hstack([a - b, np.array([0])])
        i += 1

In [5]:
i = 0
data_same = np.empty((432000, brain_parts + 1))
for person in range(int(q_people * train_percentage)):
    for teg1, teg2 in product(range(len(items[at][db])), repeat=2):
        idxs = list(range(q_pictures))
        shuffle(idxs)
        for idx in range(15):
            a = items[at][db][teg1][person][idxs[idx]]
            b = items[at][db][teg2][person][idxs[-idx - 1]]
            data_same[i] = np.hstack([a - b, np.array([1])])
            i += 1

In [6]:
data_same = data_same[:data_dif.shape[0]]
data = np.vstack([data_dif, data_same])

In [7]:
df = pd.DataFrame(data)
df = df.sample(frac=1).reset_index(drop=True)

In [8]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,237,238,239,240,241,242,243,244,245,246
0,-0.20947,0.37128,-1.131895,-0.163956,-0.958288,0.627653,-0.230389,0.307526,-0.609125,-0.29937,...,3.507654,1.321494,2.024788,2.442091,3.145879,0.640271,0.961983,1.283943,-0.388224,1.0
1,1.030403,-0.326288,0.048898,-1.406976,-1.519416,-1.492172,-0.811243,-0.11631,-0.947651,-0.565061,...,3.360684,1.007489,-0.100263,-0.988485,0.293458,0.615108,0.013946,0.38147,0.300772,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.357369,1.254491,1.65191,0.676253,2.547613,1.108902,0.084267,0.014575,2.598305,1.971964,...,0.128753,-0.102458,-2.319209,0.968045,-1.537404,-0.472319,0.672754,0.435855,-0.638342,0.0
4,0.687219,-0.018846,1.227181,-0.23263,2.385656,0.83169,-0.862561,-0.633593,-1.375454,0.114808,...,3.254623,-1.136756,-0.002639,1.750196,1.029359,1.102148,1.353855,2.078236,-2.86611,1.0


In [9]:
df.columns

RangeIndex(start=0, stop=247, step=1)

In [13]:
i = 0
other_data_dif = np.empty((42228, brain_parts + 1))
for person1, person2 in combinations(range(int(q_people * (1 - train_percentage))), r=2):
    for teg1, teg2 in combinations(range(len(items[at][db])), r=2):
        a = items[at][db][teg1][person1][randint(0, q_pictures - 1)]
        b = items[at][db][teg2][person2][randint(0, q_pictures - 1)]
        other_data_dif[i] = np.hstack([a - b, np.array([0])])
        i += 1

In [16]:
i = 0
other_data_same = np.empty((155520, brain_parts + 1))
for person in range(int(q_people * (1 - train_percentage))):
    for teg1, teg2 in product(range(len(items[at][db])), repeat=2):
        idxs = list(range(q_pictures))
        shuffle(idxs)
        for idx in range(15):
            a = items[at][db][teg1][person][idxs[idx]]
            b = items[at][db][teg2][person][idxs[-idx - 1]]
            other_data_same[i] = np.hstack([a - b, np.array([1])])
            i += 1

In [17]:
other_data = np.vstack([other_data_dif, other_data_same])
other_df = pd.DataFrame(other_data)
other_df = other_df.sample(frac=1).reset_index(drop=True)

In [18]:
other_df.shape, df.shape

((197748, 247), (676200, 247))

In [19]:
len(other_data_same)

155520

In [20]:
y_train = df[246]
X_train = df.drop(246, axis=1)

y_test = other_df[246]
X_test = other_df.drop(246, axis=1)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

model = LogisticRegression()
model.fit(X_train, y_train)

pred = model.predict(X_test)

In [22]:
accuracy_score(y_test, pred), f1_score(y_test, pred), precision_score(y_test, pred), recall_score(y_test, pred)

(0.5004652385864838,
 0.6120719447062519,
 0.7862086359967716,
 0.5010866769547325)

'-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'

In [27]:
from catboost import CatBoostClassifier

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=200,
                           learning_rate=0.1,
                           depth=12)
# Fit model
model.fit(X_train, y_train)

0:	learn: 0.6470265	total: 1.93s	remaining: 6m 23s
1:	learn: 0.6286528	total: 3.96s	remaining: 6m 32s
2:	learn: 0.6202798	total: 5.79s	remaining: 6m 20s
3:	learn: 0.6153925	total: 7.59s	remaining: 6m 11s
4:	learn: 0.6126534	total: 9.31s	remaining: 6m 3s
5:	learn: 0.6100288	total: 11s	remaining: 5m 55s
6:	learn: 0.6082881	total: 12.8s	remaining: 5m 52s
7:	learn: 0.6069450	total: 14.7s	remaining: 5m 52s
8:	learn: 0.6058095	total: 16.3s	remaining: 5m 45s
9:	learn: 0.6048460	total: 17.6s	remaining: 5m 34s
10:	learn: 0.6041461	total: 19.1s	remaining: 5m 27s
11:	learn: 0.6035004	total: 20.5s	remaining: 5m 21s
12:	learn: 0.6018118	total: 21.9s	remaining: 5m 14s
13:	learn: 0.6012966	total: 23.3s	remaining: 5m 9s
14:	learn: 0.6005140	total: 24.9s	remaining: 5m 7s
15:	learn: 0.5999457	total: 26.3s	remaining: 5m 2s
16:	learn: 0.5995054	total: 27.7s	remaining: 4m 58s
17:	learn: 0.5989293	total: 29.2s	remaining: 4m 55s
18:	learn: 0.5985284	total: 30.5s	remaining: 4m 50s
19:	learn: 0.5980210	total: 

<catboost.core.CatBoostClassifier at 0x1ffb0ef4c20>

In [28]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

pred = model.predict(X_test)
accuracy_score(y_test, pred), f1_score(y_test, pred), precision_score(y_test, pred), recall_score(y_test, pred)

(0.7349556000566377, 0.8378251263993663, 0.8074876835538166, 0.870531121399177)