In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from sktree import ObliqueRandomForestClassifier, PatchObliqueRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import nibabel as nb
from scipy import ndimage
import scipy.stats as ss
from joblib import Parallel, delayed

In [None]:
# Load the human parcellated-thickness table from the working directory.
# NOTE: `df` is rebound to the macaque table further down, so the cells that
# follow must be run in order.
df = pd.read_excel('Human.parcellated_thickness.xlsx')

In [None]:
# Preview the first rows to check the column layout before slicing features by position.
df.head()

In [None]:
# Load the human demographics table; the feature-building cell reads its
# 'ID' and 'Sex' columns.
# NOTE(review): the path is relative to the user's home directory — consider a
# configurable data directory so the notebook runs on other machines.
df_sex = pd.read_excel('~/data_MRI/subjects_age_sex_data_MRI.xlsx')
df_sex.head()

In [None]:
# Build the human feature matrices and sex labels, one entry per subject that
# appears in both the thickness table (`df`) and the demographics table (`df_sex`).
#   X1_human : first 182 feature values of each subject
#              (presumably the Markov parcels, by analogy with the monkey table — TODO confirm)
#   X2_human : the remaining feature values
#   y_human  : 1 when Sex == 'FEMALE', otherwise 0
X1 = []
X2 = []
y_human = []
IDs = set(df['sid'])
ref_IDs = set(df_sex['ID'])

# Walk the subjects in the table's own row order instead of iterating the set.
# Set iteration order is not guaranteed to be the same from one kernel session
# to the next (string hashes are randomised per process), which would reorder
# the rows of X1/X2/y and so change every fixed-`random_state` split made later.
for subject in tqdm(df['sid'].drop_duplicates()):
    if subject in ref_IDs:
        # Flatten the subject's row(s) and skip the 2 leading non-feature entries.
        # assumes exactly one row per subject — TODO confirm
        features = np.array(df[df['sid']==subject]).reshape(-1)[2:]
        gender = list(df_sex[df_sex['ID']==subject]['Sex'])
        sex = int(gender[0]=='FEMALE')

        X1.append(list(features[:182]))
        X2.append(list(features[182:]))
        y_human.append(sex)

X1_human = np.array(X1)
X2_human = np.array(X2)

In [None]:
# Sanity check: both matrices should have one row per matched subject.
print(X1_human.shape, X2_human.shape)

In [None]:
# Load the macaque parcellated-thickness table.
# NOTE: this rebinds `df`, replacing the human table loaded earlier; every later
# cell that reads `df` (e.g. the column-name slices) refers to the macaque table.
df = pd.read_excel('Macaque.parcellated_thickness.xlsx')
df.head()

In [None]:
# Load the macaque demographics table; the feature-building cell reads its
# 'participant_id' and 'sex' columns. This rebinds `df_sex` (previously the human table).
# NOTE(review): the path is relative to the user's home directory — consider a
# configurable data directory so the notebook runs on other machines.
df_sex = pd.read_csv('~/spmmouse_segment/uwmadison.csv')
df_sex.head()

In [None]:
# Build the macaque feature matrices and sex labels, one entry per subject that
# appears in both the thickness table (`df`) and the demographics table (`df_sex`).
#   X1_monkey : first 182 feature values (the columns later named via df.keys()[4:186])
#   X2_monkey : the remaining feature values (the columns later named via df.keys()[186:])
#   y_monkey  : 1 when sex == 'F', otherwise 0
X1 = []
X2 = []
y_monkey = []
IDs = set(df['participant_id'])
ref_IDs = set(df_sex['participant_id'])

# Walk the subjects in the table's own row order instead of iterating the set.
# The IDs are strings, and string hashes are randomised per Python process, so
# iterating the set would reorder the rows of X1/X2/y between kernel sessions
# and change every fixed-`random_state` split made later.
for subject in tqdm(df['participant_id'].drop_duplicates()):
    if subject in ref_IDs:
        # Flatten the subject's row(s) and skip the 4 leading non-feature entries.
        # assumes exactly one row per subject — TODO confirm
        features = np.array(df[df['participant_id']==subject]).reshape(-1)[4:]
        gender = list(df_sex[df_sex['participant_id']==subject]['sex'])
        sex = int(gender[0]=='F')

        X1.append(list(features[:182]))
        X2.append(list(features[182:]))
        y_monkey.append(sex)

X1_monkey = np.array(X1)
X2_monkey = np.array(X2)

In [None]:
# Sanity check: both matrices should have one row per matched subject.
print(X1_monkey.shape, X2_monkey.shape)

### Try random forest (trained and tested on humans, 80/20 split)


In [130]:
# Mean hold-out accuracy of a random forest predicting sex from the first
# human feature set, averaged over `reps` stratified 80/20 splits.
reps = 5
accuracy = 0.0

for ii in tqdm(range(reps)):
    # Use the labels built alongside X1_human. The previous bare `y` is not
    # defined by any cell in this notebook, so it relied on leftover kernel state.
    x_train, x_test, y_train, y_test = train_test_split(
        X1_human, y_human, train_size=0.8, random_state=ii, stratify=y_human)
    # NOTE: only the split is seeded; the forest itself is not, so the reported
    # accuracy varies slightly from run to run.
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    clf.fit(x_train, y_train)
    accuracy += np.mean(clf.predict(x_test)==y_test)

print('Accuracy is ',accuracy/reps)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:20<00:00,  4.19s/it]

Accuracy is  0.7057276995305164





# Monkey analysis

Unnamed: 0.1,Unnamed: 0,participant_id,age,sex,Markov.1,Markov.2,Markov.3,Markov.4,Markov.5,Markov.6,...,Schaefer217.191,Schaefer217.192,Schaefer217.193,Schaefer217.194,Schaefer217.195,Schaefer217.196,Schaefer217.197,Schaefer217.198,Schaefer217.199,Schaefer217.200
0,0,sub-1001,1.756164,M,3.048436,3.908286,3.221595,3.615675,4.662432,3.707754,...,4.231826,4.908868,4.52273,2.294943,2.853976,3.406234,4.26137,4.131977,3.387978,3.451267
1,1,sub-1002,1.783562,F,3.05352,3.748308,3.043567,3.764927,4.708283,4.060617,...,4.384853,4.849508,4.5895,2.443734,2.855187,3.344378,3.926697,3.477919,2.962553,3.474969
2,2,sub-1003,1.756164,M,3.211265,4.122524,3.374628,4.022762,4.759439,4.182558,...,4.570739,4.921833,4.770724,3.106145,3.094785,3.350355,4.562199,4.212585,3.582792,3.827813
3,3,sub-1004,1.756164,M,3.004275,3.681716,3.227427,3.762712,4.555942,3.984013,...,4.264869,4.935628,4.505048,3.337418,2.892611,3.690076,4.095378,4.328465,3.763171,3.758017
4,4,sub-1005,1.742466,M,2.868796,3.837011,2.997172,3.724171,4.537298,3.816082,...,4.154663,4.817727,4.695378,3.965287,3.219764,3.268439,4.115168,3.889531,3.271547,4.040183


Unnamed: 0,participant_id,age,sex
0,sub-1001,1.756164,M
1,sub-1002,1.783562,F
2,sub-1003,1.756164,M
3,sub-1004,1.756164,M
4,sub-1005,1.742466,M


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 592/592 [00:00<00:00, 1347.32it/s]


(592, 182) (592, 200)


### Try random forest

In [106]:
# Mean hold-out accuracy of a random forest predicting sex from the first
# macaque feature set, averaged over `reps` stratified 80/20 splits.
reps = 5
accuracy = 0.0

for ii in tqdm(range(reps)):
    # Use the named macaque arrays/labels. The previous `X1` was the scratch list
    # shared with the human cell and `y` is not defined by any cell in this
    # notebook, so both depended on execution order / leftover kernel state.
    x_train, x_test, y_train, y_test = train_test_split(
        X1_monkey, y_monkey, train_size=0.8, random_state=ii, stratify=y_monkey)
    # NOTE: only the split is seeded; the forest itself is not, so the reported
    # accuracy varies slightly from run to run.
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    clf.fit(x_train, y_train)
    accuracy += np.mean(clf.predict(x_test)==y_test)

print('Accuracy is ',accuracy/reps)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.85s/it]

Accuracy is  0.6453781512605041





In [119]:
# Collect per-tree feature importances from two forests trained on the first
# macaque feature set: one on the true labels, one on shuffled labels (null model).
# `feature_imp` ends up with the true-label trees first, then the shuffled-label trees.
total_models = 1000
reps = 10000  # number of permutation repeats used by the permutation-test cell

# Use the named macaque arrays/labels; the bare `y` used before is not defined
# by any cell in this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    X1_monkey, y_monkey, train_size=0.8, random_state=0, stratify=y_monkey)
clf = RandomForestClassifier(n_estimators=total_models, n_jobs=-1)
clf.fit(x_train, y_train)

# Shuffle a copy of the training labels so `y_train` keeps the true labels
# for anything that reads it afterwards.
# NOTE: the shuffle and both forests are unseeded, so the resulting p-values
# vary from run to run.
y_train_shuffled = list(y_train)
np.random.shuffle(y_train_shuffled)
clf_random = RandomForestClassifier(n_estimators=total_models, n_jobs=-1)
clf_random.fit(x_train, y_train_shuffled)

feature_imp = [tree.feature_importances_ for tree in clf.estimators_]
feature_imp += [tree.feature_importances_ for tree in clf_random.estimators_]

In [120]:
# Stack the per-tree importances into one array: the first `total_models` rows come
# from the forest fit on the true labels, the rest from the shuffled-label forest.
feature_imp = np.array(feature_imp)

# NOTE(review): `PermutationTest` is not imported in the notebook's import cell —
# confirm where it is defined; unless another cell provides it, a fresh kernel
# raises NameError here.
# `p_val` is consumed by the correction cell that follows (one value per feature is
# assumed there — TODO confirm against PermutationTest.test).
test = PermutationTest(n_estimators=total_models, feature_importance=feature_imp)
stat, p_val = test.test(n_repeats = reps, n_jobs=20)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:31<00:00,  5.45it/s]


In [121]:
# Holm–Bonferroni step-down correction of the per-feature p-values.
# `corrected_pval[i]` is the adjusted p-value of feature i (same order as `p_val`).
arg_sorted = np.argsort(p_val)
total_features = len(p_val)

# The k-th smallest p-value (k = 0, 1, ...) is multiplied by (m - k).
scaled = np.asarray(p_val, dtype=float)[arg_sorted] * (total_features - np.arange(total_features))

# Holm's procedure also requires the adjusted values to be non-decreasing in the
# order of the raw p-values; the running maximum enforces that. Without it a
# feature could receive a smaller adjusted p-value than one with a smaller raw
# p-value. Finally cap at 1.
adjusted = np.minimum(np.maximum.accumulate(scaled), 1.0)

corrected_pval = np.zeros(total_features, dtype=float)
corrected_pval[arg_sorted] = adjusted

In [122]:
# Attach column names to the corrected p-values and wrap them in a
# one-column DataFrame indexed by name.
# assumes df.keys()[4:186] are the columns behind the first feature set,
# in the same order as `corrected_pval` — TODO confirm
markov_keys = df.keys()[4:186]
pval_df = pd.DataFrame.from_dict(
    {name: corrected_pval[pos] for pos, name in enumerate(markov_keys)},
    orient='index',
)

In [123]:
# Write the corrected p-values to the working directory (overwrites any existing file).
pval_df.to_csv('monkey_markov.csv')

In [107]:
# Mean hold-out accuracy of a random forest predicting sex from macaque
# features, averaged over `reps` stratified 80/20 splits.
# NOTE(review): this cell uses the first feature set, like the earlier macaque
# accuracy cell, while the cells that follow analyse the second set — confirm
# whether X2_monkey was intended here.
reps = 5
accuracy = 0.0

for ii in tqdm(range(reps)):
    # Use the named macaque arrays/labels. The previous `X1` was a scratch list
    # and `y` is not defined by any cell in this notebook, so both depended on
    # execution order / leftover kernel state.
    x_train, x_test, y_train, y_test = train_test_split(
        X1_monkey, y_monkey, train_size=0.8, random_state=ii, stratify=y_monkey)
    # NOTE: only the split is seeded; the forest itself is not, so the reported
    # accuracy varies slightly from run to run.
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    clf.fit(x_train, y_train)
    accuracy += np.mean(clf.predict(x_test)==y_test)

print('Accuracy is ',accuracy/reps)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.94s/it]

Accuracy is  0.6504201680672269





In [124]:
# Collect per-tree feature importances from two forests trained on the second
# macaque feature set: one on the true labels, one on shuffled labels (null model).
# `feature_imp` ends up with the true-label trees first, then the shuffled-label trees.
total_models = 1000
reps = 10000  # number of permutation repeats used by the permutation-test cell

# Use the named macaque arrays/labels; the bare `y` used before is not defined
# by any cell in this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    X2_monkey, y_monkey, train_size=0.8, random_state=0, stratify=y_monkey)
clf = RandomForestClassifier(n_estimators=total_models, n_jobs=-1)
clf.fit(x_train, y_train)

# Shuffle a copy of the training labels so `y_train` keeps the true labels
# for anything that reads it afterwards.
# NOTE: the shuffle and both forests are unseeded, so the resulting p-values
# vary from run to run.
y_train_shuffled = list(y_train)
np.random.shuffle(y_train_shuffled)
clf_random = RandomForestClassifier(n_estimators=total_models, n_jobs=-1)
clf_random.fit(x_train, y_train_shuffled)

feature_imp = [tree.feature_importances_ for tree in clf.estimators_]
feature_imp += [tree.feature_importances_ for tree in clf_random.estimators_]

In [125]:
# Stack the per-tree importances into one array: the first `total_models` rows come
# from the forest fit on the true labels, the rest from the shuffled-label forest.
feature_imp = np.array(feature_imp)

# NOTE(review): `PermutationTest` is not imported in the notebook's import cell —
# confirm where it is defined; unless another cell provides it, a fresh kernel
# raises NameError here.
# `p_val` is consumed by the correction cell that follows (one value per feature is
# assumed there — TODO confirm against PermutationTest.test).
test = PermutationTest(n_estimators=total_models, feature_importance=feature_imp)
stat, p_val = test.test(n_repeats = reps, n_jobs=20)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:23<00:00,  5.96it/s]


In [126]:
# Holm–Bonferroni step-down correction of the per-feature p-values.
# `corrected_pval[i]` is the adjusted p-value of feature i (same order as `p_val`).
arg_sorted = np.argsort(p_val)
total_features = len(p_val)

# The k-th smallest p-value (k = 0, 1, ...) is multiplied by (m - k).
scaled = np.asarray(p_val, dtype=float)[arg_sorted] * (total_features - np.arange(total_features))

# Holm's procedure also requires the adjusted values to be non-decreasing in the
# order of the raw p-values; the running maximum enforces that. Without it a
# feature could receive a smaller adjusted p-value than one with a smaller raw
# p-value. Finally cap at 1.
adjusted = np.minimum(np.maximum.accumulate(scaled), 1.0)

corrected_pval = np.zeros(total_features, dtype=float)
corrected_pval[arg_sorted] = adjusted

In [127]:
# Attach column names to the corrected p-values and wrap them in a
# one-column DataFrame indexed by name.
# assumes df.keys()[186:] are the columns behind the second feature set,
# in the same order as `corrected_pval` — TODO confirm
schaefer_keys = df.keys()[186:]
pval_df = pd.DataFrame.from_dict(
    {name: corrected_pval[pos] for pos, name in enumerate(schaefer_keys)},
    orient='index',
)

In [128]:
# Write the corrected p-values to the working directory (overwrites any existing file).
pval_df.to_csv('monkey_schaefer.csv')

# Common important regions
- Schaefer217.149, Schaefer217.97, Schaefer217.63, Schaefer217.43, Schaefer217.44 
- Markov.145