In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np 
import math
# https://nilearn.github.io/connectivity/region_extraction.html#visualization-of-functional-connectivity-matrices

In [None]:
# Load the four raw COBRE tables from disk. These "*_og_df" frames are kept
# untouched so that working copies can be re-created without re-reading files.
data_dir = 'Cowork-data-20210215'

demographics_og_df = pd.read_csv(data_dir + '/COBRE_demographics.csv')
cortical_thickness_og_df = pd.read_csv(data_dir + '/COBRE_cortical_thickness.csv')
# Explicit integer column names 0..85848 are supplied for the connectivity
# file (85849 == 293 * 293, the grid size used when reshaping further down).
connectivity_og_df = pd.read_csv(
    data_dir + '/COBRE_fmri_connectivity.csv',
    names=list(range(85849)),
)
regions_og_df = pd.read_csv(data_dir + '/COBRE_fmri_regions.csv')

In [None]:
# Re-create the working frames as independent copies of the originals, so a
# modified working frame can be reset without reloading the CSV files.
demographics_df, cortical_thickness_df, connectivity_df, regions_df = (
    original.copy()
    for original in (
        demographics_og_df,
        cortical_thickness_og_df,
        connectivity_og_df,
        regions_og_df,
    )
)

In [None]:
# Print the shape of each working frame, one per line, in load order.
for frame in (demographics_df, cortical_thickness_df, connectivity_df, regions_df):
    print(frame.shape)

In [None]:
# PANSS total score by sex, restricted to rows with labels == 1.
# Rows whose PANSS_total is missing are dropped. sex == -1 is plotted as
# "Male" and every other sex value as "Female".
patients_with_panss = demographics_df[
    (demographics_df.labels == 1) & demographics_df.PANSS_total.notna()
]
is_male = patients_with_panss.sex == -1

male_symptom_data = patients_with_panss.PANSS_total[is_male].tolist()
female_symptom_data = patients_with_panss.PANSS_total[~is_male].tolist()

print(len(male_symptom_data))
print(len(female_symptom_data))
bp_data = [np.array(male_symptom_data), np.array(female_symptom_data)]
print(bp_data[0])
print(len(bp_data))

fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)

# Draw the boxes first: boxplot() installs its own tick positions (one per
# box), so the labels are applied afterwards to be sure they land on the two
# boxes. Setting them before plotting depends on matplotlib internals and
# triggers a warning on recent versions.
bp = ax.boxplot(bp_data)
ax.set_xticklabels(['Male', 'Female'])
ax.set_ylabel('PANSS total')
ax.set_title('PANSS total score by sex (labels == 1)')

plt.show()

In [None]:
# Classification target: the `labels` column as a NumPy array.
labels = demographics_df['labels'].values

In [None]:
# Feature matrix: the connectivity table as a plain NumPy array
# (rows are paired positionally with `labels` when the data is split).
data = connectivity_df.to_numpy()
print(data.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.33,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
clf = RandomForestClassifier(
    n_estimators=5000,
    max_depth=100,
    random_state=0,
).fit(X_train, y_train)
clf

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Evaluate the fitted forest on the held-out split: macro-averaged F1 first,
# then plain accuracy.
y_pred = clf.predict(X_test)
macro_f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
print(macro_f1)
print(accuracy)

In [None]:
# Per-feature importance scores of the fitted forest.
importances = clf.feature_importances_

# Feature indices ordered from the most to the least important.
sorted_indices = np.flip(np.argsort(importances))

# Lay the flat importance vector out as a 293 x 293 grid (on a copy).
a = np.reshape(np.array(importances), (293, 293))

In [None]:
# Heatmap of the full 293 x 293 importance grid; 'nearest' interpolation
# keeps every cell as a crisp square.
plt.figure(figsize=(20, 20))
hm = plt.imshow(
    a,
    interpolation='nearest',
    cmap='RdYlGn',
)
plt.colorbar(hm)
plt.show()

- Ran a random forest classifier to distinguish schizophrenia patients from controls.
- This classification achieved around 70% accuracy.
- Examined the feature importances to determine which brain regions are most important for classifying a patient as having schizophrenia.

In [None]:
print(len(a))

# Total importance of each row of the grid (plain Python sum, row by row).
final = [sum(row) for row in a]

# Row indices ranked by total importance, highest first.
sorted_indices = np.argsort(final)[::-1]
print(sorted_indices[:25])

# Keep the 50 top-ranked rows; every column is retained.
new_a = a[sorted_indices[:50]]
print(new_a.shape)

# Disabled experiment: additionally restrict the columns to the same indices.
# final_a = [item[sorted_indices[:50]] for item in new_a]
# final_a = np.array(final_a)
# print(final_a.shape)

# Heatmap of the selected rows only.
plt.figure(figsize=(20, 20))
hm = plt.imshow(
    new_a,
    interpolation='nearest',
    cmap='RdYlGn',
)
plt.colorbar(hm)
plt.show()


# Sex-Based Differences

Top 25 Regions: [241 172  96  39  24 242  94  52  68 207 231 276 162  78 138 233 214 277
  83  10 171  67 129 132 126]

In [None]:
# Build the sex-classification dataset from schizophrenia patients only
# (labels == 1). Every connectivity feature is kept at this stage; the
# restriction to a handful of top-ranked regions is done in a later cell.
#
# Rows are picked with a positional boolean mask, so the result does not
# depend on the DataFrame index labels matching row positions.
is_patient = (demographics_df.labels == 1).to_numpy()

final_dataset = connectivity_df.to_numpy()[is_patient]

# Target: -1 where sex == -1, +1 for every other sex value. Kept as a plain
# list of ints.
final_labels = np.where(
    demographics_df.sex.to_numpy()[is_patient] == -1, -1, 1
).tolist()

print(final_dataset.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Same 67/33 split and seed as the earlier experiment, now on the
# patients-only dataset with sex as the target.
X_train, X_test, y_train, y_test = train_test_split(
    final_dataset,
    final_labels,
    test_size=0.33,
    random_state=42,
)

clf = RandomForestClassifier(n_estimators=1000, max_depth=100, random_state=0)
clf.fit(X_train, y_train)

# Held-out performance: macro-averaged F1, then accuracy.
y_pred = clf.predict(X_test)
macro_f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
print(macro_f1)
print(accuracy)

Originally found a very low F-measure (macro F1) when classifying sex differences using all connectivity features.

In [None]:
# Row indices of the 25 top-ranked regions, hard-coded from the ranking
# printed earlier in the notebook.
top_rois = [241, 172, 96, 39, 24, 242, 94, 52, 68, 207, 231, 276, 162, 78,
            138, 233, 214, 277, 83, 10, 171, 67, 129, 132, 126]
# Only the first five are used for this experiment.
rois = top_rois[:5]

n_regions = 293

# View every subject's flat feature vector as a 293 x 293 matrix, keep only
# the rows listed in `rois`, and flatten each subject back to one vector of
# len(rois) * 293 values. This is done for all subjects in one indexing step
# instead of a per-subject loop (which also printed a shape line per subject).
connectivity_matrices = np.asarray(final_dataset).reshape(-1, n_regions, n_regions)
smaller_dataset = connectivity_matrices[:, rois, :].reshape(
    len(connectivity_matrices), len(rois) * n_regions
)
print(smaller_dataset.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Same split parameters as before, applied to the reduced ROI feature set.
X_train, X_test, y_train, y_test = train_test_split(
    smaller_dataset,
    final_labels,
    test_size=0.33,
    random_state=42,
)

# Deliberately tiny forest: 5 trees, each at most 2 levels deep.
clf = RandomForestClassifier(n_estimators=5, max_depth=2, random_state=42)
clf.fit(X_train, y_train)

# Held-out performance: macro-averaged F1, then accuracy.
y_pred = clf.predict(X_test)
macro_f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
print(macro_f1)
print(accuracy)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures  # kept in scope for any cell that still relies on it
from sklearn.svm import SVC, LinearSVC  # LinearSVC kept in scope for the same reason

# Degree-3 polynomial SVM on the current train/test split.
#
# The polynomial terms are handled implicitly by the SVC polynomial kernel
# rather than materialised with PolynomialFeatures(degree=3). With the
# 1465-column ROI dataset (5 regions x 293 values), an explicit degree-3
# expansion would create roughly half a billion columns per sample, which does
# not fit in memory. coef0=1 makes the kernel include the lower-order terms
# as well, as an explicit expansion with a bias column would.
# NOTE(review): this is a kernelised stand-in for the original
# PolynomialFeatures + LinearSVC(hinge) pipeline, not a bit-identical model;
# revisit C and gamma if scores need to be compared against older runs.
poly_svm_clf = Pipeline([
    ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=0.01)),
])

poly_svm_clf.fit(X_train, y_train)

# Held-out performance: macro-averaged F1, then accuracy.
y_pred = poly_svm_clf.predict(X_test)
print(f1_score(y_test, y_pred, average='macro'))
print(accuracy_score(y_test, y_pred))