In [1]:
import pandas as pd
import numpy as np
import math
from copy import deepcopy


#############################
# Import and combine all data
#############################


#DATA_PATH = "/Users/mvonebers/HUBBS-Lab/data/"
DATA_PATH = "/home/maggie/HUBBS-Lab/data/"

e4_data = pd.read_excel(DATA_PATH + "E4_TEST.xlsx")
change_data = pd.read_excel(DATA_PATH + "normalized_change.xlsx")
audio_data = pd.read_excel(DATA_PATH + "audio_TEST.xlsx")
demo_data = pd.read_csv(DATA_PATH + "Demographics Information.csv")


# Break apart the ID column into "person" and "trial"
def clean_id(data):
    data.insert(0, "person", [0] * data.shape[0])
    data.insert(1, "trial", [0] * data.shape[0])

    for i, row in data.iterrows():
        data.at[i, "person"] = int(data.at[i, "id"][7:])
        data.at[i, "trial"] = int(data.at[i, "id"][5])
    
    data = data.drop(columns=['id'])
    data = data.rename(columns={"person": "id"})
    return data

    
e4_data = clean_id(e4_data)
audio_data = clean_id(audio_data)

# Normalize and print out indices for ease of reference
for dframe, name in zip([e4_data, change_data, audio_data, demo_data], ['e4_data', 'change_data', 'audio_data', 'demo_data']):
    for i, col in zip(range(len(dframe.columns.to_list())), dframe.columns.to_list()):
        if not (col == 'id' or col == 'trial'):
            dframe[col] = dframe[col]/dframe[col].max()
        print(name + '[' + str(i) + ']: ' + str(col))

e4_data[0]: id
e4_data[1]: trial
e4_data[2]: EDA_PPT
e4_data[3]: HR_PPT
e4_data[4]: TEMP_PPT
e4_data[5]: BVP_PPT
e4_data[6]: ACC_PPT
e4_data[7]: IBI_PPT
e4_data[8]: EDA_FREQ_PPT
e4_data[9]: EDA_AMP_PPT
change_data[0]: id
change_data[1]: Brief fear of Negative Evaluation
change_data[2]: CAI Trait Dyadic Score
change_data[3]: CAI Trait Full Score
change_data[4]: CAI Trait Public Speaking Score
change_data[5]: CAI Trait Small group Score
change_data[6]: STAI Trait Score
change_data[7]: CAI State Score
change_data[8]: STAI State Score
audio_data[0]: id
audio_data[1]: trial
audio_data[2]: pcm_RMSenergy_sma_amean
audio_data[3]: pcm_fftMag_mfcc_sma[1]_amean
audio_data[4]: pcm_fftMag_mfcc_sma[2]_amean
audio_data[5]: pcm_fftMag_mfcc_sma[3]_amean
audio_data[6]: pcm_fftMag_mfcc_sma[4]_amean
audio_data[7]: pcm_fftMag_mfcc_sma[5]_amean
audio_data[8]: pcm_fftMag_mfcc_sma[6]_amean
audio_data[9]: pcm_fftMag_mfcc_sma[7]_amean
audio_data[10]: pcm_fftMag_mfcc_sma[8]_amean
audio_data[11]: pcm_fftMag_mfcc_

In [2]:
###############################################################
# Get slopes from linear regression of the 8 trials for each ID
###############################################################

from sklearn.linear_model import LinearRegression

def get_slopes(data, start, end):
    iterations = int(data.shape[0]/8)
    y0 = data['trial'].to_numpy(copy=True)

    slopes = pd.DataFrame(np.zeros((iterations, data.shape[1])), columns=data.columns)
    missing_trials = pd.DataFrame(np.zeros((iterations, data.shape[1])), columns=data.columns)
    missing_trials = missing_trials.drop(["trial"], axis=1)
    slopes = slopes.drop(["trial"], axis=1)

    for col in range(2, data.shape[1]):
        x1 = data[data.columns[col]]
        y0 = list(range(start, end + 1))
        for row in range(iterations):
            missing = 0
            x0 = x1[ (row * 8) + start - 1 : (row * 8) + end ].to_numpy()
            x = np.array([])
            y = np.array([])
            
            slopes.iloc[row, 0] = data.iloc[row * 8, 0]

            for i in range(len(x0)):  # remove NaN from data
                if math.isnan(x0[i]) or math.isnan(y0[i]):
                    missing += 1
                elif x0[i] == 0: #and not data.columns[col] == '#pause':
                    missing += 1
                else:
                    x = np.append(x, x0[i])
                    y = np.append(y, y0[i])
                    
            missing_trials.iloc[row, col - 1] = missing
                    
            try:
                reg = LinearRegression().fit(y.reshape(-1,1),x)
                slopes.iloc[row, col - 1] = reg.coef_
            except:
                 slopes.iloc[row, col - 1] = 0
                    
    slopes.replace(0, np.NaN, inplace=True)
    missing_trials['id'] = slopes['id']
    
    return slopes, missing_trials

In [3]:
e4_slopes, e4_missing = get_slopes(e4_data, 1, 8)
audio_slopes, audio_missing = get_slopes(audio_data, 1, 8)

In [95]:
numpause = np.array([audio_data['#pause'].to_numpy()])
freq = np.array([audio_data['pause_frequency'].to_numpy()])
audio_data['pause_frequency'][0]

0.1593750000355078

In [13]:
def prune_pair(x01, x02):
    x1 = x01.copy()
    x2 = x02.copy()
    set01 = set(x01['id'].to_list())
    set02 = set(x02['id'].to_list())
    ids = set01.symmetric_difference(set02)
    print(ids)
    for i in ids:
        x1 = x1[x1.id != i]
        x2 = x2[x2.id != i]
        
    final = x1.merge(x2, how="right")
    return final

{65.0, 66.0, 16.0, 38.0, 46.0, 53.0, 58.0}


Unnamed: 0,id,EDA_PPT,IBI_PPT,#pause,pause_frequency
0,4.0,2e-06,-0.005964,-0.004762,-0.003433
1,5.0,-0.006102,0.000766,0.015774,0.020945
2,8.0,0.002106,-0.021424,0.039881,0.051226
3,20.0,-0.017983,0.028971,0.000205,0.003823
4,21.0,-0.031028,-0.012828,-0.00253,-0.007599
5,23.0,0.001176,0.013448,0.000893,-0.005681
6,32.0,-0.020056,0.023953,0.013542,-0.005396
7,35.0,-0.086381,-0.00042,,0.012492
8,37.0,-0.000264,-0.050482,-0.003814,-0.00383
9,41.0,-0.000784,-0.042194,-0.034821,-0.029613


In [96]:
#from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_predict
from scipy.stats import pearsonr

y_columns = ['CAI State Score', 'CAI Trait Full Score', 'STAI Trait Score']


def get_combo_predictions(X0, slope_data):
    corrs = []
    ps = []
    for y_col, y_i in zip(y_columns, range(len(y_columns))):
        y0 = slope_data[y_col].to_numpy(copy=True)
        X = np.array([X0[0]])
        y = np.array(y0[0])
        
        for i in range(1, len(X0)):  # remove NaN from data
            is_nan = False
            for x in X0[i]:
                if math.isnan(x):
                    is_nan = True
                    break
            if not math.isnan(y0[i]) and not is_nan:
                X = np.append(X, [X0[i]], axis=0)
                y = np.append(y, y0[i])
        
        
        folds = min(10, len(X))
        model = LinearRegression()
        cv = KFold(folds, shuffle=True, random_state=42)
        predicted_vals0 = cross_val_predict(model, X, y, cv=cv)
        actual_vals0 = slope_data[y_col].to_numpy(copy=True)
        #TODO: why this for actual_vals0 isntead of the sanitized y we already got?
        predicted_vals = []
        actual_vals = []
        

        for j in range(len(predicted_vals0)):
            if not math.isnan(predicted_vals0[j]) and not math.isnan(actual_vals0[j]):
                predicted_vals.append(predicted_vals0[j])
                actual_vals.append(actual_vals0[j])

        correlation, pval = pearsonr(predicted_vals, actual_vals)
        #to_print = str(correlation) + str(pval)
        #if pval < 0.15: 
        #    if not folds == 10:
        #        print("With # KFolds", folds)
        #    print("{0}:\t\t{1}\t\t{2}\t{3}\t\t{4}".format(y_col, correlation, pval, "Rows:", len(y)))
        corrs.append(correlation)
        ps.append(pval)
        
    return corrs, ps

def get_predictions(X0, slope_data):
    corrs = []
    ps = []
    num_samples = []
    for y_col, y_i in zip(y_columns, range(len(y_columns))):
        y0 = slope_data[y_col].to_numpy(copy=True)
        X = np.array(X0[0])
        y = np.array(y0[0])
        
        for i in range(1,len(X0)):  # remove NaN from data
            if math.isnan(y0[i]) or math.isnan(X0[i]):
                continue
            else:
                X = np.append(X, X0[i])
                y = np.append(y, y0[i])
            
        X = X.reshape(-1, 1)
        folds = min(10, len(X))
        model = LinearRegression()
        cv = KFold(folds, shuffle=True, random_state=42)
        predicted_vals0 = cross_val_predict(model, X, y, cv=cv)
        actual_vals0 = slope_data[y_col].to_numpy(copy=True)
        predicted_vals = []
        actual_vals = []
        

        for j in range(len(predicted_vals0)):
            if not math.isnan(predicted_vals0[j]) and not math.isnan(actual_vals0[j]):
                predicted_vals.append(predicted_vals0[j])
                actual_vals.append(actual_vals0[j])

        correlation, pval = pearsonr(predicted_vals, actual_vals)
        corrs.append(correlation)
        ps.append(pval)
        num_samples.append(len(actual_vals))
        
    return corrs, ps, num_samples
            
      
def generate_combos(e4_slope_data, audio_slope_data):
    group_c = []
    group_p = []
    group_num_samples = []
    group_titles = []
    
    demo_columns = ['Age', 'Lang', 'ethnicity', 'highest_education']
    
    demos = demo_data[demo_columns]
    all_bios = e4_slope_data[e4_slope_data.columns[1:10]]
    mfccs = audio_slope_data[[['id'] + audio_slope_data.columns[3:15]]]
    pauses = audio_slope_data[[['id'] + audio_slope_data.columns[18:21]]]
    jitter_shimmer = audio_slope_data[[['id'] + audio_slope_data.columns[23:25]]]
    ppts = e4_slope_data[['id', 'HR_PPT', 'EDA_FREQ_PPT']]
    
    group_titles.append('all demos')
    X0_demo = demo_data[demo_columns]
    c, p, num_samples = get_combo_predictions(X0_demo, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append("bios + demos")
    X0 = prune_pair(all_bios, demos)
    X0.drop(['id'])
    X0 = X0.to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append('mfcc1-12 + demos')
    X0 = prune_pair(mfccs, demos)
    X0.drop(['id'])
    X0 = X0.to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append('mfcc1-12')
    X0 = mfccs.to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append('pauses')
    X0 = pauses.to_numpy(copy=True)
    c, p, num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    #for col in slope_data.columns[17:20].to_list():
    #    group_titles.append(col)
    #    X0 = slope_data[col].to_numpy(copy=True)
    #    c,p,num_samples = get_predictions(X0, slope_data)
    #    group_c.append(c)
    #    group_p.append(p)
    #    group_num_samples.append(num_samples)


    group_titles.append('mfccs + pauses')
    X0 = prune_pair(mfccs, pauses)
    X0.drop(['id'])
    X0 = X0.to_numpy(copy=True)
    c, p, num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append('jitter, shimmer')
    X0 = jitter_shimmer.to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    # Just the two jitter variables (they performed better)
    #group_titles.append('jitterDDP_sma_amean, jitterLocal_sma_amean')
    #columns = ['jitterDDP_sma_amean', 'jitterLocal_sma_amean']
    #X0_1 = slope_data[columns].to_numpy(copy=True)
    #c, p, num_samples = get_combo_predictions(X0_1, slope_data)
    #group_c.append(c)
    #group_p.append(p)
    #group_num_samples.append(num_samples)
    
    # Jitter, shimmer individually
    #for col in slope_data.columns[22:25].to_list():
    #    group_titles.append(col)
    #    X0 = slope_data[col].to_numpy(copy=True)
    #    c,p,num_samples = get_predictions(X0, slope_data)
    #    group_c.append(c)
    #    group_p.append(p)
    #    group_num_samples.append(num_samples)
    
    group_titles.append('jitter, shimmer, pauses')
    X0_jitter = prune_pair(jitter_shimmer, pauses)
    X0.drop(['id'])
    X0 = X0.to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append('mfccs + jitter, shimmer')
    X0 = prune_pair(mfccs, jitter_shimmer)
    X0.drop(['id'])
    X0 = X0.to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append('mfccs + jitter shimmer + pauses')
    temp = prune_pair(mfccs, jitter_shimmer)
    X0 = prune_pair(temp, pauses)
    X0.drop(['id'])
    X0 = X0.to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append("all bio")
    X01 = all_bios.copy().drop(['id'])
    X0 = X01.to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    #group_titles.append('HR_PPT, EDA_FREQ_PPT')
    #columns = ['HR_PPT', 'EDA_FREQ_PPT']
    #X0_0 = slope_data[columns].to_numpy(copy=True)
    #c, p, num_samples = get_combo_predictions(X0_0, slope_data)
    #group_c.append(c)
    #group_p.append(p)
    #group_num_samples.append(num_samples)
    
    for col in slope_data.columns[25:33].to_list():
        group_titles.append(col)
        X0 = slope_data[col].to_numpy(copy=True)
        c,p,num_samples = get_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)
        group_num_samples.append(num_samples)
    
    group_titles.append('bio + jitter, shimmer')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list()
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append('bio + jitter, shimmer + pauses')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list()
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append('bio + pauses')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[17:20].to_list()
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append('bio + mfcc')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[2:14].to_list()
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append('all of em')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + demo_columns
    X0 = slope_data[columns].to_numpy(copy=True)
    c, p, num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    return group_c, group_p, group_titles


def combos_with_demo(slope_data, demo_col):
    group_c = []
    group_p = []
    group_titles = []
    
    #demo_columns = [slope_data.columns[36], slope_data.columns[38], slope_data.columns[41], slope_data.columns[43]]

    group_titles.append('mfcc1-12')
    columns = slope_data.columns[2:14].to_list() + [demo_col]
    X0_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append('pauses')
    columns = slope_data.columns[17:20].to_list() + [demo_col]
    X0_pauses = slope_data[columns].to_numpy(copy=True) # #pause, pause_frequency, pause_interval
    c, p, num_samples = get_combo_predictions(X0_pauses, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    for col in slope_data.columns[17:20].to_list():
        group_titles.append(col)
        X0 = slope_data[col].to_numpy(copy=True)
        c,p,num_samples = get_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)
        group_num_samples.append(num_samples)

    group_titles.append('mfccs + pauses')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_pauses = slope_data[columns].to_numpy(copy=True) # #pause, pause_frequency, pause_interval
    c, p, num_samples = get_combo_predictions(X0_pauses, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append('jitter, shimmer')
    columns = slope_data.columns[22:25].to_list() + [demo_col]
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    for col in slope_data.columns[22:25].to_list():
        group_titles.append(col)
        X0 = slope_data[[col, demo_col]].to_numpy(copy=True)
        c,p,num_samples = get_combo_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)
        group_num_samples.append(num_samples)
    
    group_titles.append('jitter, shimmer, pauses')
    columns = slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append('mfccs + jitter, shimmer')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[22:25].to_list() + [demo_col]
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append('mfccs + jitter shimmer + pauses')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append("all bio")
    columns = slope_data.columns[25:33].to_list() + [demo_col]
    X0_eda = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    for col in slope_data.columns[25:33].to_list():
        group_titles.append(col)
        X0 = slope_data[[col, demo_col]].to_numpy(copy=True)
        c, p, num_samples = get_combo_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)
        group_num_samples.append(num_samples)
    
    group_titles.append('bio + jitter, shimmer')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + [demo_col]
    X0_eda = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append('bio + jitter, shimmer + pauses')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_eda = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append('bio + pauses')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_eda = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)

    group_titles.append('bio + mfcc')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[25:33].to_list() + [demo_col]
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p,num_samples = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    group_titles.append('all of em')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0 = slope_data[columns].to_numpy(copy=True)
    c,p, num_samples = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    group_num_samples.append(num_samples)
    
    return group_c, group_p, group_titles

IndentationError: expected an indented block (<ipython-input-96-bf1fcbede1f3>, line 100)

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

def generate_graphs(c, p, num, titles, specifier):
    short_y_col = ["CAI St(19)", "CAI F(18)" , "STAI T(17)"]
    correlations = np.matrix(c)
    pvalues = np.matrix(p)
    sample_sizes = np.matrix(num)

    correlations = np.round(correlations, decimals=2)
    pvalues = np.round(pvalues, decimals=2)
    
    graph_title = "correlations_" + specifier + ".png"
    
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20,85))
        
    p = ax.pcolor(correlations, vmin=-0.5, vmax=0.8)
    fig.colorbar(p, ax=ax, fraction=0.05, pad=0.04)
    ax.set_xticklabels(labels=short_y_col)
    ax.set_yticklabels(labels=titles)
    plt.yticks(np.arange(0, len(titles), 1.0))
    for tick in ax.get_xticklabels():
        tick.set_rotation(30)  

    for i in range(len(short_y_col)):
        for j in range(len(titles)):
            txt = 'c=' + str(correlations[j,i]) + ', p=' + str(pvalues[j,i]) + ', n=' + sample_sizes[j, i])
            text = ax.text(i + 0.5, j + 0.5, txt, ha="center", va="center", color="w")

    fig.tight_layout()
    plt.savefig(DATA_PATH + 'expanded_features/' + graph_title)
    plt.close()