In [1]:
import pandas as pd
import numpy as np
import math
from copy import deepcopy


#############################
# Import and combine all data
#############################


DATA_PATH = "/Users/mvonebers/HUBBS-Lab/data/"
#DATA_PATH = "/home/maggie/HUBBS-Lab/data/"

e4_data = pd.read_excel(DATA_PATH + "E4_TEST.xlsx")
change_data = pd.read_excel(DATA_PATH + "normalized_change.xlsx")
audio_data = pd.read_excel(DATA_PATH + "audio_TEST.xlsx")
demo_data = pd.read_csv(DATA_PATH + "Demographics Information.csv")


# Break apart the ID column into "person" and "trial"
def clean_id(data):
    data.insert(0, "person", [0] * data.shape[0])
    data.insert(1, "trial", [0] * data.shape[0])

    for i, row in data.iterrows():
        data.at[i, "person"] = int(data.at[i, "id"][7:])
        data.at[i, "trial"] = int(data.at[i, "id"][5])
    
    data = data.drop(columns=['id'])
    data = data.rename(columns={"person": "id"})
    return data

    
e4_data = clean_id(e4_data)
audio_data = clean_id(audio_data)    
    
all_data = pd.merge(e4_data, change_data, on='id')
all_data = audio_data.merge(all_data, how='right')


# Reorder survey data in order of most samples to least
columns = all_data.columns.to_list()
new_columns = deepcopy(columns)
new_columns[35] = columns[40]
new_columns[37] = columns[41] 
new_columns[38] = columns[37] 
new_columns[40] = columns[35]
new_columns[41] = columns[38]

all_data = all_data[new_columns]


# what does this do?
#demo_ids = demo_data['id'].to_list()
#
#for id_ in demo_ids:
#    if id_ not in slope_ids:
#        demo_data = demo_data[demo_data.id != id_]
        
all_data = all_data.merge(demo_data, how="right")

# The demographic data lists IDs that aren't present in the other data, so remove them
all_data = all_data[all_data.id != 16]
all_data = all_data[all_data.id != 27]
all_data = all_data[all_data.id != 38]
all_data = all_data[all_data.id != 43]
all_data = all_data[all_data.id != 46]
all_data = all_data[all_data.id != 49]
all_data = all_data[all_data.id != 53]
all_data = all_data[all_data.id != 58]
all_data = all_data[all_data.id != 65]
all_data = all_data[all_data.id != 66]

all_data.drop(['CAI Trait Small group Score', 'CAI Trait Dyadic Score', 'CAI Trait Public Speaking Score', 'STAI State Score', 'Brief fear of Negative Evaluation'], axis=1, inplace=True)
all_data

Unnamed: 0,id,trial,pcm_RMSenergy_sma_amean,pcm_fftMag_mfcc_sma[1]_amean,pcm_fftMag_mfcc_sma[2]_amean,pcm_fftMag_mfcc_sma[3]_amean,pcm_fftMag_mfcc_sma[4]_amean,pcm_fftMag_mfcc_sma[5]_amean,pcm_fftMag_mfcc_sma[6]_amean,pcm_fftMag_mfcc_sma[7]_amean,...,CAI Trait Full Score,STAI Trait Score,Age,Gender,Lang,college,presentation,ethnicity,presentation_3_months,highest_education
0,4,1.0,0.007879,0.956435,-13.176012,-4.782369,-5.508350,-6.742012,-13.622468,-4.198480,...,0.101695,-0.068182,3.0,1.0,2.0,1.0,2.0,1.0,0.0,3.0
1,4,2.0,0.006843,-0.399977,-12.222916,-8.252062,-2.672821,-3.437455,-14.641911,-2.807243,...,0.101695,-0.068182,3.0,1.0,2.0,1.0,2.0,1.0,0.0,3.0
2,4,3.0,0.003532,0.615522,-9.837350,-2.540836,-2.281998,-3.958653,-14.158829,1.465646,...,0.101695,-0.068182,3.0,1.0,2.0,1.0,2.0,1.0,0.0,3.0
3,4,4.0,0.004056,0.912469,-8.391797,-7.296996,-3.236593,-1.216842,-14.971951,-1.150219,...,0.101695,-0.068182,3.0,1.0,2.0,1.0,2.0,1.0,0.0,3.0
4,4,5.0,0.000788,-1.409059,-1.801911,-5.151633,-2.803752,-6.043437,-5.157262,-2.180702,...,0.101695,-0.068182,3.0,1.0,2.0,1.0,2.0,1.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,73,4.0,0.016747,-5.346863,-4.401036,-1.772596,-6.538305,5.228556,-17.409770,-3.816441,...,0.233333,0.033333,1.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0
148,73,5.0,0.019861,-6.544580,-0.529959,-4.315085,-7.491222,9.523307,-22.557555,-2.752085,...,0.233333,0.033333,1.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0
149,73,6.0,0.023864,-6.224142,-1.605599,-4.598936,-8.447270,9.679458,-22.852172,-0.745983,...,0.233333,0.033333,1.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0
150,73,7.0,0.019280,-6.070041,-1.986066,-4.606451,-5.151283,6.730979,-23.007388,-1.617074,...,0.233333,0.033333,1.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0


In [2]:
###############################################################
# Get slopes from linear regression of the 8 trials for each ID
###############################################################

from sklearn.linear_model import LinearRegression

def get_slopes(data, start, end):
    y0 = data['trial'].to_numpy(copy=True)

    slopes = pd.DataFrame(np.zeros((19, 45)), columns=data.columns)
    slopes = slopes.drop(["trial"], axis=1)

    for col in range(2, 34):
        x1 = data[data.columns[col]]
        y0 = list(range(start, end + 1))
        for row in range(19):
            x0 = x1[ (row * 8) + start - 1 : (row * 8) + end ].to_numpy()
            x = np.array([])
            y = np.array([])
            
            slopes.iloc[row, 0] = data.iloc[row * 8, 0]

            for i in range(len(x0)):  # remove NaN from data
                if not math.isnan(x0[i]) and not math.isnan(y0[i]):
                    x = np.append(x, x0[i])
                    y = np.append(y, y0[i])
            
            try:
                reg = LinearRegression().fit(y.reshape(-1,1),x)
                slopes.iloc[row, col - 1] = reg.coef_
            except:
                 slopes.iloc[row, col - 1] = 0
    
    for col in range(34, 45):
        for row in range(19):
            slopes.iloc[row, col - 1] = data.iloc[row * 8, col]
            
    # Want to preserve zeros in the demographic data, so temporarily boost it up one...
    for col in range(43, 45):
        for row in range(19):
            slopes.iloc[row, col - 1] += 1.0
                    
    slopes.replace(0, np.NaN, inplace=True)
    
    # Then bump it back down. (I know this is a dumb way to do this.)
    for col in range(43, 45):
        for row in range(19):
            slopes.iloc[row, col - 1] -= 1.0
    
    return slopes

all_slopes = get_slopes(all_data, 1, 8)

all_slopes

Unnamed: 0,id,pcm_RMSenergy_sma_amean,pcm_fftMag_mfcc_sma[1]_amean,pcm_fftMag_mfcc_sma[2]_amean,pcm_fftMag_mfcc_sma[3]_amean,pcm_fftMag_mfcc_sma[4]_amean,pcm_fftMag_mfcc_sma[5]_amean,pcm_fftMag_mfcc_sma[6]_amean,pcm_fftMag_mfcc_sma[7]_amean,pcm_fftMag_mfcc_sma[8]_amean,...,CAI Trait Full Score,STAI Trait Score,Age,Gender,Lang,college,presentation,ethnicity,presentation_3_months,highest_education
0,4.0,-0.000864,-1.188091,1.458396,-0.644212,-0.697069,-0.543809,2.068849,-1.118681,0.597193,...,0.101695,-0.068182,3.0,1.0,2.0,1.0,2.0,1.0,0.0,3.0
1,5.0,-0.004605,0.022244,0.041306,0.37102,1.856301,-2.347585,2.85708,-0.310527,-0.021322,...,0.121212,0.025641,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0
2,8.0,0.001767,-0.545172,-0.233862,0.693634,-1.303848,-0.635333,0.913341,-0.946743,-0.024012,...,0.128205,0.155172,1.0,2.0,1.0,1.0,3.0,2.0,2.0,1.0
3,20.0,-0.003113,0.120467,-0.32647,0.884765,-0.090467,0.182058,-0.183817,0.728905,-0.347066,...,0.043478,0.04,1.0,1.0,2.0,3.0,1.0,2.0,2.0,1.0
4,21.0,-0.000428,-0.201158,-0.268297,0.201903,-0.366338,0.377646,0.226589,-0.530547,0.462883,...,0.121212,0.114286,1.0,1.0,1.0,1.0,2.0,2.0,0.0,1.0
5,23.0,-0.002165,-0.024271,0.128524,0.470709,1.293796,-1.696811,3.038422,-0.332586,-0.556816,...,0.066667,0.1,1.0,1.0,1.0,1.0,2.0,2.0,0.0,1.0
6,32.0,-0.001112,0.432318,-0.105907,1.495445,-1.006755,1.001877,0.911382,-0.181762,-0.036236,...,0.272727,-0.214286,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0
7,35.0,-0.003161,0.366671,0.193564,-0.023716,0.258827,0.975375,-0.922304,0.307595,-0.030484,...,,-0.068966,1.0,2.0,1.0,3.0,4.0,1.0,2.0,1.0
8,37.0,-0.000485,0.39413,0.261597,-0.139036,0.246072,0.183514,0.162386,-0.175606,-0.127135,...,0.025641,-0.107143,1.0,1.0,,4.0,2.0,3.0,1.0,1.0
9,41.0,-0.000964,-0.020902,0.018043,0.745624,0.887429,-3.439343,1.431305,-0.421041,-1.802264,...,0.138462,,2.0,2.0,1.0,1.0,3.0,3.0,2.0,2.0


In [3]:
# Normalization
for col_name in all_slopes.columns.to_list():
    all_slopes[col_name] = all_slopes[col_name]/all_slopes[col_name].max()

#all_slopes["pause_frequency"] = all_slopes["pause_frequency"]/all_slopes["pause_frequency"].max()
#all_slopes["pause_interval"] = all_slopes["pause_interval"]/all_slopes["pause_interval"].max()

In [13]:
for i, col in zip(range(len(all_slopes.columns)), all_slopes.columns):
    print(i, col)

0 id
1 pcm_RMSenergy_sma_amean
2 pcm_fftMag_mfcc_sma[1]_amean
3 pcm_fftMag_mfcc_sma[2]_amean
4 pcm_fftMag_mfcc_sma[3]_amean
5 pcm_fftMag_mfcc_sma[4]_amean
6 pcm_fftMag_mfcc_sma[5]_amean
7 pcm_fftMag_mfcc_sma[6]_amean
8 pcm_fftMag_mfcc_sma[7]_amean
9 pcm_fftMag_mfcc_sma[8]_amean
10 pcm_fftMag_mfcc_sma[9]_amean
11 pcm_fftMag_mfcc_sma[10]_amean
12 pcm_fftMag_mfcc_sma[11]_amean
13 pcm_fftMag_mfcc_sma[12]_amean
14 pcm_zcr_sma_amean
15 voiceProb_sma_amean
16 F0_sma_amean
17 #pause
18 pause_frequency
19 pause_interval
20 mean
21 percent
22 jitterLocal_sma_amean
23 jitterDDP_sma_amean
24 shimmerLocal_sma_amean
25 EDA_PPT
26 HR_PPT
27 TEMP_PPT
28 BVP_PPT
29 ACC_PPT
30 IBI_PPT
31 EDA_FREQ_PPT
32 EDA_AMP_PPT
33 CAI State Score
34 CAI Trait Full Score
35 STAI Trait Score
36 Age
37 Gender
38 Lang
39 college
40 presentation
41 ethnicity
42 presentation_3_months
43 highest_education


In [33]:
#from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_predict
from scipy.stats import pearsonr

# Problem: CAI State values are showing up as the CAI Dyadic...

y_columns = ['CAI State Score', 'CAI Trait Full Score', 'STAI Trait Score']


def get_combo_predictions(X0, slope_data):
    corrs = []
    ps = []
    for y_col, y_i in zip(y_columns, range(len(y_columns))):
        y0 = slope_data[y_col].to_numpy(copy=True)
        X = np.array([X0[0]])
        y = np.array(y0[0])
        
        for i in range(1, len(X0)):  # remove NaN from data
            is_nan = False
            for x in X0[i]:
                if math.isnan(x):
                    is_nan = True
                    break
            if not math.isnan(y0[i]) and not is_nan:
                X = np.append(X, [X0[i]], axis=0)
                y = np.append(y, y0[i])
        
        
        folds = min(10, len(X))
        model = LinearRegression()
        cv = KFold(folds, shuffle=True, random_state=42)
        predicted_vals0 = cross_val_predict(model, X, y, cv=cv)
        actual_vals0 = slope_data[y_col].to_numpy(copy=True)
        #TODO: why this for actual_vals0 isntead of the sanitized y we already got?
        predicted_vals = []
        actual_vals = []
        

        for j in range(len(predicted_vals0)):
            if not math.isnan(predicted_vals0[j]) and not math.isnan(actual_vals0[j]):
                predicted_vals.append(predicted_vals0[j])
                actual_vals.append(actual_vals0[j])

        correlation, pval = pearsonr(predicted_vals, actual_vals)
        #to_print = str(correlation) + str(pval)
        #if pval < 0.15: 
        #    if not folds == 10:
        #        print("With # KFolds", folds)
        #    print("{0}:\t\t{1}\t\t{2}\t{3}\t\t{4}".format(y_col, correlation, pval, "Rows:", len(y)))
        corrs.append(correlation)
        ps.append(pval)
        
    return corrs, ps

def get_predictions(X0, slope_data):
    corrs = []
    ps = []
    for y_col, y_i in zip(y_columns, range(len(y_columns))):
        y0 = slope_data[y_col].to_numpy(copy=True)
        X = np.array(X0[0])
        y = np.array(y0[0])
        
        for i in range(1,len(X0)):  # remove NaN from data
            if math.isnan(y0[i]) or math.isnan(X0[i]):
                continue
            else:
                X = np.append(X, X0[i])
                y = np.append(y, y0[i])
            
        X = X.reshape(-1, 1)
        folds = min(10, len(X))
        model = LinearRegression()
        cv = KFold(folds, shuffle=True, random_state=42)
        predicted_vals0 = cross_val_predict(model, X, y, cv=cv)
        actual_vals0 = slope_data[y_col].to_numpy(copy=True)
        predicted_vals = []
        actual_vals = []
        

        for j in range(len(predicted_vals0)):
            if not math.isnan(predicted_vals0[j]) and not math.isnan(actual_vals0[j]):
                predicted_vals.append(predicted_vals0[j])
                actual_vals.append(actual_vals0[j])

        correlation, pval = pearsonr(predicted_vals, actual_vals)
        corrs.append(correlation)
        ps.append(pval)
        
    return corrs, ps
      
def generate_combos(slope_data):
    group_c = []
    group_p = []
    group_titles = []
    
    demo_columns = [slope_data.columns[36], slope_data.columns[38], slope_data.columns[41], slope_data.columns[43]]
    
    group_titles.append('all demos')
    X0_demo = slope_data[demo_columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_demo, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append("bios + demos")
    columns = slope_data.columns[25:33].to_list() + demo_columns
    X0 = slope_data[columns].to_numpy(copy=True)
    c, p = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('mfcc1-12 + demos')
    columns = slope_data.columns[2:14].to_list() + demo_columns
    X0 = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append('mfcc1-12')
    X0_mfcc = slope_data[slope_data.columns[2:14]].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append('pauses')
    #X0_pauses = slope_data[slope_data.columns[17:20]].to_numpy(copy=True) # #pause, pause_frequency, pause_interval
    columns = slope_data.columns[17:20].to_list()
    X0_pauses = slope_data[columns].to_numpy(copy=True)
    c, p = get_combo_predictions(X0_pauses, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    for col in slope_data.columns[17:20].to_list():
        group_titles.append(col)
        X0 = slope_data[col].to_numpy(copy=True)
        c,p = get_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)

    group_titles.append('mfccs + pauses')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[17:20].to_list()
    X0_pauses = slope_data[columns].to_numpy(copy=True) # #pause, pause_frequency, pause_interval
    c, p = get_combo_predictions(X0_pauses, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append('jitter, shimmer')
    X0_jitter = slope_data[slope_data.columns[22:25]].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    # Just the two jitter variables (they performed better)
    group_titles.append('jitterDDP_sma_amean, jitterLocal_sma_amean')
    columns = ['jitterDDP_sma_amean', 'jitterLocal_sma_amean']
    X0_1 = slope_data[columns].to_numpy(copy=True)
    c, p = get_combo_predictions(X0_1, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    # Jitter, shimmer individually
    for col in slope_data.columns[22:25].to_list():
        group_titles.append(col)
        X0 = slope_data[col].to_numpy(copy=True)
        c,p = get_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)
    
    group_titles.append('jitter, shimmer, pauses')
    columns = slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list()
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append('mfccs + jitter, shimmer')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[22:25].to_list()
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('mfccs + jitter shimmer + pauses')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list()
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append("all bio")
    X0_eda = slope_data[slope_data.columns[25:33]].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('HR_PPT, EDA_FREQ_PPT')
    columns = ['HR_PPT', 'EDA_FREQ_PPT']
    X0_0 = slope_data[columns].to_numpy(copy=True)
    c, p = get_combo_predictions(X0_0, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    for col in slope_data.columns[25:33].to_list():
        group_titles.append(col)
        X0 = slope_data[col].to_numpy(copy=True)
        c,p = get_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)
    
    group_titles.append('bio + jitter, shimmer')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list()
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('bio + jitter, shimmer + pauses')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list()
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('bio + pauses')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[17:20].to_list()
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append('bio + mfcc')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[2:14].to_list()
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('all of em')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + demo_columns
    X0 = slope_data[columns].to_numpy(copy=True)
    c, p = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    return group_c, group_p, group_titles


def combos_with_demo(slope_data, demo_col):
    group_c = []
    group_p = []
    group_titles = []
    
    #demo_columns = [slope_data.columns[36], slope_data.columns[38], slope_data.columns[41], slope_data.columns[43]]

    group_titles.append('mfcc1-12')
    columns = slope_data.columns[2:14].to_list() + [demo_col]
    X0_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append('pauses')
    columns = slope_data.columns[17:20].to_list() + [demo_col]
    X0_pauses = slope_data[columns].to_numpy(copy=True) # #pause, pause_frequency, pause_interval
    c, p = get_combo_predictions(X0_pauses, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    for col in slope_data.columns[17:20].to_list():
        group_titles.append(col)
        X0 = slope_data[col].to_numpy(copy=True)
        c,p = get_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)

    group_titles.append('mfccs + pauses')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_pauses = slope_data[columns].to_numpy(copy=True) # #pause, pause_frequency, pause_interval
    c, p = get_combo_predictions(X0_pauses, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append('jitter, shimmer')
    columns = slope_data.columns[22:25].to_list() + [demo_col]
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    for col in slope_data.columns[22:25].to_list():
        group_titles.append(col)
        X0 = slope_data[[col, demo_col]].to_numpy(copy=True)
        c,p = get_combo_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)
    
    group_titles.append('jitter, shimmer, pauses')
    columns = slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append('mfccs + jitter, shimmer')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[22:25].to_list() + [demo_col]
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('mfccs + jitter shimmer + pauses')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_jitter = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_jitter, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append("all bio")
    columns = slope_data.columns[25:33].to_list() + [demo_col]
    X0_eda = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    for col in slope_data.columns[25:33].to_list():
        group_titles.append(col)
        X0 = slope_data[[col, demo_col]].to_numpy(copy=True)
        c, p = get_combo_predictions(X0, slope_data)
        group_c.append(c)
        group_p.append(p)
    
    group_titles.append('bio + jitter, shimmer')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + [demo_col]
    X0_eda = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('bio + jitter, shimmer + pauses')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_eda = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('bio + pauses')
    columns = slope_data.columns[25:33].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0_eda = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_eda, slope_data)
    group_c.append(c)
    group_p.append(p)

    group_titles.append('bio + mfcc')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[25:33].to_list() + [demo_col]
    X0_bio_mfcc = slope_data[columns].to_numpy(copy=True)
    c,p = get_combo_predictions(X0_bio_mfcc, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    group_titles.append('all of em')
    columns = slope_data.columns[2:14].to_list() + slope_data.columns[25:33].to_list() + slope_data.columns[22:25].to_list() + slope_data.columns[17:20].to_list() + [demo_col]
    X0 = slope_data[columns].to_numpy(copy=True)
    c, p = get_combo_predictions(X0, slope_data)
    group_c.append(c)
    group_p.append(p)
    
    return group_c, group_p, group_titles

In [24]:
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

def generate_graphs(c, p, titles, specifier):
    short_y_col = ["CAI St(19)", "CAI F(18)" , "STAI T(17)"]
    correlations = np.matrix(c)
    pvalues = np.matrix(p)

    correlations = np.round(correlations, decimals=2)
    pvalues = np.round(pvalues, decimals=2)
    
    data = [correlations, pvalues]
    graph_titles = ["correlations_" + specifier + ".png", "pvalues_" + specifier + ".png"]
    
    for k in range(2):
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20,85))
        
        if k == 0:
            p = ax.pcolor(data[k], vmin=-0.5, vmax=0.8)
        else:
            p = ax.pcolor(data[k], vmin = 0.0, vmax = 1.0)
        #p = ax.matshow(data[k])
        fig.colorbar(p, ax=ax, fraction=0.05, pad=0.04)
        ax.set_xticklabels(labels=short_y_col)
        ax.set_yticklabels(labels=titles)
        plt.yticks(np.arange(0, len(titles), 1.0))
        for tick in ax.get_xticklabels():
            tick.set_rotation(30)  

        for i in range(len(short_y_col)):
            for j in range(len(titles)):
                text = ax.text(i + 0.5, j + 0.5, data[k][j, i], ha="center", va="center", color="w")

        fig.tight_layout()
        plt.savefig(DATA_PATH + 'expanded_features/' + graph_titles[k])
        plt.close()

In [34]:
for i in range(36,44):
    c, p, titles = combos_with_demo(all_slopes, all_slopes.columns[i])
    filename = "with-" + str(all_slopes.columns[i])
    generate_graphs(c, p, titles, filename)

In [5]:
#c1, p1, titles1 = combos_with_demo(all_slopes, "highest_education")
#generate_graphs(c, p, titles, "with-highestedu")
#c2, p2, titles2 = combos_with_demo(all_slopes, "highest_education")
#generate_graphs(c, p, titles, "with-highestedu")
#c3, p3, titles3 = combos_with_demo(all_slopes, "Age")
#generate_graphs(c, p, titles, "with-highestedu")

short_y_col = ["CAI St(19)", "CAI F(18)" , "STAI T(17)"]
correlations = np.matrix(c)
pvalues = np.matrix(p)

correlations = np.round(correlations, decimals=2)
pvalues = np.round(pvalues, decimals=2)
    
data = [correlations, pvalues]
graph_titles = ["correlations_" + specifier + ".png", "pvalues_" + specifier + ".png"]
    
for k in range(2):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20,85))
        
    if k == 0:
        p = ax.pcolor(data[k], vmin=-0.5, vmax=0.8)
    else:
        p = ax.pcolor(data[k], vmin = 0.0, vmax = 1.0)
    #p = ax.matshow(data[k])
    fig.colorbar(p, ax=ax, fraction=0.05, pad=0.04)
    ax.set_xticklabels(labels=short_y_col)
    ax.set_yticklabels(labels=titles)
    plt.yticks(np.arange(0, len(titles), 1.0))
    for tick in ax.get_xticklabels():
        tick.set_rotation(30)  

    for i in range(len(short_y_col)):
        for j in range(len(titles)):
            text = ax.text(i + 0.5, j + 0.5, data[k][j, i], ha="center", va="center", color="w")

    fig.tight_layout()
    plt.savefig(DATA_PATH + 'slimmed/' + graph_titles[k])
    plt.close()

NameError: name 'c' is not defined

In [45]:
# Generate graphs showing how each variable changes over time with regards to each of the survey measures
correlations = []
pvalues = []

feature_sets = ['all demos', 'bios + demos', 'mfcc1-12 + demos', 'mfcc1-12', 'pauses', 'mfccs + pauses', 'jitter, shimmer', 'jitter, shimmer, pauses', 'mfccs + jitter, shimmer', 'mfccs + jitter shimmer + pauses', 'all bio', 'bio + jitter, shimmer', 'bio + jitter, shimmer + pauses', 'bio + pauses', 'bio + mfcc', 'everything']
demo_feature_set = ['mfcc1-12', 'pauses', 'mfccs + pauses', 'jitter, shimmer', 'jitter, shimmer, pauses', 'mfccs + jitter, shimmer', 'mfccs + jitter shimmer + pauses', 'all bio', 'bio + jitter, shimmer', 'bio + jitter, shimmer + pauses', 'bio + pauses', 'bio + mfcc', 'everything']

for i in range(2, 9):
    s = get_slopes(all_data, 1, i)
    c, p, titles = generate_combos(s)
    correlations.append(c)
    pvalues.append(p)

In [46]:
fig = plt.figure(figsize=[15,40])
fig.subplots_adjust(hspace=0.45, wspace=0.3)
for i in range(16):
    feature = [corr_set[i] for corr_set in correlations]
    cai_st = np.asarray([vals[0] for vals in feature])
    cai_f = np.asarray([vals[1] for vals in feature])
    stai_t = np.asarray([vals[2] for vals in feature])
    num_trials = np.arange(2, 9)
    
    ax = fig.add_subplot(8, 2, i+1)
    ax.set_ylim(-0.8, 0.8)
    ax.plot(num_trials, cai_st, label='CAI St')
    ax.plot(num_trials, cai_f, label='CAI Full')
    ax.plot(num_trials, stai_t, label='STAI T')
    plt.ylabel('Correlation')
    plt.xlabel('Number of trials')
    plt.title(feature_sets[i])
    ax.legend(loc="upper right")

#plt.show()
plt.savefig(DATA_PATH + 'slimmed/' + 'change-with-trials')
plt.close()

In [30]:
#c, p, titles = generate_combos(all_slopes)
#generate_graphs(c, p, titles, "general")
demo_columns = [all_slopes.columns[36], all_slopes.columns[38], all_slopes.columns[41], all_slopes.columns[43]]
#demo_columns = [36, 38, 41, 43]

for i in range(2, 9):
    #for col in demo_columns:
    graph_title = 'upto' + str(i)
    s = get_slopes(all_data, 1, i)
    c, p, titles = generate_combos(s)
    generate_graphs(c, p, titles, graph_title)

In [35]:
#all_slopes = get_slopes(all_data, 1, 8)
c, p, titles = generate_combos(all_slopes)
generate_graphs(c, p, titles, "general")