In [1]:
import math
import os
from copy import deepcopy

import numpy as np
import pandas as pd


#############################
# Import and combine all data
#############################


# Directory holding the input spreadsheets. Set the HUBBS_DATA_PATH environment
# variable to point at a different machine's copy instead of editing this cell;
# when it is unset the original location is used.
# os.path.join(..., "") guarantees a trailing separator, because the rest of the
# notebook builds file names by plain string concatenation (DATA_PATH + name).
DATA_PATH = os.path.join(
    os.environ.get("HUBBS_DATA_PATH", "/Users/mvonebers/HUBBS-Lab/data/"), ""
)

# Raw inputs: two Excel sheets that carry a combined "id" string (see clean_id),
# the normalized-change sheet and the demographics CSV.
e4_data = pd.read_excel(DATA_PATH + "E4_TEST.xlsx")
change_data = pd.read_excel(DATA_PATH + "normalized_change.xlsx")
audio_data = pd.read_excel(DATA_PATH + "audio_TEST.xlsx")
demo_data = pd.read_csv(DATA_PATH + "Demographics Information.csv")


# Break apart the ID column into "person" and "trial"
def clean_id(data):
    """Split the combined ``id`` string column into numeric ``id`` and ``trial``.

    The trial number is read from character 5 of the original id string and the
    person number from character 7 onwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named ``"id"``. It is NOT modified, so the
        function can safely be called again on the same input.

    Returns
    -------
    pandas.DataFrame
        A new frame whose first column is the integer person number (named
        ``"id"``), whose second column is the integer ``"trial"``, followed by
        the remaining original columns in their original order.

    Raises
    ------
    ValueError
        If a sliced piece of an id string cannot be converted to an integer.
    """
    raw_ids = data["id"].astype(str)

    # Work on a fresh frame (drop returns a copy) so the caller's data is untouched.
    cleaned = data.drop(columns=["id"])
    cleaned.insert(0, "id", raw_ids.str[7:].astype(int))
    cleaned.insert(1, "trial", raw_ids.str[5].astype(int))
    return cleaned

    
e4_data = clean_id(e4_data)
audio_data = clean_id(audio_data)

# Attach the normalized-change data to the E4 data by person id, then join the
# audio features. No `on=` is given for the second merge, so pandas joins on
# every column name the two frames share.
all_data = pd.merge(e4_data, change_data, on='id')
all_data = audio_data.merge(all_data, how='right')


# Reorder survey data in order of most samples to least.
# Maps destination position -> source position; the five positions are permuted
# among themselves, so no column is lost or duplicated.
# NOTE(review): these are positional indices into the merged frame - confirm them
# whenever the input sheets gain or lose columns.
SURVEY_COLUMN_MOVES = {35: 40, 37: 41, 38: 37, 40: 35, 41: 38}

columns = all_data.columns.to_list()
new_columns = list(columns)
for dest_pos, src_pos in SURVEY_COLUMN_MOVES.items():
    new_columns[dest_pos] = columns[src_pos]

all_data = all_data[new_columns]

all_data = all_data.merge(demo_data, how="right")

# The demographic data lists IDs that aren't present in the other data, so remove them
IDS_WITHOUT_MEASUREMENTS = [16, 27, 38, 43, 46, 49, 53, 58, 65, 66]
all_data = all_data[~all_data.id.isin(IDS_WITHOUT_MEASUREMENTS)]

# Survey scores that are not used in the rest of the analysis. Reassigning (rather
# than dropping in place) avoids mutating the filtered frame produced above.
UNUSED_SURVEY_COLUMNS = [
    'CAI Trait Small group Score',
    'CAI Trait Dyadic Score',
    'CAI Trait Public Speaking Score',
    'STAI State Score',
    'Brief fear of Negative Evaluation',
]
all_data = all_data.drop(columns=UNUSED_SURVEY_COLUMNS)
all_data['CAI State Score']

0     -0.179487
1     -0.179487
2     -0.179487
3     -0.179487
4     -0.179487
         ...   
147    0.178571
148    0.178571
149    0.178571
150    0.178571
151    0.178571
Name: CAI State Score, Length: 152, dtype: float64

In [None]:
###############################################################
# Get slopes from linear regression of the 8 trials for each ID
###############################################################

from sklearn.linear_model import LinearRegression

def get_slopes(data, start, end):
    """Fit, per participant, the linear trend of each feature across trials.

    The rows of ``data`` are consumed positionally in consecutive groups of
    eight, one group per participant.
    NOTE(review): this assumes the rows are sorted by id and then by trial, with
    exactly eight rows per id - confirm against the merge that builds ``data``.

    Column layout relied on (positions in ``data``):
      * 0      - participant id, copied from the first row of each group
      * 1      - ``"trial"``, dropped from the result (hence the ``col - 1`` shift)
      * 2-33   - features; the stored value is the regression slope of the
                 feature against the trial number
      * 34+    - copied unchanged from the first row of each group
      * 43+    - treated as demographic columns whose literal zeros are kept

    Parameters
    ----------
    data : pandas.DataFrame
        Combined per-trial data with the layout above.
    start, end : int
        First and last trial number (1-based, inclusive) used for the fit.

    Returns
    -------
    pandas.DataFrame
        One row per participant, with the columns of ``data`` minus ``"trial"``.
        Zeros in every column before the demographic columns are turned into
        NaN, as are slopes that could not be fitted.
    """
    trials_per_id = 8
    first_feature_col = 2
    first_survey_col = 34
    first_demo_col = 43

    n_ids = len(data) // trials_per_id
    n_cols = data.shape[1]

    slopes = pd.DataFrame(np.zeros((n_ids, n_cols)), columns=data.columns)
    slopes = slopes.drop(["trial"], axis=1)

    trial_numbers = np.arange(start, end + 1, dtype=float)

    for row in range(n_ids):
        first_row = row * trials_per_id
        slopes.iloc[row, 0] = data.iloc[first_row, 0]

        # Positional window covering trials start..end of this participant.
        window = slice(first_row + start - 1, first_row + end)

        for col in range(first_feature_col, first_survey_col):
            values = data.iloc[window, col].to_numpy(dtype=float)
            valid = ~np.isnan(values)  # skip trials with a missing measurement
            try:
                reg = LinearRegression().fit(
                    trial_numbers[:len(values)][valid].reshape(-1, 1),
                    values[valid],
                )
                slope = reg.coef_[0]
            except ValueError:
                # Raised when no valid trial is left to fit on: the slope is unknown.
                slope = np.nan
            slopes.iloc[row, col - 1] = slope

        for col in range(first_survey_col, n_cols):
            slopes.iloc[row, col - 1] = data.iloc[first_row, col]

    # Zeros are treated as "no value" everywhere except the demographic columns,
    # where 0 is a legitimate answer. Masking only the other columns replaces the
    # old add-one / replace / subtract-one trick, which also wiped out demographic
    # values equal to -1.
    zero_cells = slopes.eq(0)
    zero_cells.iloc[:, first_demo_col - 1:] = False
    slopes = slopes.mask(zero_cells)

    return slopes

# Per-participant slopes fitted over trials 1 through 8.
all_slopes = get_slopes(all_data, 1, 8)
# The participant ids present in the slope table, as a plain Python list.
slope_ids = all_slopes['id'].to_list()
#slope_ids
# NOTE: a bare expression that is not the last statement of a cell displays nothing.
all_slopes

print(all_slopes['ethnicity'])

In [3]:
#from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_predict
from scipy.stats import pearsonr

# Known problem (unresolved): CAI State values are showing up as the CAI Dyadic...

# Survey-score columns used as regression targets. The prediction helpers below
# return one correlation and one p-value per entry, in this order.
y_columns = ['CAI State Score', 'CAI Trait Full Score', 'STAI Trait Score']


def get_combo_predictions(X0, slope_data):
    """Cross-validate a linear model on a feature set for every target column.

    For each column named in the module-level ``y_columns``, rows with a missing
    feature or a missing target are dropped, out-of-fold predictions are computed
    with a shuffled K-fold linear regression (at most 10 folds, fixed seed), and
    the Pearson correlation between the predictions and the targets *of the same
    rows* is reported.

    Two defects of the earlier version are fixed here:
      * predictions were compared against the unfiltered target column, which
        pairs them with the wrong participants whenever any row was dropped;
      * the first row was always kept without being checked for missing values.

    Parameters
    ----------
    X0 : array-like, shape (n_rows, n_features) or (n_rows,)
        Feature values, one row per row of ``slope_data``. A 1-D input is
        treated as a single feature.
    slope_data : pandas.DataFrame
        Frame containing the target columns listed in ``y_columns``.

    Returns
    -------
    (list, list)
        Correlations and p-values, one entry per target in ``y_columns`` order.

    Raises
    ------
    ValueError
        Propagated from scikit-learn when fewer than two complete rows remain
        (K-fold needs at least two splits).
    """
    features = np.asarray(X0, dtype=float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)
    features_complete = ~np.isnan(features).any(axis=1)

    corrs = []
    ps = []
    for y_col in y_columns:
        target = slope_data[y_col].to_numpy(dtype=float, copy=True)

        # Keep only rows where every feature and the target are present.
        complete = features_complete & ~np.isnan(target)
        X = features[complete]
        y = target[complete]

        folds = min(10, len(X))
        model = LinearRegression()
        cv = KFold(folds, shuffle=True, random_state=42)
        predicted = cross_val_predict(model, X, y, cv=cv)

        # predicted[j] belongs to y[j]; compare against the filtered targets.
        usable = ~np.isnan(predicted)
        correlation, pval = pearsonr(predicted[usable], y[usable])
        corrs.append(correlation)
        ps.append(pval)

    return corrs, ps

def get_predictions(X0, slope_data):
    """Cross-validate a single-feature linear model for every target column.

    For each column named in the module-level ``y_columns``, rows where the
    feature or the target is missing are dropped, out-of-fold predictions are
    computed with a shuffled K-fold linear regression (at most 10 folds, fixed
    seed), and the Pearson correlation between the predictions and the targets
    of the same rows is reported.

    Two defects of the earlier version are fixed here:
      * predictions were compared against the unfiltered target column, which
        pairs them with the wrong participants whenever any row was dropped;
      * the first row was always kept without being checked for missing values.

    Parameters
    ----------
    X0 : array-like, shape (n_rows,)
        Values of the single feature, one per row of ``slope_data``.
    slope_data : pandas.DataFrame
        Frame containing the target columns listed in ``y_columns``.

    Returns
    -------
    (list, list)
        Correlations and p-values, one entry per target in ``y_columns`` order.

    Raises
    ------
    ValueError
        Propagated from scikit-learn when fewer than two complete rows remain
        (K-fold needs at least two splits).
    """
    feature = np.asarray(X0, dtype=float).reshape(-1)
    feature_present = ~np.isnan(feature)

    corrs = []
    ps = []
    for y_col in y_columns:
        target = slope_data[y_col].to_numpy(dtype=float, copy=True)

        # Keep only rows where both the feature and the target are present.
        complete = feature_present & ~np.isnan(target)
        X = feature[complete].reshape(-1, 1)
        y = target[complete]

        folds = min(10, len(X))
        model = LinearRegression()
        cv = KFold(folds, shuffle=True, random_state=42)
        predicted = cross_val_predict(model, X, y, cv=cv)

        # predicted[j] belongs to y[j]; compare against the filtered targets.
        usable = ~np.isnan(predicted)
        correlation, pval = pearsonr(predicted[usable], y[usable])
        corrs.append(correlation)
        ps.append(pval)

    return corrs, ps
      
def generate_combos(slope_data):
    """Evaluate a fixed list of feature groups with get_combo_predictions.

    Feature groups are picked by column position in ``slope_data`` and combined
    by list concatenation, in the order given in ``feature_sets`` below.
    NOTE(review): the "bio" slice (23:30) and the "jitter, shimmer" slice (22:24)
    both contain position 23, so 'all of em' lists that column twice - confirm
    the intended slice bounds.

    Parameters
    ----------
    slope_data : pandas.DataFrame
        Per-participant slope table; must have at least 44 columns.

    Returns
    -------
    (list, list, list)
        Per-group correlations, per-group p-values, and the group titles, all
        in the same order.
    """
    names = slope_data.columns
    demo_columns = [names[36], names[38], names[41], names[43]]
    mfcc = names[2:14].to_list()
    pauses = names[17:20].to_list()  # #pause, pause_frequency, pause_interval
    jitter_shimmer = names[22:24].to_list()
    bio = names[23:30].to_list()

    feature_sets = [
        ('all demos', demo_columns),
        ("bios + demos", bio + demo_columns),
        ('mfcc1-12 + demos', mfcc + demo_columns),
        ('mfcc1-12', mfcc),
        ('pauses', pauses),
        ('mfccs + pauses', mfcc + pauses),
        ('jitter, shimmer', jitter_shimmer),
        ('jitter, shimmer, pauses', jitter_shimmer + pauses),
        ('mfccs + jitter, shimmer', mfcc + jitter_shimmer),
        ('mfccs + jitter shimmer + pauses', mfcc + jitter_shimmer + pauses),
        ("all bio", bio),
        ('bio + mfcc', mfcc + bio),
        ('all of em', mfcc + bio + jitter_shimmer + pauses + demo_columns),
    ]

    group_c = []
    group_p = []
    group_titles = []
    for title, feature_columns in feature_sets:
        group_titles.append(title)
        features = slope_data[feature_columns].to_numpy(copy=True)
        corr, pval = get_combo_predictions(features, slope_data)
        group_c.append(corr)
        group_p.append(pval)

    return group_c, group_p, group_titles


def combos_with_demo(slope_data, demo_col, *extra_demo_cols):
    """Evaluate a fixed list of feature groups, each extended with demographics.

    Every feature group (picked by column position in ``slope_data``) has the
    given demographic column(s) appended before it is passed to
    get_combo_predictions. The titles name the feature group only.

    Parameters
    ----------
    slope_data : pandas.DataFrame
        Per-participant slope table.
    demo_col : column label
        Demographic column appended to every feature group.
    *extra_demo_cols : column labels, optional
        Further demographic columns appended after ``demo_col``. Added so that
        the notebook's pairwise call ``combos_with_demo(frame, col_a, col_b)``
        works; calls with a single column behave exactly as before.

    Returns
    -------
    (list, list, list)
        Per-group correlations, per-group p-values, and the group titles, all
        in the same order.
    """
    names = slope_data.columns
    demo = [demo_col, *extra_demo_cols]
    mfcc = names[2:14].to_list()
    pauses = names[17:20].to_list()  # #pause, pause_frequency, pause_interval
    jitter_shimmer = names[22:24].to_list()
    # NOTE(review): position 23 is in both the jitter/shimmer and the bio slice,
    # so 'all of em' lists that column twice - confirm the intended bounds.
    bio = names[23:30].to_list()

    feature_sets = [
        ('mfcc1-12', mfcc),
        ('pauses', pauses),
        ('mfccs + pauses', mfcc + pauses),
        ('jitter, shimmer', jitter_shimmer),
        ('jitter, shimmer, pauses', jitter_shimmer + pauses),
        ('mfccs + jitter, shimmer', mfcc + jitter_shimmer),
        ('mfccs + jitter shimmer + pauses', mfcc + jitter_shimmer + pauses),
        ("all bio", bio),
        ('bio + mfcc', mfcc + bio),
        ('all of em', mfcc + bio + jitter_shimmer + pauses),
    ]

    group_c = []
    group_p = []
    group_titles = []
    for title, feature_columns in feature_sets:
        group_titles.append(title)
        features = slope_data[feature_columns + demo].to_numpy(copy=True)
        corr, pval = get_combo_predictions(features, slope_data)
        group_c.append(corr)
        group_p.append(pval)

    return group_c, group_p, group_titles

In [4]:
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

def generate_graphs(c, p, titles, specifier):
    """Save one heat map of correlations and one of p-values as PNG files.

    Each heat map has one row per feature group and one column per target
    score, with the value (rounded to two decimals) printed in every cell.
    The files are written to ``DATA_PATH + 'slimmed/'`` as
    ``correlations_<specifier>.png`` and ``pvalues_<specifier>.png``.

    Tick marks are placed at the cell centres before the labels are set, so each
    label lines up with its own row or column. Previously the labels were
    attached to whatever tick positions happened to exist.

    Parameters
    ----------
    c, p : sequence of sequences of float
        Correlations and p-values, one inner sequence per entry of ``titles``
        and one value per target score.
    titles : sequence of str
        Row labels, one per feature group.
    specifier : str
        Text inserted into the two output file names.
    """
    short_y_col = ["CAI St(19)", "CAI F(18)" , "STAI T(17)"]
    correlations = np.round(np.atleast_2d(np.asarray(c, dtype=float)), decimals=2)
    pvalues = np.round(np.atleast_2d(np.asarray(p, dtype=float)), decimals=2)

    # (file name, values, colour-scale minimum, colour-scale maximum)
    panels = [
        ("correlations_" + specifier + ".png", correlations, -0.5, 0.8),
        ("pvalues_" + specifier + ".png", pvalues, 0.0, 1.0),
    ]

    for filename, values, vmin, vmax in panels:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20,85))

        mesh = ax.pcolor(values, vmin=vmin, vmax=vmax)
        fig.colorbar(mesh, ax=ax, fraction=0.05, pad=0.04)

        # pcolor draws cell (row, col) over [col, col+1] x [row, row+1], so the
        # centre of each cell is at +0.5.
        ax.set_xticks(np.arange(len(short_y_col)) + 0.5)
        ax.set_xticklabels(short_y_col, rotation=30)
        ax.set_yticks(np.arange(len(titles)) + 0.5)
        ax.set_yticklabels(titles)

        for col in range(len(short_y_col)):
            for row in range(len(titles)):
                ax.text(col + 0.5, row + 0.5, values[row, col], ha="center", va="center", color="w")

        fig.tight_layout()
        fig.savefig(DATA_PATH + 'slimmed/' + filename)
        plt.close(fig)  # release the figure; many are created in a loop

In [14]:
def generate_combos_with_demo(slope_data, demo_col):
    """Evaluate an extended list of feature groups combined with one demographic.

    Feature groups are picked by column position in ``slope_data``; except for
    the first group, ``demo_col`` is appended to every group before it is passed
    to get_combo_predictions. The titles name the feature group only.

    Fixes relative to the earlier version:
      * the first group referenced an undefined name (``slopes_data``) and
        passed a 1-D array, which get_combo_predictions cannot iterate row-wise;
      * the "mfccs + a+b" groups were fitted on the two extra columns only,
        although their title (and the column list that was built but never
        used) includes the MFCCs and the demographic column.

    NOTE(review): the first group ('voiceProb_sma_amean') is evaluated without
    ``demo_col``, as originally written - confirm whether that is intended.
    NOTE(review): position 23 belongs to the bio slice (23:30) and is also one
    of the single "extra" columns, so the 'bio +' groups list it twice when it
    is the extra column - confirm the intended bounds.

    Parameters
    ----------
    slope_data : pandas.DataFrame
        Per-participant slope table; must contain a 'voiceProb_sma_amean'
        column and at least 30 columns.
    demo_col : column label
        Demographic column appended to the feature groups.

    Returns
    -------
    (list, list, list)
        Per-group correlations, per-group p-values, and the group titles, all
        in the same order.
    """
    names = slope_data.columns
    demo = [demo_col]
    mfcc = names[2:14].to_list()
    pauses = names[17:20].to_list()  # #pause, pause_frequency, pause_interval
    jitter_shimmer = names[22:24].to_list()
    bio = names[23:30].to_list()

    extras = [names[i] for i in range(14, 25)]
    extra_pairs = [
        (extras[a], extras[b])
        for a in range(len(extras))
        for b in range(a + 1, len(extras))
    ]
    bio_extras = [names[i] for i in range(14, 24)]

    feature_sets = [
        ('voiceProb_sma_amean', ['voiceProb_sma_amean']),
        ('mfcc1-12', mfcc + demo),
        ('pauses', pauses + demo),
        ('mfccs + pauses', mfcc + pauses + demo),
        ('jitter, shimmer', jitter_shimmer + demo),
        ('mfccs+ jitter, shimmer', mfcc + jitter_shimmer + demo),
        ("all bio", bio + demo),
        ('bio + mfcc', mfcc + bio + demo),
    ]

    # All mfcc's combined with other attributes
    for col in extras:
        feature_sets.append(("mfccs + " + str(col), mfcc + [col] + demo))

    # Every pair of the other attributes, on their own ...
    for col1, col2 in extra_pairs:
        feature_sets.append((str(col1) + '+' + str(col2), [col1, col2] + demo))

    # ... and together with the mfcc's.
    for col1, col2 in extra_pairs:
        feature_sets.append(
            ("mfccs + " + str(col1) + '+' + str(col2), mfcc + [col1, col2] + demo)
        )

    for col in bio_extras:
        feature_sets.append(('bio +' + str(col), bio + [col] + demo))

    for col in bio_extras:
        feature_sets.append(('mfcc + bio +' + str(col), mfcc + bio + [col] + demo))

    group_c = []
    group_p = []
    group_titles = []
    for title, feature_columns in feature_sets:
        group_titles.append(title)
        features = slope_data[feature_columns].to_numpy(copy=True)
        corr, pval = get_combo_predictions(features, slope_data)
        group_c.append(corr)
        group_p.append(pval)

    return group_c, group_p, group_titles

In [8]:
# For every unordered pair of columns at positions 36-43 of all_slopes, evaluate the
# feature combinations and save the correlation / p-value graphs under a file name
# built from the two column names.
# NOTE(review): combos_with_demo is called here with two column names after the
# frame - confirm that its signature accepts both.
for i in range(36,44):
    for j in range(i+1, 44):
        c, p, titles = combos_with_demo(all_slopes, all_slopes.columns[i], all_slopes.columns[j])
        filename = "with-" + str(all_slopes.columns[i]) + '-' + str(all_slopes.columns[j])
        generate_graphs(c, p, titles, filename)

In [15]:
c1, p1, titles1 = combos_with_demo(all_slopes, "highest_education")
# NOTE(review): this repeats the "highest_education" run above - it was probably
# meant to use a different demographic column. Kept so that c2/p2/titles2 exist.
c2, p2, titles2 = combos_with_demo(all_slopes, "highest_education")
c3, p3, titles3 = combos_with_demo(all_slopes, "age")

# Plot each distinct result through generate_graphs. The plotting code that used to
# be pasted inline here read `specifier`, which is never defined at cell level, and
# `c`, `p`, `titles`, which only exist as leftovers from another cell - so it could
# not run and never used the results computed above.
generate_graphs(c1, p1, titles1, "with-highest_education")
generate_graphs(c3, p3, titles3, "with-age")