In [None]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.lines import Line2D

In [None]:
#working directory of the notebook and the filesystem root
wd = os.getcwd()
os_sep = os.path.abspath(os.sep)

#the feature dfs of the interviews
dfs = os.path.join(os_sep, wd, 'opensmile', 'egemaps_summary_turns_zero_filtered')

#where to find the txt files with the group information of each participant
dem_dir = os.path.join(os_sep, 'dem_dir')

In [None]:
#column names of the speech features that are analysed in this notebook
features = [
    'MeanUnvoicedSegmentLength',
    'VoicedSegmentsPerSec',
    'F0semitoneFrom27.5Hz_sma3nz_amean',
    'loudness_sma3_amean',
    'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
]

#short aliases for the individual columns, in the same order as the list above
pauses, syll_rate, pitch, loudness, pitch_var = features

#### Split each interview into conversation halves

Split each of the loaded dataframes into a first and a second conversation half. This is done separately for each speaker, since we correlate the first and the second part individually.

In [None]:
#per-interview dataframes of each conversation half, separately for both channels
ch1_first_half = []
ch1_second_half = []

ch2_first_half = []
ch2_second_half = []

for file in sorted(glob.glob(os.path.join(dfs, '*.csv'))):

    df = pd.read_csv(file, sep = ';', index_col= [0])

    #split by row position instead of by index label, so that repeated index
    #labels cannot duplicate rows; for an odd number of rows the first half
    #gets the extra row (np.array_split behaviour)
    first_half, second_half = np.array_split(np.arange(len(df)), 2)

    #only look at the file name: a parent folder containing 'ch1' must not
    #send every file to channel 1
    if 'ch1' in os.path.basename(file):
        ch1_first_half.append(df.iloc[first_half])
        ch1_second_half.append(df.iloc[second_half])

    else:
        ch2_first_half.append(df.iloc[first_half])
        ch2_second_half.append(df.iloc[second_half])

In [None]:
def calculateSynchronyFromDF(ch1_dfs, ch2_dfs, features):
    """Correlate the turn-wise feature values of the two speakers of each interview.

    Parameters
    ----------
    ch1_dfs, ch2_dfs : sequences of pandas.DataFrame
        One dataframe per interview and channel, paired by position. Each
        ch1 dataframe needs a 'sub_id' column; both need one column per
        entry in ``features``.
    features : list of str
        Column names to compute the synchrony for.

    Returns
    -------
    dict
        Maps each feature to a list with one dict per interview, holding
        'soundname' (the first unique 'sub_id' of the ch1 dataframe), the
        Spearman 'r', its 'p' value and the Fisher z-transformed 'r_z'.
    """
    import numpy as np
    import scipy.stats as stats

    #one result list per requested feature (instead of a hard-coded set of names)
    feature_rows = {feature: [] for feature in features}

    for ch1, ch2 in zip(ch1_dfs, ch2_dfs):

        sub_id = ch1['sub_id'].unique()[0]

        for feature in features:

            speaker_1 = ch1[feature].to_numpy()
            speaker_2 = ch2[feature].to_numpy()

            #the number of turns can be unequal; cut the surplus turns at the end
            #of the longer array, however many there are
            n_turns = min(len(speaker_1), len(speaker_2))
            speaker_1 = speaker_1[:n_turns]
            speaker_2 = speaker_2[:n_turns]

            #keep only the turns where both speakers have a value
            valid = ~(np.isnan(speaker_1) | np.isnan(speaker_2))
            x = speaker_1[valid]
            y = speaker_2[valid]

            #calculate synchrony using spearman r
            r, p = stats.spearmanr(x, y)

            #Fisher z-transformation; note that r = +-1 maps to +-inf
            r_z = np.arctanh(r)

            feature_rows[feature].append({'soundname': sub_id,
                                          'r': r,
                                          'p': p,
                                          'r_z': r_z})

    return feature_rows

#### Calculate speech accommodation for the first and the second halves of the interview

In [None]:
#synchrony per interview and feature, separately for each conversation half
feature_rows_first_half = calculateSynchronyFromDF(ch1_dfs=ch1_first_half,
                                                   ch2_dfs=ch2_first_half,
                                                   features=features)
feature_rows_second_half = calculateSynchronyFromDF(ch1_dfs=ch1_second_half,
                                                    ch2_dfs=ch2_second_half,
                                                    features=features)

In [None]:
#turn the row lists of every feature into one dataframe per feature
summary_dfs_first_half = {name: pd.DataFrame(records)
                          for name, records in feature_rows_first_half.items()}

summary_dfs_second_half = {name: pd.DataFrame(records)
                           for name, records in feature_rows_second_half.items()}

#### Load the group splits and compare halves of healthy controls and SZ patients separately

In [None]:
#subject ids of both groups, read as strings from the txt files in dem_dir
controls_file = os.path.join(dem_dir, 'control_subs.txt')
patients_file = os.path.join(dem_dir, 'patient_subs.txt')

controls = np.loadtxt(controls_file, dtype=str)
patients = np.loadtxt(patients_file, dtype=str)

In [None]:
def getGroupIndices(df, group):
    """Return the 'soundname' values of ``df`` that belong to ``group``.

    A value belongs to the group when its first four characters are
    contained in ``group``. The order of ``df['soundname']`` is kept.
    """
    group_indices = []

    for name in df['soundname']:
        if name[:4] in group:
            group_indices.append(name)

    return group_indices

In [None]:
def pairedTestPerFeature(features, dfs_condition1, dfs_condition2, group):
    """Run a paired t-test between two conditions for every feature.

    Parameters
    ----------
    features : list of str
        Keys to look up in both condition dicts.
    dfs_condition1, dfs_condition2 : dict of pandas.DataFrame
        One dataframe per feature with the columns 'soundname' and 'r_z'.
    group : collection of str
        Group ids, passed on to getGroupIndices to select the subjects.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns 'T' and 'p'.
    """
    import scipy.stats as stats

    rows = {}

    for feature in features:

        cond1 = dfs_condition1[feature]
        cond2 = dfs_condition2[feature]

        idxs_group = getGroupIndices(cond1, group) #the matching group subjects in the dataframe

        in_group = cond1.loc[cond1['soundname'].isin(idxs_group), ['soundname', 'r_z']]

        #pair the two conditions by subject id instead of by row position, so that
        #a different row order or a subject missing from one condition cannot
        #mismatch the pairs; assumes one row per 'soundname' in each dataframe
        paired = in_group.merge(cond2[['soundname', 'r_z']],
                                on='soundname',
                                how='inner',
                                suffixes=('_cond1', '_cond2'))

        #paired ttest on the converted r values
        t, p = stats.ttest_rel(paired['r_z_cond1'], paired['r_z_cond2'])

        rows[feature] = {'T': t, 'p': p}

    df = pd.DataFrame(rows)

    return df.T

In [None]:
#first vs. second conversation half, separately for each group
t_df_controls = pairedTestPerFeature(features=features,
                                     dfs_condition1=summary_dfs_first_half,
                                     dfs_condition2=summary_dfs_second_half,
                                     group=controls)
t_df_patients = pairedTestPerFeature(features=features,
                                     dfs_condition1=summary_dfs_first_half,
                                     dfs_condition2=summary_dfs_second_half,
                                     group=patients)

#### Print results

In [None]:
#paired t-test (T, p) per feature, first vs. second half, control group
t_df_controls

In [None]:
#paired t-test (T, p) per feature, first vs. second half, patient group
t_df_patients

#### Repeat same process with conversation thirds instead of halves

In [None]:
#per-interview dataframes of each conversation third, separately for both channels
ch1_first = []
ch1_second = []
ch1_third = []

ch2_first = []
ch2_second = []
ch2_third = []


for file in sorted(glob.glob(os.path.join(dfs, '*.csv'))):

    df = pd.read_csv(file, sep = ';', index_col= [0])

    #split by row position instead of by index label, so that repeated index
    #labels cannot duplicate rows; surplus rows go to the earlier parts
    #(np.array_split behaviour)
    first, second, third = np.array_split(np.arange(len(df)), 3)

    #only look at the file name: a parent folder containing 'ch1' must not
    #send every file to channel 1
    if 'ch1' in os.path.basename(file):
        ch1_first.append(df.iloc[first])
        ch1_second.append(df.iloc[second])
        ch1_third.append(df.iloc[third])

    else:
        ch2_first.append(df.iloc[first])
        ch2_second.append(df.iloc[second])
        ch2_third.append(df.iloc[third])

#### Calculate speech accommodation for each third of the interview

In [None]:
#synchrony per interview and feature, separately for each conversation third
feature_rows_first = calculateSynchronyFromDF(ch1_dfs=ch1_first,
                                              ch2_dfs=ch2_first,
                                              features=features)
feature_rows_second = calculateSynchronyFromDF(ch1_dfs=ch1_second,
                                               ch2_dfs=ch2_second,
                                               features=features)
feature_rows_third = calculateSynchronyFromDF(ch1_dfs=ch1_third,
                                              ch2_dfs=ch2_third,
                                              features=features)

In [None]:
def makeDFsFromDict(feature_dict):
    """Build one dataframe per key from a dict of row lists.

    Each value of ``feature_dict`` is handed to ``pd.DataFrame``; the
    result is a new dict with the same keys in the same order.
    """
    import pandas as pd

    return {name: pd.DataFrame(records) for name, records in feature_dict.items()}

In [None]:
#one dataframe per feature for each conversation third
summary_dfs_first = makeDFsFromDict(feature_dict=feature_rows_first)
summary_dfs_second = makeDFsFromDict(feature_dict=feature_rows_second)
summary_dfs_third = makeDFsFromDict(feature_dict=feature_rows_third)

#### Perform t-tests between the first and second and the second and third conversation part

In [None]:
#control group: compare neighbouring conversation thirds
first_vs_second_cntrl = pairedTestPerFeature(features=features,
                                             dfs_condition1=summary_dfs_first,
                                             dfs_condition2=summary_dfs_second,
                                             group=controls)
second_vs_third_cntrl = pairedTestPerFeature(features=features,
                                             dfs_condition1=summary_dfs_second,
                                             dfs_condition2=summary_dfs_third,
                                             group=controls)

#patient group: compare neighbouring conversation thirds
first_vs_second_sz = pairedTestPerFeature(features=features,
                                          dfs_condition1=summary_dfs_first,
                                          dfs_condition2=summary_dfs_second,
                                          group=patients)
second_vs_third_sz = pairedTestPerFeature(features=features,
                                          dfs_condition1=summary_dfs_second,
                                          dfs_condition2=summary_dfs_third,
                                          group=patients)

#### Print results

In [None]:
#paired t-test (T, p) per feature, first vs. second third, control group
first_vs_second_cntrl

In [None]:
#paired t-test (T, p) per feature, second vs. third third, control group
second_vs_third_cntrl

In [None]:
#paired t-test (T, p) per feature, first vs. second third, patient group
first_vs_second_sz

In [None]:
#paired t-test (T, p) per feature, second vs. third third, patient group
second_vs_third_sz

#### Plot the results

To make use of seaborn's high-level integration with dataframes, we reshape the data a bit and take the mean of each conversation part, so that all values from all channels, features and time points end up in one dataframe. For that we merge the individual dfs of each interview and add a few columns with extra information, for conversation halves and thirds respectively.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
ch1_first_mean = []
ch2_first_mean = []

ch1_second_mean = []
ch2_second_mean = []

#for all interviews, take the mean of every numeric column in each half
for parts_ch1, parts_ch2, means_ch1, means_ch2 in (
        (ch1_first_half, ch2_first_half, ch1_first_mean, ch2_first_mean),
        (ch1_second_half, ch2_second_half, ch1_second_mean, ch2_second_mean)):

    for ch1_part, ch2_part in zip(parts_ch1, parts_ch2):

        #the subject id of the interview is taken from channel 1 and stored under
        #the label 0 (the default index of a one-element Series)
        sub_id = pd.Series(data = ch1_part['sub_id'].unique()[0])

        #Series.append was removed in pandas 2.0, pd.concat replaces it;
        #numeric_only=True keeps string columns such as 'sub_id' out of the mean
        means_ch1.append(pd.concat([ch1_part.mean(numeric_only=True), sub_id]))
        means_ch2.append(pd.concat([ch2_part.mean(numeric_only=True), sub_id]))

In [None]:
#merge all interviews in one dataframe
#merge all interviews in one dataframe per channel and conversation half
ch1_first_df, ch2_first_df = pd.DataFrame(data=ch1_first_mean), pd.DataFrame(data=ch2_first_mean)
ch1_second_df, ch2_second_df = pd.DataFrame(data=ch1_second_mean), pd.DataFrame(data=ch2_second_mean)

In [None]:
#merge the first and second half, add information which part each conversation belongs to
#add the information which half each row belongs to (column is added in place)
for half_label, half_frames in (('1/2', (ch1_first_df, ch2_first_df)),
                                ('2/2', (ch1_second_df, ch2_second_df))):
    for half_frame in half_frames:
        half_frame['time'] = half_label

#merge the first and second half of each channel
ch1 = pd.concat([ch1_first_df, ch1_second_df])
ch2 = pd.concat([ch2_first_df, ch2_second_df])

In [None]:
#label the speaker of each channel, then stack both channels
for speaker_frame, speaker_label in ((ch1, 'Interviewer'), (ch2, 'Participant')):
    speaker_frame['channel'] = speaker_label

conversation_halves = pd.concat([ch1, ch2])

In [None]:
#per-interview means with the 'time' (half) and 'channel' (speaker) columns
conversation_halves

In [None]:
ch1_first_mean = []
ch1_second_mean = []
ch1_third_mean = []

ch2_first_mean = []
ch2_second_mean = []
ch2_third_mean = []

#for each interview, take the mean of every numeric column in the first, second and third part
for parts_ch1, parts_ch2, means_ch1, means_ch2 in (
        (ch1_first, ch2_first, ch1_first_mean, ch2_first_mean),
        (ch1_second, ch2_second, ch1_second_mean, ch2_second_mean),
        (ch1_third, ch2_third, ch1_third_mean, ch2_third_mean)):

    for ch1_part, ch2_part in zip(parts_ch1, parts_ch2):

        #the subject id of the interview is taken from channel 1 and stored under
        #the label 0 (the default index of a one-element Series)
        sub_id = pd.Series(data = ch1_part['sub_id'].unique()[0])

        #Series.append was removed in pandas 2.0, pd.concat replaces it;
        #numeric_only=True keeps string columns such as 'sub_id' out of the mean
        means_ch1.append(pd.concat([ch1_part.mean(numeric_only=True), sub_id]))
        means_ch2.append(pd.concat([ch2_part.mean(numeric_only=True), sub_id]))

In [None]:
#merge all interviews into a dataframe
#merge all interviews into one dataframe per channel and conversation third
ch1_first_df, ch2_first_df = pd.DataFrame(data=ch1_first_mean), pd.DataFrame(data=ch2_first_mean)
ch1_second_df, ch2_second_df = pd.DataFrame(data=ch1_second_mean), pd.DataFrame(data=ch2_second_mean)
ch1_third_df, ch2_third_df = pd.DataFrame(data=ch1_third_mean), pd.DataFrame(data=ch2_third_mean)

In [None]:
#concatenate all parts, add information which third the row belongs to
#add the information which third each row belongs to (column is added in place)
for third_label, third_frames in (('1/3', (ch1_first_df, ch2_first_df)),
                                  ('2/3', (ch1_second_df, ch2_second_df)),
                                  ('3/3', (ch1_third_df, ch2_third_df))):
    for third_frame in third_frames:
        third_frame['time'] = third_label

#concatenate all parts of each channel
ch1 = pd.concat([ch1_first_df, ch1_second_df, ch1_third_df])
ch2 = pd.concat([ch2_first_df, ch2_second_df, ch2_third_df])

In [None]:
#label the speaker of each channel, then stack both channels
for speaker_frame, speaker_label in ((ch1, 'Interviewer'), (ch2, 'Participant')):
    speaker_frame['channel'] = speaker_label

conversation_thirds = pd.concat([ch1, ch2])

In [None]:
#per-interview means of the conversation halves
conversation_halves

In [None]:
#per-interview means of the conversation thirds
conversation_thirds

In [None]:
#filter the dataframes for controls and patients

#filter the dataframes for controls and patients; column 0 holds the subject id
#NOTE(review): this matches the full id, while getGroupIndices matches on the
#first four characters only - confirm that both select the same subjects
halves_is_control = conversation_halves[0].isin(controls)
halves_is_patient = conversation_halves[0].isin(patients)

conversation_halves_controls = conversation_halves[halves_is_control]
conversation_halves_patients = conversation_halves[halves_is_patient]

thirds_is_control = conversation_thirds[0].isin(controls)
thirds_is_patient = conversation_thirds[0].isin(patients)

conversation_thirds_controls = conversation_thirds[thirds_is_control]
conversation_thirds_patients = conversation_thirds[thirds_is_patient]

#### Make a plot that shows the mean values for all speech features across the different time splits 

In [None]:
#one subplot row per feature: (dataframe column, y axis label, y axis limits)
#the limits are defined manually so the y axes for halves and thirds are the same;
#keeping column, label and limits together avoids relying on a sort order to match them
plot_rows = [
    (pitch, 'Pitch', (20, 35)),
    (pitch_var, 'Pitch Variability', (0.1, 0.2)),
    (pauses, 'Average Pause Duration', (0, 0.6)),
    (syll_rate, 'Syllable Rate', (1.5, 6)),
    (loudness, 'Loudness', (0.2, 0.8)),
]

#one subplot column per time split: (control data, patient data, x axis label)
plot_columns = [
    (conversation_halves_controls, conversation_halves_patients, 'Conversation Halves'),
    (conversation_thirds_controls, conversation_thirds_patients, 'Conversation Thirds'),
]

#squeeze=False keeps axs two-dimensional even for a single row
fig, axs = plt.subplots(nrows=len(plot_rows), ncols=len(plot_columns),
                        figsize = (10, 20), squeeze=False)

#custom legend showing speaker and group attribution
legend_elements = [
                   Line2D([0], [0], marker='o', label='Interviewer', markerfacecolor='lightgrey', markersize=10, color = 'lightgrey'),
                   Line2D([0], [0], marker='x', label='Participant', markerfacecolor='grey', markersize=10, color = 'dimgrey'),
                   Line2D([0], [0], label='Control Group', linestyle = '--'),
                   Line2D([0], [0], label='Patient Group', color = 'red'),
                   ]

fig.suptitle('Average Speech Features Across the Interviews', fontsize = 15, y=1.0, x =0.45)

for row_axes, (feature, y_label, y_lim) in zip(axs, plot_rows):

    for col, (ax, (data_controls, data_patients, x_label)) in enumerate(zip(row_axes, plot_columns)):

        #height, aspect and kind are catplot arguments and are not passed here:
        #pointplot draws on the given ax and newer seaborn releases reject them

        #plot interviewer + control participant
        sns.pointplot(x="time", y=feature, hue="channel",
                      capsize=.2, data=data_controls, palette = "Blues", ax = ax,
                      markers=["o", "x"], linestyles=["--", "--"])

        #plot interviewer + patient
        sns.pointplot(x="time", y=feature, hue="channel",
                      capsize=.2, data=data_patients, palette = 'Reds', ax = ax,
                      markers = ['o', 'x'])

        if col == 0:
            #one legend per row: drop the automatic one on the left column
            auto_legend = ax.get_legend()
            if auto_legend is not None:
                auto_legend.remove()
            ax.set_ylabel(y_label, fontsize = 14)

        else:
            #add custom legend
            ax.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5), fontsize = 13)
            ax.set_ylabel('')

        ax.set_xlabel(x_label, fontsize = 14)
        ax.set_ylim(y_lim)


plt.tight_layout()