In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Base directory for the demographic files read in later cells (subject-id lists, gender CSVs).
# The path is blank in this copy of the notebook; fill it in before running.
demographics = os.path.join('') #directory with all demographic information

In [None]:
# NOTE(review): no index_col is passed here, yet later cells select rows with
# df.loc[<subject ids>] — confirm that the subject-id column ends up as the index.
# The file path is blank in this copy of the notebook; fill it in before running.
df = pd.read_csv('') # file containing demographic information such as gender, diagnoses, medication etc.

In [None]:
# Subject ids of the two groups, read as strings from text files in `demographics`
control, sz = (
    np.loadtxt(os.path.join(demographics, ids_file), dtype=str)
    for ids_file in ('control_subs.txt', 'patient_subs.txt')
)

#### Get average duration of the interviews

In [None]:
import scipy.io.wavfile as wav
import glob

def _duration_seconds(wav_path):
    """Return the length of one WAV file in seconds (number of samples / sample rate)."""
    rate, samples = wav.read(wav_path)
    return len(samples) / float(rate)


# Every .wav in the folder, in sorted order, skipping any path that contains 'ch1'
wav_paths = sorted(glob.glob('/audio/split_channels/' + '*.wav'))
durs = [_duration_seconds(path) for path in wav_paths if 'ch1' not in path]

In [None]:
# Summary of the interview durations, converted from seconds to minutes
shortest_min = min(durs) / 60
longest_min = max(durs) / 60
mean_min = np.array(durs).mean() / 60
sd_min = np.array(durs).std() / 60

print(' min dur: ', shortest_min,
      '\n max dur: ', longest_min,
      '\n mean dur: ', mean_min,
      '\n sd dur: ', sd_min)

In [None]:
import scipy.stats as stats

#### Compare and print Age

In [None]:
# Welch's t-test on age (equal_var=False); NaN ages are left out via nan_policy='omit'
age_control = df.loc[control, 'Age']
age_sz = df.loc[sz, 'Age']

t_age, p_age = stats.ttest_ind(
    age_control,
    age_sz,
    equal_var=False,
    nan_policy='omit',
)
print(t_age, p_age)

#### Compare and print Gender

In [None]:
# Per-group gender tables, indexed by the 'Praatnummer' column
controls_gender_csv = f'{demographics}/genders_controls.csv'
patients_gender_csv = f'{demographics}/genders_sz.csv'

gender_cntrl = pd.read_csv(controls_gender_csv, index_col=['Praatnummer'])
gender_sz = pd.read_csv(patients_gender_csv, index_col=['Praatnummer'])

In [None]:
from collections import Counter

In [None]:
def _count_desc(labels):
    """Return (label, count) pairs, most frequent first; ties are ordered by label."""
    tally = Counter(labels)
    return sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))


# Tally the 'Geslacht' column of each group
cntrl_fm = _count_desc(gender_cntrl['Geslacht'].values)
sz_fm = _count_desc(gender_sz['Geslacht'].values)

In [None]:
# Show the sorted (label, count) pairs: controls first, then patients
print(cntrl_fm, sz_fm)

In [None]:
# Gender split per group as (label, percent) pairs.
# NOTE(review): these are hard-coded rather than derived from cntrl_fm / sz_fm —
# presumably rounded from the counts printed above; re-check if the subject lists change.
gender_control_prct = [('m', 66), ('f', 34)]
gender_sz_prct = [('m', 61), ('f', 39)]

In [None]:
# Contingency matrix for the chi-square test.
# NOTE(review): the counts are hard-coded; presumably rows are gender (m, f) and columns
# are group (controls, patients), taken from the counts printed above — confirm.
contingency = [[60, 49],
               [31,  31]]

In [None]:
# Chi-square test of independence on the gender table; correction=False turns the
# continuity correction off
chi2_result = stats.chi2_contingency(contingency, correction=False)
chi2, chi2_p, dof, expected = chi2_result
print(chi2, chi2_p, dof, expected)

#### Compare and print Years of Education (parental and personal)

In [None]:
# Parental years of education per group; the value 999.0 is mapped to NaN so that it
# is excluded from the statistics.
yoe_parents_cntrl = df.loc[control]['YOE_mean_ouders'].replace(999.0, np.nan)
yoe_parents_sz = df.loc[sz]['YOE_mean_ouders'].replace(999.0, np.nan)

# mannwhitneyu does not skip NaNs by default, so the missing values created above are
# dropped explicitly before the test. The original series keep their NaNs, which
# .mean() ignores on its own.
U, p_yoe_parents = stats.mannwhitneyu(yoe_parents_cntrl.dropna(), yoe_parents_sz.dropna())
print(U, p_yoe_parents)

In [None]:
# 'YOE_handmatig' years of education per group, with the value 999.0 mapped to NaN
yoe_column = 'YOE_handmatig'
YOE_sz = df.loc[sz, yoe_column].replace(999.0, np.nan)
YOE_cntrl = df.loc[control, yoe_column].replace(999.0, np.nan)

In [None]:
# mannwhitneyu does not skip NaNs by default, and both series contain NaN wherever the
# 999.0 code was replaced, so the missing values are dropped before the test.
U, p_yoe_self = stats.mannwhitneyu(YOE_cntrl.dropna(), YOE_sz.dropna())
print(U, p_yoe_self)

#### Convert results into a LaTeX table

In [None]:
from pylatex import (
        Document,
        Section,
        Tabular,
        Table,
        NoEscape,
        Package,
        Command,
        MultiRow,
        MultiColumn,
    )

In [None]:
from pylatex.utils import italic, bold

In [None]:
doc = Document(geometry_options=["margin=3.0cm"], page_numbers=False)

# Table packages, plus siunitx for number formatting
for package_name in ("multirow", "multicol", "siunitx"):
    doc.packages.append(Package(package_name))

# Document-wide siunitx defaults: round to 2 significant figures, no scientific notation
sisetup_options = ", ".join(
    [
        "round-mode = figures",
        "round-precision = 2",
        "scientific-notation = false",
        "separate-uncertainty = true",
    ]
)
doc.preamble.append(Command("sisetup", NoEscape(sisetup_options)))

In [None]:
def formatNumbers(value, sci_notation = 'false'):
    r"""Wrap ``value`` in a siunitx ``\num`` command.

    Parameters
    ----------
    value
        Number (or any object with a usable ``str()``) placed inside ``\num{...}``.
    sci_notation : str or bool, optional
        Value for the ``scientific-notation`` option. Either the literal option
        text (``'false'``, ``'true'``, ...) or a Python bool, which is translated
        to ``'true'`` / ``'false'``.

    Returns
    -------
    str
        LaTeX source such as ``\num[scientific-notation=false]{0.05}``. This is a
        plain ``str``; wrap it in ``NoEscape`` before handing it to pylatex.
    """
    # A Python bool cannot be concatenated into the option string, so map it to the
    # lowercase keyword first.
    if isinstance(sci_notation, bool):
        sci_notation = 'true' if sci_notation else 'false'

    return r'\num[scientific-notation=' + sci_notation + ']{' + str(value) + '}'

In [None]:
def _num(value):
    """Format ``value`` with formatNumbers and mark the result as raw LaTeX."""
    return NoEscape(formatNumbers(value))


# The percentages are stored as [('m', pct), ('f', pct)]. The row below is labelled
# "% Female", so look the female share up by label rather than by position
# (position 0 is the 'm' entry).
pct_female_control = dict(gender_control_prct)['f']
pct_female_sz = dict(gender_sz_prct)['f']

table_holder = Table(position="tbp")
# Create the tabular env
table = Tabular("lcccccc")
table.add_row(
    '', MultiColumn(6, align='c', data='')
)
table.add_hline()

table.add_row('', 'Controls', '', 'Patients', '', 'Statistic', 'p')

table.add_hline(4)

table.add_row('', f"(n = {len(control)})", '',
                  f"(n = {len(sz)})", '', '', '')

table.add_hline()

# Concatenating NoEscape with a plain str yields a plain str, which pylatex escapes
# when the row is rendered. Each statistic cell is therefore assembled as one string
# and wrapped in NoEscape once. Raw strings keep the backslash in \chi intact.
table.add_row('Gender (% Female)',
              pct_female_control, '',
              pct_female_sz, '',
              NoEscape(r'$\chi^2$(' + formatNumbers(chi2) + ')'),
              _num(chi2_p))

table.add_row('Age (mean)',
              _num(df.loc[control]['Age'].mean()), '',
              _num(df.loc[sz]['Age'].mean()), '',
              NoEscape('T (' + formatNumbers(t_age) + ')'),
              _num(p_age))

table.add_row('Years of Education - parental (mean)',
              _num(yoe_parents_cntrl.mean()), '',
              _num(yoe_parents_sz.mean()), '',
              'MWU',
              _num(p_yoe_parents))

# Use the cleaned series for both groups so the 999.0 code does not inflate the
# control mean.
table.add_row('Years of Education (mean)',
              _num(YOE_cntrl.mean()), '',
              _num(YOE_sz.mean()), '',
              'MWU',
              _num(p_yoe_self))

table.add_hline()

table_holder.append(table)
table_holder.append(NoEscape(r"\flushright"))  # \flushright right-aligns what follows; it does not center

table_holder.add_caption(NoEscape(' '))
doc.append(table_holder)

# Output path is blank in this copy of the notebook; clean_tex=False keeps the .tex source
doc.generate_pdf('', clean_tex=False)