In [None]:
import numpy as np
import sys
import os
import pandas as pd
sys.path.append('../../')

from script.Model_run import Speech_analysis

In [None]:
def monkey_patch_openpyxl():
    '''Replace openpyxl's WorksheetReader.bind_cells with a tolerant version.

    Openpyxl has a bug with workbooks that have wrong cell styling information:
    looking up a cell's style by its 'style_id' can fail. The replacement
    installed here is the same loop with one difference: when the style lookup
    fails, the cell is created with style_array=None instead of aborting the read.

    Side effect: rebinds openpyxl.worksheet._reader.WorksheetReader.bind_cells
    for the whole process. Call once, before any workbook is read.
    '''
    from openpyxl.worksheet import _reader
    from openpyxl.cell import Cell

    def bind_cells(self):
        for idx, row in self.parser.parse():
            for cell in row:
                try:
                    style = self.ws.parent._cell_styles[cell['style_id']]
                except Exception:
                    # This is the patch, the original has no try/except here.
                    # Only ordinary errors are absorbed; a bare except would
                    # also swallow KeyboardInterrupt/SystemExit and make a
                    # long workbook load impossible to interrupt.
                    style = None
                c = Cell(self.ws, row=cell['row'], column=cell['column'], style_array=style)
                c._value = cell['value']
                c.data_type = cell['data_type']
                self.ws._cells[(cell['row'], cell['column'])] = c
        self.ws.formula_attributes = self.parser.array_formulae
        if self.ws._cells:
            self.ws._current_row = self.ws.max_row  # use cells not row dimensions

    _reader.WorksheetReader.bind_cells = bind_cells

In [None]:
# Install the patched bind_cells before any workbook is read with engine='openpyxl'.
monkey_patch_openpyxl()

In [None]:
# Run configuration. The defaults are the original author's local paths; set the
# environment variables below to run the notebook on another machine without
# editing this cell.

#Directory where the data are stored (override: BIOGEN_SCA_DATA_DIR).
#os.path.join(..., '') guarantees the trailing separator that the string
#concatenations further down rely on.
data_home_dir = os.path.join(
    os.environ.get('BIOGEN_SCA_DATA_DIR',
                   '/home/kvattis/Dropbox (Partners HealthCare)/Data_Raw_Biogen_SCA/'),
    '')
#Most recent Patient_info file (looked up inside '<data_home_dir>Subjects/')
Patient_info_recent = 'Subjects_BiogenSCA_with_questionnaires_2022_07_20_converted.xlsx'
#Location and name of the output file (override: BIOGEN_SCA_OUTPUT_FILE)
outputfile_address = os.environ.get('BIOGEN_SCA_OUTPUT_FILE',
                                    '/home/kvattis/Desktop/BiogenSca_outputs_2022_07_22.csv')

In [None]:
#Load the models
# One Speech_analysis instance (imported from script.Model_run) is created with
# print_on=False and reused for every recording processed below.
speech_analyzer = Speech_analysis(print_on = False)

# Biogen SCA study

This notebook runs the speech classification and severity estimation models on the Biogen SCA study cohort and saves the per-subject, per-session results to a CSV file.

In [None]:
# Run the speech models over every recording of the selected speech tasks.
#
# Layout of this cell:
#   1. read the survey exports and the subject tables once (none of this depends
#      on the task, so it is done before the loop rather than once per task);
#   2. merge them into `df`, one row per recorded sample;
#   3. for each selected task, locate each sample's .wav file, run it through
#      `speech_analyzer`, and collect one record per sample;
#   4. build `output` from the records and take the per-session median.
#
# Names that later cells rely on are kept at top level: `study_dir`,
# `Patient_info`, `Patient_info_`, `survey_response_A`, `output`.

# Tasks that are run through the models; every other survey question is skipped.
SPEECH_TASKS = ["la_la_la", "go_go_go", "me_me_me", "lalala", "gogogo", "mememe", "papapa"]
# Survey variants. Exports are named 'survey<variant>-...' and recordings are
# looked up in '<subject>_<date>_<variant>/' folders, in this order.
SURVEY_VARIANTS = ['A', 'B', 'C', 'D']
OUTPUT_COLUMNS = ['Date', 'subject_id', 'Neuropheno_ID', 'Bars', 'Bars_Speech', 'Sex', 'Diagnosis', 'DoB',
                  'Samples_no', 'Prob_AT_tf_grad', 'Bars_Speech_pred_t', 'Bars_Total_pred_t']
TASK_COLUMNS = ['sampleId', 'subject_id', 'neuropheno_id', 'diagnosis', 'createdDate',
                'bars_total_excl_miss_1', 'bars_total_excl_miss_2', 'bars_speech_1', 'bars_speech_2', 'sex']
# Diagnosis code -> label: 0 is Control, 1/2/3 are grouped as Ataxia, 7 is MSA;
# anything else falls back to OTHER_LABEL.
DIAGNOSIS_TO_LABEL = {0: 0, 1: 1, 2: 1, 3: 1, 7: 7}
OTHER_LABEL = 5

pd.set_option('display.max_columns', 4)

# ---- 1. Load the session file names and patient info (once) -----------------
study_dir = data_home_dir + 'Biogen_SCA_Speech_Data/'
survey_lex = pd.read_excel(study_dir + 'surveylex_questionIDs.xlsx', engine='openpyxl')
taskslist = survey_lex['question'].unique()

survey_response_frames = {
    variant: pd.read_csv(study_dir + 'survey' + variant + '-responses-summary.csv', encoding='latin1')
    for variant in SURVEY_VARIANTS
}
survey_response_A = survey_response_frames['A']  # kept as a name: a later cell displays it
survey_response = pd.concat(list(survey_response_frames.values()), ignore_index=True)

survey_submissions = pd.concat(
    [pd.read_csv(study_dir + 'survey' + variant + '-submissions-summary.csv', encoding='latin1')
     for variant in SURVEY_VARIANTS],
    ignore_index=True)
# Non-numeric participant IDs become NaN and are dropped by the range filter below.
survey_submissions['Your participant ID:'] = pd.to_numeric(
    survey_submissions['Your participant ID:'], errors='coerce', downcast='integer')
survey_submissions = survey_submissions[(survey_submissions['Your participant ID:'] > 30000)
                                        & (survey_submissions['Your participant ID:'] < 40000)]

_Patient_info = pd.read_csv(data_home_dir + 'Subjects/Subjects_BiogenSCA_version2.csv', encoding='latin1')
_Patient_info_ = _Patient_info.loc[:, ['subject_id', 'neuropheno_id', 'diagnosis', 'bars_total_excl_miss_1',
                                       'bars_total_excl_miss_2', 'bars_speech_1', 'bars_speech_2', 'sex']]

Patient_info = pd.read_excel(data_home_dir + 'Subjects/' + Patient_info_recent, engine='openpyxl')
# .copy() so the column assignment below modifies an independent frame, not a view.
Patient_info_ = Patient_info.loc[:, ['subject_id', 'neuropheno_id', 'diagnosis', 'bars_total_excl_miss_1',
                                     'bars_total_excl_miss_2', 'bars_speech_1', 'bars_speech_2', 'sex',
                                     'dob']].copy()

#Check that the neuropheno_ids are included: take the ID from the version2
#subject file where it has one, otherwise keep the value from the recent file.
#The lookup is keyed on subject_id; assigning a Series through .loc would align
#on the row index instead and write NaN wherever the two files order rows differently.
legacy_neuropheno_ids = (
    _Patient_info_.dropna(subset=['subject_id', 'neuropheno_id'])
    .drop_duplicates(subset='subject_id', keep='last')
    .set_index('subject_id')['neuropheno_id']
)
Patient_info_['neuropheno_id'] = (
    Patient_info_['subject_id'].map(legacy_neuropheno_ids)
    .combine_first(Patient_info_['neuropheno_id'])
)

# ---- 2. One row per recorded sample ------------------------------------------
survey_lex_ = survey_lex.loc[:, ['questionId', 'question']]
survey_response_ = survey_response.loc[:, ['sampleId', 'questionId', 'sessionId', 'createdDate']].copy()
# Reformat the first ten characters of createdDate to 'YYYY_MM_DD', as used in the folder names.
survey_response_['createdDate'] = survey_response_['createdDate'].apply(
    lambda x: x[0:4] + '_' + x[5:7] + '_' + x[8:10])
survey_submissions_ = (survey_submissions.loc[:, ['sessionId', 'Your participant ID:']]
                       .rename(columns={"Your participant ID:": "subject_id"}))
df = (survey_response_
      .merge(survey_lex_, on='questionId')
      .merge(survey_submissions_, on='sessionId')
      .merge(Patient_info_, on='subject_id')
      .dropna(subset=["sampleId"]))

# ---- 3. Run every recording of every selected task through the models --------
# Records are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append copied the whole frame on every call and no longer exists in pandas 2.
records = []
for tt in taskslist:
    if tt not in SPEECH_TASKS:
        continue
    print(tt)
    task = str(tt)
    df_task = df.loc[df['question'] == task, TASK_COLUMNS]

    #Going over all the files for the task
    for (file_name, subject, neuropheno, diagn, date,
         bars_total_1, bars_total_2, bars_speech_1, bars_speech_2, sex) in df_task.to_numpy():
        ID = str(int(subject))
        # Fall back to the subject ID when no neuropheno ID is known.
        Neur_phen_id = ID if pd.isna(neuropheno) else str(int(neuropheno))

        Date = date[:4] + date[5:7] + date[8:]   # 'YYYY_MM_DD' -> 'YYYYMMDD'
        bars_total = (bars_total_1 + bars_total_2) / 2.
        bars_speech = (bars_speech_1 + bars_speech_2) / 2. / 10.

        #Reading the file if it exists: first variant folder (A, B, C, D) that holds it.
        #Previously a recording found only in the '_D' folder was never flagged
        #as found and was therefore skipped.
        candidate_paths = [study_dir + ID + '_' + date + '_' + variant + '/' + file_name + '.wav'
                           for variant in SURVEY_VARIANTS]
        path_to_file = next((path for path in candidate_paths if os.path.exists(path)), None)
        if path_to_file is None:
            continue

        #Assign label
        label = DIAGNOSIS_TO_LABEL.get(diagn, OTHER_LABEL)

        #Set the bars to 0 for all controls
        if label == 0:
            bars_total = 0
            bars_speech = 0

        #Run the file through the models
        speech_analyzer.load(path_to_file, Neur_phen_id)
        no_samples = speech_analyzer.sample_size_s
        if no_samples == 0:
            print(ID)
            continue

        # A neuropheno ID equal to the subject ID is recorded as missing in the output.
        if Neur_phen_id == ID:
            Neur_phen_id = None

        #Store the model output
        records.append({'Date': Date, 'subject_id': ID, 'Neuropheno_ID': Neur_phen_id,
                        'Bars': bars_total, 'Bars_Speech': bars_speech, 'Sex': sex, 'Diagnosis': diagn,
                        'Samples_no': no_samples,
                        'Prob_AT_tf_grad': speech_analyzer.classify(),
                        'Bars_Speech_pred_t': speech_analyzer.BARS_speech(),
                        'Bars_Total_pred_t': speech_analyzer.BARS_total()})

# ---- 4. Aggregate -------------------------------------------------------------
# 'DoB' is not filled here; it stays NaN until a later cell fills it.
output = pd.DataFrame(records, columns=OUTPUT_COLUMNS)

#Take the median of all samples by subject and session
output = output.astype(float).groupby(['Date','subject_id'], as_index = False).median().sort_values(by=['subject_id'])

In [None]:
# The analyzer still holds whichever recording the processing loop loaded last.
# NOTE(review): plot_wav is defined in script.Model_run; presumably it plots that recording's waveform - confirm.
speech_analyzer.plot_wav()

In [None]:
# NOTE(review): play_audio_nr is defined in script.Model_run; presumably it plays back the last loaded recording - confirm.
speech_analyzer.play_audio_nr()

In [None]:
# NOTE(review): plot_mel is defined in script.Model_run; presumably it shows a mel spectrogram of the last loaded recording - confirm.
speech_analyzer.plot_mel()

In [None]:
# Widen the pandas display limits for the result tables shown below.
# (The former trailing `np.dtype(np.int16)` only built a dtype object and threw it away, so it is dropped.)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 200)

In [None]:
# Subject IDs that have at least one row in the aggregated output.
output['subject_id'].unique()

In [None]:
# Fill 'DoB' in the output from the subject table, keyed on subject_id.
# Patient_info_ is the subject table built by the main processing cell above.
# The lookup key is coerced to numeric because output['subject_id'] is float
# after aggregation. If a subject has several rows the first one is used, and a
# subject with no row gets NaT (the per-subject `.item()` lookup raised in both cases).
dob_by_subject = (
    Patient_info_[['subject_id', 'dob']]
    .assign(subject_id=lambda frame: pd.to_numeric(frame['subject_id'], errors='coerce'))
    .dropna(subset=['subject_id'])
    .drop_duplicates(subset='subject_id', keep='first')
    .set_index('subject_id')['dob']
)
output['DoB'] = pd.to_datetime(output['subject_id'].map(dob_by_subject))


In [None]:
# Show up to the first 300 aggregated rows: clinical scores next to the model predictions.
output[['subject_id','Bars', 'Bars_Speech', 'Samples_no', 'Diagnosis','Prob_AT_tf_grad', 'Bars_Speech_pred_t','Bars_Total_pred_t']].head(300)

In [None]:
# Same columns, restricted to subject IDs above 30030.
# NOTE(review): the reason for the 30030 cut-off is not stated anywhere in this notebook - confirm.
output[output['subject_id']>30030][['subject_id','Bars', 'Bars_Speech', 'Samples_no', 'Diagnosis','Prob_AT_tf_grad', 'Bars_Speech_pred_t','Bars_Total_pred_t']]

In [None]:
# Write the aggregated results to the configured output file, without the index column.
output.to_csv(outputfile_address, index = False)

In [None]:
# Split the aggregated output by diagnosis code for the plots below:
# 0 -> output_c, 1 -> output_sca1, 2 -> output_sca2, 3 -> output_sca3.
output_c, output_sca1, output_sca2, output_sca3 = (
    output[output['Diagnosis'] == diagnosis_code] for diagnosis_code in (0, 1, 2, 3)
)

Below are some sanity-check plots.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Sanity check: model output 'Prob_AT_tf_grad' against the recorded 'Bars_Speech'
# score, one colour per diagnosis group (groups come from the split by diagnosis code).
fig, ax = plt.subplots(figsize=(10, 10))
for group, colour, group_name in [(output_c, 'b', 'Control (0)'),
                                  (output_sca1, 'k', 'SCA1 (1)'),
                                  (output_sca2, 'g', 'SCA2 (2)'),
                                  (output_sca3, 'r', 'SCA3 (3)')]:
    ax.scatter(group['Bars_Speech'], group['Prob_AT_tf_grad'], c=colour, marker='x', label=group_name)
ax.set_xlim(-0.2, 4)
ax.set_ylim(-0.05, 1.05)
# Axis labels are the column names, so the figure can be read on its own.
ax.set_xlabel('Bars_Speech')
ax.set_ylabel('Prob_AT_tf_grad')
ax.legend(title='Diagnosis');

In [None]:
# Sanity check: predicted speech score 'Bars_Speech_pred_t' against the recorded
# 'Bars_Speech'. The straight line is y = x, i.e. where a perfect prediction would fall.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot([0, 4], [0, 4], label='y = x')
for group, colour, group_name in [(output_sca1, 'k', 'SCA1 (1)'),
                                  (output_sca2, 'g', 'SCA2 (2)'),
                                  (output_sca3, 'r', 'SCA3 (3)'),
                                  (output_c, 'b', 'Control (0)')]:
    ax.scatter(group['Bars_Speech'], group['Bars_Speech_pred_t'], c=colour, marker='x', label=group_name)
ax.set_xlim(-0.2, 4)
ax.set_ylim(-0.05, 4)
# Axis labels are the column names, so the figure can be read on its own.
ax.set_xlabel('Bars_Speech')
ax.set_ylabel('Bars_Speech_pred_t')
ax.legend(title='Diagnosis');

In [None]:
# Sanity check: predicted total score 'Bars_Total_pred_t' against the recorded
# 'Bars'. The straight line is y = x, i.e. where a perfect prediction would fall.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot([0, 30], [0, 30], label='y = x')
for group, colour, group_name in [(output_c, 'b', 'Control (0)'),
                                  (output_sca1, 'k', 'SCA1 (1)'),
                                  (output_sca2, 'g', 'SCA2 (2)'),
                                  (output_sca3, 'r', 'SCA3 (3)')]:
    ax.scatter(group['Bars'], group['Bars_Total_pred_t'], c=colour, marker='x', label=group_name)
ax.set_xlim(-0.2, 30)
ax.set_ylim(-0.05, 30)
# Axis labels are the column names, so the figure can be read on its own.
ax.set_xlabel('Bars')
ax.set_ylabel('Bars_Total_pred_t')
ax.legend(title='Diagnosis');

In [None]:
# Re-load the most recent subjects workbook for inspection. The file name comes
# from the Patient_info_recent setting in the configuration cell rather than a
# second hard-coded copy, so both reads always refer to the same file.
Patient_info = pd.read_excel(data_home_dir + 'Subjects/' + Patient_info_recent, engine='openpyxl')

In [None]:
# List the column names available in the subjects workbook.
Patient_info.columns

In [None]:
# Show the diagnosis recorded for each record_id.
Patient_info[['record_id','diagnosis']]

In [None]:
#survey_response_A = pd.read_excel(study_dir + 'surveyA-submissions-summary.xlsx',engine='openpyxl')
# Display the raw surveyA submissions export. The frame is only shown, not assigned to a name.
# NOTE(review): study_dir is not defined in this cell; it is set by the main processing cell above.
pd.read_csv(study_dir + 'surveyA-submissions-summary.csv',encoding='latin1')

In [None]:
# NOTE(review): survey_response_A is not assigned in this cell; it is the frame read from
# 'surveyA-responses-summary.csv' by the main processing cell above, which must have been run first.
survey_response_A