Data Cleaning: Checking for missing values, duplicates, and potentially irrelevant columns

In [None]:

# Count of rows flagged by DataFrame.duplicated()
duplicate_rows = data.duplicated().sum()

# Count of null entries in every column
missing_values = data.isnull().sum()

# Restrict the per-column counts to columns that contain at least one null
missing_values_summary = missing_values.loc[missing_values.gt(0)]
missing_values_count = missing_values_summary.count()

# Report the affected columns, or a plain message when there are none
if missing_values_count > 0:
    missing_values_info = missing_values_summary
else:
    missing_values_info = "No missing values"

# Same idea for duplicates: a count when present, a plain message otherwise
if duplicate_rows > 0:
    duplicate_rows_info = f"Duplicate Rows: {duplicate_rows}"
else:
    duplicate_rows_info = "No duplicate rows"

# Final expression of the cell: rendered as the cell output
missing_values_info, duplicate_rows_info


Exploring the clinical variables as a function of age, sex, testing location, and participant group

In [1]:
# Names of the demographic and testing columns in the latest dataset
demographic_testing_vars_latest = [
    "sex",
    "participant_group",
    "testing_location",
    "age_screening",
]

# Creating box plots for comparing clinical variables across categorical demographic and testing variables
def create_comparison_boxplots_latest(data, clinical_vars, demographic_vars):
    """Draw one box plot per (categorical demographic variable, clinical variable) pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the demographic and the clinical columns.
    clinical_vars : iterable of str
        Column names plotted on the y-axis.
    demographic_vars : iterable of str
        Column names considered for the x-axis. A column is plotted when it is
        named ``"participant_group"`` or when its dtype is object, string or
        categorical; every other column is skipped.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()``.

    Raises
    ------
    KeyError
        If a requested column is not present in ``data``.
    """
    # Imported locally so the helper does not rely on which import cell has run.
    from pandas import CategoricalDtype
    from pandas.api.types import is_object_dtype, is_string_dtype

    for demo_var in demographic_vars:
        column = data[demo_var]
        # A bare `dtype == 'object'` comparison misses `category` columns and
        # dedicated string dtypes, so all three kinds are checked explicitly.
        is_categorical = (
            demo_var == "participant_group"
            or is_object_dtype(column.dtype)
            or is_string_dtype(column.dtype)
            or isinstance(column.dtype, CategoricalDtype)
        )
        if not is_categorical:
            continue

        for clin_var in clinical_vars:
            plt.figure(figsize=(8, 4))
            sns.boxplot(x=column, y=data[clin_var])
            plt.title(f'{clin_var} by {demo_var}')
            plt.xticks(rotation=45)
            plt.show()

# Creating scatter plots for age
def create_age_scatterplots_latest(data, clinical_vars, age_var="age_screening"):
    """Draw one scatter plot of each clinical variable against an age column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the age column and the clinical columns.
    clinical_vars : iterable of str
        Column names plotted on the y-axis, one figure each.
    age_var : str, optional
        Name of the column plotted on the x-axis. Defaults to
        ``"age_screening"``, the column this helper previously hard-coded.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()``.

    Raises
    ------
    KeyError
        If ``age_var`` or a clinical column is not present in ``data``.
    """
    for clin_var in clinical_vars:
        plt.figure(figsize=(8, 4))
        sns.scatterplot(x=data[age_var], y=data[clin_var])
        plt.title(f'{clin_var} vs Age')
        plt.show()

# NOTE(review): the output recorded for this cell is
# "NameError: name 'latest_data' is not defined". `latest_data` (and presumably
# `clinical_vars_latest`) must be created by an earlier cell before this one
# can run on a fresh kernel — TODO confirm which cell is meant to define them.

# Categorical comparisons: box plots per demographic/testing variable
create_comparison_boxplots_latest(
    data=latest_data,
    clinical_vars=clinical_vars_latest,
    demographic_vars=demographic_testing_vars_latest,
)

# Continuous comparison: clinical variables against age
create_age_scatterplots_latest(
    data=latest_data,
    clinical_vars=clinical_vars_latest,
)


NameError: name 'latest_data' is not defined

In [None]:
# Split the columns of `data` into three groups.
# Clinical: any column whose name starts with one of the questionnaire prefixes.
clinical_variables = [
    col for col in data.columns
    if col.startswith(('gad7', 'qids', 'hamd17'))
]

# Demographic: a fixed list of column names.
demographic_variables = [
    'sex',
    'age_screening',
    'participant_group',
    'age_learned_english',
    'testing_location',
]

# Speech: whatever is left once the two groups above are excluded.
speech_variables = [
    col for col in data.columns
    if col not in clinical_variables and col not in demographic_variables
]

# One sub-frame per group
clinical_data = data.loc[:, clinical_variables]
demographic_data = data.loc[:, demographic_variables]
speech_data = data.loc[:, speech_variables]

# Group sizes
len_clinical = len(clinical_variables)
len_demographic = len(demographic_variables)
len_speech = len(speech_variables)

# Cell output for verification: the three sizes, then the first 5 clinical names,
# the full demographic list, and the first 5 speech names.
(
    len_clinical,
    len_demographic,
    len_speech,
    clinical_variables[:5],
    demographic_variables,
    speech_variables[:5],
)


Looking for strong correlations between speech variables at baseline

In [None]:
# Counting the frequency of each variable in the correlation pairs
variable_counts = pd.concat(
    [
        strong_correlation_pairs_without_mfcc['Variable 1'],
        strong_correlation_pairs_without_mfcc['Variable 2'],
    ]
).value_counts()

# Identifying variables to be removed.
# Pairs are visited in order; a pair is skipped when one of its members has
# already been selected for removal, otherwise one member is selected.
variables_to_remove = []
already_removed = set()  # same contents as variables_to_remove, for fast membership checks
for var_1, var_2 in zip(
    strong_correlation_pairs_without_mfcc['Variable 1'],
    strong_correlation_pairs_without_mfcc['Variable 2'],
):
    if var_1 in already_removed or var_2 in already_removed:
        continue
    # Remove the variable that appears more frequently; on a tie 'Variable 2' is removed
    if variable_counts[var_1] > variable_counts[var_2]:
        dropped = var_1
    else:
        dropped = var_2
    variables_to_remove.append(dropped)
    already_removed.add(dropped)

# Removing the identified variables from the dataset
reduced_speech_data = speech_data_without_mfcc.drop(columns=variables_to_remove)

# Number of variables removed and remaining
num_variables_removed = len(variables_to_remove)
num_variables_remaining = reduced_speech_data.shape[1]

# Only the last expression of a cell is rendered, so a bare tuple placed here
# would be discarded silently; print the counts explicitly instead.
print(f"Variables removed: {num_variables_removed}, remaining: {num_variables_remaining}")

# Creating a list of removed variables for display
removed_variables_list = sorted(variables_to_remove)

removed_variables_list

In [None]:
import numpy as np

# Keep only the coefficients whose magnitude reaches the threshold
# (boolean-frame indexing leaves NaN in every other position)
strong_correlations = speech_correlation_matrix[
    (speech_correlation_matrix >= strong_correlation_threshold)
    | (speech_correlation_matrix <= -strong_correlation_threshold)
]

# Filtering out the diagonal and lower triangle to avoid duplicate correlations.
# The builtin `bool` is used here: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
strong_correlations = strong_correlations.where(
    np.triu(np.ones(strong_correlations.shape), k=1).astype(bool)
)

# Flattening the matrix to a list of variable pairs with their correlation.
# The explicit dropna() discards the masked (NaN) cells even on pandas versions
# whose stack() implementation keeps NaN entries; it is a no-op otherwise.
strong_correlation_pairs = strong_correlations.stack().dropna().reset_index()
strong_correlation_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

strong_correlation_pairs.head(10)  # Displaying the first 10 strong correlation pairs

T-tests exploring differences in speech variables according to sex and participant group

In [None]:
from scipy.stats import ttest_ind

# Function to perform t-tests for a given variable across two groups
def perform_t_test(data, variable, group_column, group1, group2, equal_var=True):
    """Run an independent two-sample t-test on one variable between two groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``variable`` and ``group_column``.
    variable : str
        Name of the column whose values are compared.
    group_column : str
        Name of the column holding the group label of each row.
    group1, group2 :
        The two labels in ``group_column`` that define the samples.
    equal_var : bool, optional
        Forwarded to ``scipy.stats.ttest_ind``. ``True`` (the default, and the
        behaviour this function always had) runs the standard test that
        assumes equal variances; ``False`` runs Welch's test.

    Returns
    -------
    tuple
        ``(t_stat, p_value)`` as returned by ``scipy.stats.ttest_ind``.
    """
    group_labels = data[group_column]
    # Missing values of `variable` are dropped separately within each group.
    group1_data = data.loc[group_labels == group1, variable].dropna()
    group2_data = data.loc[group_labels == group2, variable].dropna()
    t_stat, p_value = ttest_ind(group1_data, group2_data, equal_var=equal_var)
    return t_stat, p_value

# Speech variables that are put through the t-tests
selected_variables = ['speech_rate', 'fundamental_frequency_mean', 'hnr_ac_min']

# Participant group comparison: rows labelled 'Control' against rows labelled 'MDD'
t_test_results_participant_group = dict(
    (
        var,
        perform_t_test(
            data=merged_data_reanalysis,
            variable=var,
            group_column='participant_group',
            group1='Control',
            group2='MDD',
        ),
    )
    for var in selected_variables
)

# Sex comparison: rows labelled 'Male' against rows labelled 'Female'
t_test_results_sex = dict(
    (
        var,
        perform_t_test(
            data=merged_data_reanalysis,
            variable=var,
            group_column='sex',
            group1='Male',
            group2='Female',
        ),
    )
    for var in selected_variables
)

# Cell output: both result dictionaries, each mapping variable -> (t_stat, p_value)
t_test_results_participant_group, t_test_results_sex

Heat map showing correlations between speech variables and clinical scores (baseline)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your dataset
# NOTE(review): 'path_to_your_dataset.csv' is a placeholder path; point it at
# the real file before running this cell.
data = pd.read_csv('path_to_your_dataset.csv')

# Clinical scores of interest (baseline, "_pre" columns)
clinical_variables = [
    'hamd17_total_pre',
    'hamd17_depression_pre',
    'hamd17_insomnia_pre',
    'hamd17_somatic_pre',
    'hamd17_anxiety_pre',
    'hamd17_suicide_pre',
    'qids_total_pre',
    'gad7_total_pre',
]
# NOTE(review): the single entry below is a placeholder, not a real column name.
speech_variables = ['list_of_speech_variables']  # Replace with your speech variables

# Restrict the frame to the clinical and speech columns, clinical first
selected_data = data.loc[:, clinical_variables + speech_variables]

# Pairwise correlations between all selected columns
correlation_matrix = selected_data.corr()

# Annotated heatmap of the full correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
plt.title('Correlation Heatmap Between Clinical and Speech Variables')
plt.yticks(rotation=0)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
