# Visualization of BlaBla linguistic features for aphasia

2D t-SNE visualization of linguistic features of fluent/nonfluent aphasia sufferers and healthy controls.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize

In [None]:
def save_fig(fig, path_base, close=False):
    """Write ``fig`` to ``<path_base>.png`` with a tight bounding box.

    Args:
        fig: Figure-like object exposing ``savefig``.
        path_base: Output path without the ``.png`` extension.
        close: When true, the figure is also closed via ``plt.close``
            after it has been written.
    """
    fig.savefig(f'{path_base}.png', bbox_inches='tight')
    if not close:
        return
    plt.close(fig)

## Load data

We do not provide public access to the following files to protect patient privacy. For more information about accessing AphasiaBank data, see [this page](https://aphasia.talkbank.org/).

### English data

Load the English AphasiaBank features calculated using BlaBla.

In [None]:
# Read the English feature table and discard rows with any missing value.
df_eng_ab = pd.read_csv('features_aphasiabank_english.csv').dropna()
# Show how many rows each task contributes.
df_eng_ab['task'].value_counts()

In [None]:
# Keep only the Cinderella task. Take an explicit copy so that columns added
# to this frame later are written to an independent frame rather than to a
# slice of the unfiltered one (avoids pandas' SettingWithCopyWarning).
df_eng_ab = df_eng_ab[df_eng_ab['task'] == 'Cinderella'].copy()
# Show how many rows remain per diagnosis group.
df_eng_ab.group.value_counts()

### Mandarin data

Load the Mandarin AphasiaBank features calculated using BlaBla.

In [None]:
# Read the Mandarin feature table, discard rows with any missing value, and
# expand the abbreviated labels to the full names that the grouping helpers
# match on.
# NOTE: a flat mapping passed to DataFrame.replace is applied to every column.
df_man_ab = (
    pd.read_csv('features_aphasiabank_mandarin.csv')
    .dropna()
    .replace({
        'normal': 'control',
        'aa': 'anomic',
        'ca': 'conduction',
        'tma': 'transmotor',
    })
)
# Show how many rows each task contributes.
df_man_ab['task'].value_counts()

In [None]:
# Keep only the Cry_Wolf task. Take an explicit copy so that columns added
# to this frame later are written to an independent frame rather than to a
# slice of the unfiltered one (avoids pandas' SettingWithCopyWarning).
df_man_ab = df_man_ab[df_man_ab['task'] == 'Cry_Wolf'].copy()
# Show how many rows remain per diagnosis group.
df_man_ab.group.value_counts()

### French data

Load the French AphasiaBank features calculated using BlaBla.

In [None]:
# Read the French feature table and discard rows with any missing value.
df_fre_ab = pd.read_csv('features_aphasiabank_french.csv').dropna()
# Show how many rows each task contributes.
df_fre_ab['task'].value_counts()

In [None]:
# Keep only the Cinderella task. Take an explicit copy so that columns added
# to this frame later are written to an independent frame rather than to a
# slice of the unfiltered one (avoids pandas' SettingWithCopyWarning).
df_fre_ab = df_fre_ab[df_fre_ab['task'] == 'Cinderella'].copy()
# Show how many rows remain per diagnosis group.
df_fre_ab.group.value_counts()

## Grouping strategies

We consider two granularities of grouping, applied both to the English data alone and across languages.

### Disease groupings

In [None]:
def get_group_a(row):
    """Map a row's diagnosis label onto the coarse aphasia/control grouping.

    Args:
        row: Object with a ``group`` attribute (e.g. a DataFrame row).

    Returns:
        ``'aphasia'`` for any of the listed aphasia labels, ``'control'``
        for controls, and the string ``'None'`` for anything else.
    """
    aphasia_labels = ('wernicke', 'anomic', 'conduction', 'broca', 'transmotor', 'global', 'aphasia')
    label = row.group
    if label in aphasia_labels:
        return 'aphasia'
    return 'control' if label in ('control',) else 'None'

In [None]:
def get_group_b(row):
    """Map a row's diagnosis label onto the fluent/nonfluent/control grouping.

    Args:
        row: Object with a ``group`` attribute (e.g. a DataFrame row).

    Returns:
        ``'fluent_aphasia'``, ``'nonfluent_aphasia'`` or ``'control'`` when
        the label belongs to the matching list, otherwise the string
        ``'None'``.
    """
    buckets = (
        ('fluent_aphasia', ('wernicke', 'anomic', 'conduction')),
        ('nonfluent_aphasia', ('broca', 'transmotor', 'global')),
        ('control', ('control',)),
    )
    label = row.group
    for bucket, members in buckets:
        if label in members:
            return bucket
    return 'None'

In [None]:
def get_group_a_lang(row, lang):
    """Return the coarse aphasia/control label prefixed with ``<lang>_``.

    Args:
        row: Object with a ``group`` attribute (e.g. a DataFrame row).
        lang: Language name placed in front of the label.

    Returns:
        ``'<lang>_aphasia'``, ``'<lang>_control'`` or ``'<lang>_None'``.
    """
    prefix = lang + '_'
    label = row.group
    if label in ('wernicke', 'anomic', 'conduction', 'broca', 'transmotor', 'global', 'aphasia'):
        suffix = 'aphasia'
    elif label in ('control',):
        suffix = 'control'
    else:
        suffix = 'None'
    return prefix + suffix

In [None]:
def get_group_b_lang(row, lang):
    """Return the fluent/nonfluent/control label prefixed with ``<lang>_``.

    Args:
        row: Object with a ``group`` attribute (e.g. a DataFrame row).
        lang: Language name placed in front of the label.

    Returns:
        ``'<lang>_fluent_aphasia'``, ``'<lang>_nonfluent_aphasia'``,
        ``'<lang>_control'`` or ``'<lang>_None'``.
    """
    prefix = lang + '_'
    buckets = (
        ('fluent_aphasia', ('wernicke', 'anomic', 'conduction')),
        ('nonfluent_aphasia', ('broca', 'transmotor', 'global')),
        ('control', ('control',)),
    )
    label = row.group
    for bucket, members in buckets:
        if label in members:
            return prefix + bucket
    return prefix + 'None'

In [None]:
# Attach the language-prefixed coarse (group_a) and fine (group_b) labels to
# each language's frame. DataFrame.apply forwards the ``lang`` keyword to the
# row-wise helper.
for _frame, _lang in (
    (df_eng_ab, 'english'),
    (df_man_ab, 'mandarin'),
    (df_fre_ab, 'french'),
):
    _frame['group_a'] = _frame.apply(get_group_a_lang, axis=1, lang=_lang)
    _frame['group_b'] = _frame.apply(get_group_b_lang, axis=1, lang=_lang)

In [None]:
# Stack the three per-language frames row-wise; each row keeps the index
# label it had in its own frame.
df_ab = pd.concat((df_eng_ab, df_man_ab, df_fre_ab), axis=0)

In [None]:
# Row counts per language-prefixed coarse group.
df_ab.group_a.value_counts()

In [None]:
# Row counts per language-prefixed fine group.
df_ab.group_b.value_counts()

## Plot disease t-SNE

Plot BlaBla features for the English data using the finer-grained grouping: fluent aphasia vs. nonfluent aphasia vs. healthy controls.

In [None]:
# Seaborn's 'muted' palette; the plotting cells index into it per group.
cols = sns.color_palette(palette='muted')
# Names of the BlaBla feature columns that plot_tsne may feed into t-SNE, in
# the order they are selected from the data frame. The section comments are
# informal groupings based on the feature names only.
all_features = [
    # Word-class rates and ratios.
    'noun_rate', 'verb_rate', 'demonstrative_rate', 'adjective_rate',
    'pronoun_rate', 'adverb_rate', 'conjunction_rate', 'possessive_rate',
    'noun_verb_ratio', 'noun_ratio', 'pronoun_noun_ratio',
    'prop_close_class_words', 'prop_open_class_words',
    # Density and vocabulary measures.
    'content_density', 'idea_density', 'honore_statistic', 'brunet_index',
    'type_token_ratio', 'mean_word_length',
    # Verb forms.
    'prop_inflected_verbs', 'prop_auxiliary_verbs', 'prop_gerund_verbs',
    'prop_participle_verbs',
    # Clauses, phrases and noun modifiers.
    'num_clauses', 'num_clauses_per_sentence', 'prop_nouns_with_det',
    'prop_nouns_with_adjectives', 'num_noun_phrases', 'noun_phrases_rate',
    'num_verb_phrases', 'verb_phrases_rate', 'num_infinitive_phrases',
    'infinitive_phrases_rate', 'num_prepositional_phrases',
    'prepositional_phrases_rate', 'num_dependent_clauses',
    'dependent_clauses_rate',
    # Tree-depth measures.
    'max_yngve_depth', 'mean_yngve_depth', 'total_yngve_depth',
    'const_pt_height',
    # Discourse markers.
    'num_discourse_markers', 'discourse_markers_rate',
]

In [None]:
def plot_tsne(data, col_for_labels, col_map, label_map, perplexity=30, n_iter=2000, exclusion_list=None,
              random_state=None):
    """Project the selected features to 2D with t-SNE and scatter-plot them by group.

    Args:
        data: DataFrame holding the columns named in ``all_features`` (minus
            any excluded ones) plus the ``col_for_labels`` column.
        col_for_labels: Name of the column whose values assign rows to groups.
        col_map: Maps each group value to a ``(colour, alpha)`` pair.
        label_map: Maps each group value to its legend text. Groups are drawn
            and listed in the legend in this mapping's key order.
        perplexity: Perplexity passed to ``TSNE``.
        n_iter: Number of t-SNE optimisation iterations.
        exclusion_list: Feature names to leave out of the projection;
            ``None`` (the default) excludes nothing.
        random_state: Optional seed forwarded to ``TSNE`` for a reproducible
            layout; ``None`` keeps scikit-learn's default behaviour.

    Returns:
        A tuple ``(fig, figlegend_h, figlegend_v)``: the scatter figure, a
        figure holding a horizontal legend, and one holding a vertical legend.

    Raises:
        KeyError: If a key of ``label_map`` has no rows in ``data`` or is
            missing from ``col_map``.
    """
    # Exclude any features in the exclusion list.
    excluded = () if exclusion_list is None else exclusion_list
    selected_columns = [name for name in all_features if name not in excluded]

    # Scale every feature column to unit L2 norm before projecting.
    data_np = normalize(data[selected_columns], axis=0)

    # Newer scikit-learn releases renamed TSNE's ``n_iter`` argument to
    # ``max_iter``; use whichever name the installed version accepts.
    iter_arg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
    tsne = TSNE(n_components=2, verbose=0, perplexity=perplexity, n_jobs=-1,
                random_state=random_state, **{iter_arg: n_iter})
    tsne_results = tsne.fit_transform(data_np)

    # Pair each projected point with its group label. ``.values`` aligns by
    # position, so duplicate index labels in ``data`` are harmless.
    projected = pd.DataFrame(tsne_results, columns=['x', 'y'])
    projected['category'] = data[col_for_labels].values
    groups = projected.groupby('category')

    # Plot the t-SNE by group on an explicit axes of the scatter figure.
    fig = plt.figure(figsize=(4, 4), dpi=300)
    ax = fig.add_subplot(111)

    paths, legend_texts = [], []
    for label, legend_text in label_map.items():
        points = groups.get_group(label)
        print(label, len(points.x))
        style = col_map[label]
        paths.append(ax.scatter(points.x, points.y, label=legend_text, c=[style[0]], alpha=style[1], lw=0, s=40))
        legend_texts.append(legend_text)

    ax.axis('off')

    # Stand-alone legend figures. ``loc`` is passed by keyword because newer
    # matplotlib versions accept at most two positional legend arguments.
    figlegend_h = plt.figure(figsize=(4, 2), dpi=300)
    figlegend_h.legend(paths, legend_texts, loc='center', ncol=10)

    figlegend_v = plt.figure(figsize=(4, 2), dpi=300)
    figlegend_v.legend(paths, legend_texts, loc='center', ncol=1)

    return fig, figlegend_h, figlegend_v

In [None]:
# Styling for the English group_b labels as (colour, legend text). The dict
# order is preserved, and plot_tsne draws groups in label_map key order.
group_b_styles = {
    'english_fluent_aphasia': (cols[0], 'Fluent aphasia'),
    'english_nonfluent_aphasia': (cols[4], 'Nonfluent aphasia'),
    'english_control': (cols[1], 'Controls'),
}

fig, _, figlegend_v = plot_tsne(
    df_eng_ab,
    'group_b',
    col_map={key: (colour, 0.6) for key, (colour, _text) in group_b_styles.items()},
    label_map={key: text for key, (_colour, text) in group_b_styles.items()},
)

# Write the scatter plot and its vertical legend as separate PNG files.
save_fig(fig, 'disease_tsne_group_b')
save_fig(figlegend_v, 'disease_tsne_group_b_vlegend')

## Plot language t-SNE

To promote cross-task/-language generalizability, exclude features which are undefined in Mandarin and those which scale ~linearly with speech length.

In [None]:
# Features dropped before the cross-language projection; passed to plot_tsne
# as its exclusion_list argument.
exclusion_list = [
    'demonstrative_rate', 'infinitive_phrases_rate', 'num_infinitive_phrases',
    'prop_gerund_verbs', 'prop_inflected_verbs', 'prop_participle_verbs',
    'num_verb_phrases', 'num_clauses', 'num_clauses_per_sentence',
    'num_dependent_clauses', 'num_discourse_markers', 'num_noun_phrases',
    'num_prepositional_phrases',
]

In [None]:
# Styling for the language-prefixed group_a labels as (colour, legend text).
# The dict order is preserved, and plot_tsne draws groups in label_map key
# order.
group_a_styles = {
    'english_aphasia': (cols[0], 'English aphasia'),
    'english_control': (cols[1], 'English controls'),
    'mandarin_aphasia': (cols[2], 'Mandarin aphasia'),
    'mandarin_control': (cols[3], 'Mandarin controls'),
    'french_aphasia': (cols[4], 'French aphasia'),
    'french_control': (cols[5], 'French controls'),
}

fig, _, figlegend_v = plot_tsne(
    df_ab,
    'group_a',
    perplexity=30,
    exclusion_list=exclusion_list,
    col_map={key: (colour, 0.6) for key, (colour, _text) in group_a_styles.items()},
    label_map={key: text for key, (_colour, text) in group_a_styles.items()},
)

# Write the scatter plot and its vertical legend as separate PNG files.
save_fig(fig, 'language_tsne_group_a')
save_fig(figlegend_v, 'language_tsne_group_a_vlegend')