# BlaBla cross-language aphasia classification task

Example cross-language classification task using BlaBla features.

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

## Load the data

We do not provide public access to the following files to protect patient privacy. For more information about accessing AphasiaBank data, see [this page](https://aphasia.talkbank.org/).

In [3]:
def load_ab_data(path, task, print_info=False):
    """Load a feature table and keep only the rows for one task.

    Args:
        path: CSV location (anything accepted by ``pd.read_csv``). The table
            must contain a ``task`` column; a ``group`` column is also
            required when ``print_info`` is true.
        task: Value of the ``task`` column to keep.
        print_info: If true, print the per-task row counts of the full table
            and the per-group row counts of the filtered table.

    Returns:
        A new, independent DataFrame with only the rows whose ``task``
        column equals ``task``.
    """
    df_all = pd.read_csv(path)

    if print_info:
        print(df_all['task'].value_counts())

    # Copy the filtered rows so callers can add columns to the result without
    # pandas treating it as a slice of the unfiltered frame
    # (SettingWithCopyWarning on older pandas versions).
    df_task = df_all[df_all['task'] == task].copy()

    if print_info:
        print(df_task['group'].value_counts())

    return df_task

Load the English AphasiaBank data.

In [None]:
# English features, restricted to the 'Cinderella' task; counts are printed for inspection.
df_eng_ab = load_ab_data('features_aphasiabank_english.csv', task='Cinderella', print_info=True)

Load the French AphasiaBank data.

In [None]:
# French features, restricted to the 'Cinderella' task; counts are printed for inspection.
df_fre_ab = load_ab_data('features_aphasiabank_french.csv', task='Cinderella', print_info=True)

Load the Mandarin AphasiaBank data.

In [None]:
# Mandarin features use the 'Cry_Wolf' task (the English and French sets use 'Cinderella').
df_man_ab = load_ab_data('features_aphasiabank_mandarin.csv', task='Cry_Wolf', print_info=True)

## Grouping

In [None]:
def get_class(row):
    """Map a row's ``group`` label to the binary classifier class.

    Returns 'aphasia' or 'control' for the recognised group labels and the
    string 'None' for any other value.
    """
    aphasia_groups = (
        'wernicke', 'anomic', 'conduction', 'broca', 'transmotor',
        'global', 'aphasia', 'aa', 'ca', 'tma',
    )
    control_groups = ('control', 'normal')

    group = row.group
    if group in aphasia_groups:
        label = 'aphasia'
    elif group in control_groups:
        label = 'control'
    else:
        label = 'None'
    return label

def get_fluency_class(row):
    """Map a row's ``group`` label to a fluency-level class.

    Returns 'fluent_aphasia', 'nonfluent_aphasia' or 'control' for the
    recognised group labels and the string 'None' for any other value
    (including the generic 'aphasia' label, which carries no fluency type).
    """
    groups_by_class = (
        ('fluent_aphasia', ('wernicke', 'anomic', 'conduction', 'aa', 'ca')),
        ('nonfluent_aphasia', ('broca', 'transmotor', 'global', 'tma')),
        ('control', ('control', 'normal')),
    )

    group = row.group
    for fluency_class, members in groups_by_class:
        if group in members:
            return fluency_class
    return 'None'

Get the classifier classes and fluency-granularity classes for inspection.

In [None]:
# Binary label (aphasia / control / 'None') derived from each row's group.
df_eng_ab['class'] = df_eng_ab.apply(get_class, axis=1)
df_fre_ab['class'] = df_fre_ab.apply(get_class, axis=1)
df_man_ab['class'] = df_man_ab.apply(get_class, axis=1)

# Fluency-level label derived from the same group column.
df_eng_ab['fluency_class'] = df_eng_ab.apply(get_fluency_class, axis=1)
df_fre_ab['fluency_class'] = df_fre_ab.apply(get_fluency_class, axis=1)
df_man_ab['fluency_class'] = df_man_ab.apply(get_fluency_class, axis=1)

Use only features well-defined for English, French and Mandarin and exclude those that scale linearly with transcript length.

In [None]:
# Feature columns fed to the classifier. The order matters: it defines the
# column order of the model input. The grouping comments below are based on
# the feature names only.
features = [
    # Part-of-speech rates
    'noun_rate',
    'verb_rate',
    'adjective_rate',
    'pronoun_rate',
    'adverb_rate',
    'conjunction_rate',
    'possessive_rate',
    # Part-of-speech ratios
    'noun_verb_ratio',
    'noun_ratio',
    'pronoun_noun_ratio',
    # Word-class proportions and density measures
    'prop_close_class_words',
    'prop_open_class_words',
    'content_density',
    'idea_density',
    # Vocabulary statistics
    'honore_statistic',
    'brunet_index',
    'type_token_ratio',
    'mean_word_length',
    # Proportions of specific constructions
    'prop_auxiliary_verbs',
    'prop_nouns_with_det',
    'prop_nouns_with_adjectives',
    # Phrase and clause rates
    'noun_phrases_rate',
    'verb_phrases_rate',
    'prepositional_phrases_rate',
    'dependent_clauses_rate',
    # Parse-tree depth and height measures
    'max_yngve_depth',
    'mean_yngve_depth',
    'total_yngve_depth',
    'const_pt_height',
    # Discourse
    'discourse_markers_rate',
]

Extract the features with the classes and drop any rows with undefined features.

In [None]:
# Keep only the model features plus the binary label; discard rows with any missing value.
df_eng_ab_class = df_eng_ab[[*features, 'class']].dropna(axis=0, how='any')
df_fre_ab_class = df_fre_ab[[*features, 'class']].dropna(axis=0, how='any')
df_man_ab_class = df_man_ab[[*features, 'class']].dropna(axis=0, how='any')

Inspect the class and fluency class composition of the sets.

In [None]:
# English: label counts at both granularities (before rows with missing features are dropped).
print(
    df_eng_ab['class'].value_counts(),
    df_eng_ab['fluency_class'].value_counts(),
    sep='\n',
)

In [None]:
# French: label counts at both granularities (before rows with missing features are dropped).
print(
    df_fre_ab['class'].value_counts(),
    df_fre_ab['fluency_class'].value_counts(),
    sep='\n',
)

In [None]:
# Mandarin: label counts at both granularities (before rows with missing features are dropped).
print(
    df_man_ab['class'].value_counts(),
    df_man_ab['fluency_class'].value_counts(),
    sep='\n',
)

## Get balanced subsets

For simplicity, construct balanced subsets for training and testing.

In [None]:
def get_balanced_subset(df, random_state=42):
    """Return equally sized aphasia and control subsets of ``df``.

    The rows are shuffled, split by the ``class`` column, and both groups are
    truncated to the size of the smaller one. Rows whose ``class`` is neither
    'aphasia' nor 'control' are ignored. A one-line summary is printed.

    Args:
        df: DataFrame with a ``class`` column.
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.
            The default (42) reproduces the previously hard-coded seed.

    Returns:
        Tuple ``(df_aphasic, df_control)`` of DataFrames with the same
        number of rows.
    """
    df_shuf = df.sample(frac=1, random_state=random_state)
    df_aphasic = df_shuf[df_shuf['class'] == 'aphasia']
    df_control = df_shuf[df_shuf['class'] == 'control']

    num_samples = min(len(df_aphasic), len(df_control))
    print(f'Taking {num_samples:,} samples each from {len(df_aphasic):,} aphasia and {len(df_control):,} control samples')

    # Positional slicing: take the first `num_samples` shuffled rows of each group.
    return df_aphasic.iloc[:num_samples], df_control.iloc[:num_samples]

In [None]:
# Balance each language separately; the summaries print in English, French, Mandarin order.
(df_eng_aph, df_eng_contr), (df_fre_aph, df_fre_contr), (df_man_aph, df_man_contr) = (
    get_balanced_subset(frame)
    for frame in (df_eng_ab_class, df_fre_ab_class, df_man_ab_class)
)

In [None]:
# Stack the balanced aphasia and control rows and keep only the model features.
# (`features` is the list defined above; the previous name `feature_list` was never defined.)
X_eng = pd.concat((df_eng_aph, df_eng_contr), axis=0)[features]
# Labels follow the concat order: aphasia rows first (True), then control rows (False).
y_eng = [True]*len(df_eng_aph) + [False]*len(df_eng_contr)
# Hold out 15% of the English samples for testing, stratified by label.
X_eng_train, X_eng_test, y_eng_train, y_eng_test = train_test_split(
    X_eng, y_eng, test_size=0.15, random_state=42, stratify=y_eng
)

In [None]:
# Stack the balanced French rows and keep only the model features.
# (`features` is the list defined above; the previous name `feature_list` was never defined.)
X_fre = pd.concat((df_fre_aph, df_fre_contr), axis=0)[features]
# Labels follow the concat order: aphasia rows first (True), then control rows (False).
y_fre = [True]*len(df_fre_aph) + [False]*len(df_fre_contr)

In [None]:
# Stack the balanced Mandarin rows and keep only the model features.
# (`features` is the list defined above; the previous name `feature_list` was never defined.)
X_man = pd.concat((df_man_aph, df_man_contr), axis=0)[features]
# Labels follow the concat order: aphasia rows first (True), then control rows (False).
y_man = [True]*len(df_man_aph) + [False]*len(df_man_contr)

## Train classifier

Train on English train set.

In [None]:
# Standardise using statistics from the English training split only;
# the same fitted scaler is reused for every evaluation set.
scaler = StandardScaler()
X_eng_train_scaled = scaler.fit_transform(X_eng_train)

# Recursive feature elimination around a linear SVM, keeping 5 features.
estimator = LinearSVC(max_iter=10000, random_state=42)
selector = RFE(estimator=estimator, n_features_to_select=5).fit(X_eng_train_scaled, y_eng_train)

Inspect important features.

In [None]:
# Names of the columns RFE kept, in the column order of the training matrix.
# (The previous name `selected_columns` was never defined.)
feats = [name for name, kept in zip(X_eng_train.columns, selector.support_) if kept]

# selector.estimator_ is the estimator fitted on the kept columns, so its
# coefficients line up one-to-one with `feats`. Rank by absolute weight.
ranked_features = sorted(
    zip(feats, selector.estimator_.coef_[0]),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)

for rank, (feature, coef) in enumerate(ranked_features, start=1):
    print(rank, feature, coef)

Validate on English test set.

In [None]:
# Apply the scaler fitted on the English training split (no refitting on test data).
X_eng_test_scaled = scaler.transform(X_eng_test)
y_eng_test_pred = selector.predict(X_eng_test_scaled)
print(
    classification_report(
        y_eng_test,
        y_eng_test_pred,
        target_names=['Control', 'Aphasia'],
        digits=4,
    )
)
# Majority-class baseline: share of the more frequent label.
print('Baseline accuracy', max(np.mean(y_eng_test), 1-np.mean(y_eng_test)))

Validate on the French set (all balanced French samples; none were used for training).

In [None]:
# Apply the scaler fitted on the English training split to the French features.
X_fre_scaled = scaler.transform(X_fre)
y_fre_pred = selector.predict(X_fre_scaled)
print(
    classification_report(
        y_fre,
        y_fre_pred,
        target_names=['Control', 'Aphasia'],
        digits=4,
    )
)
# Majority-class baseline: share of the more frequent label.
print('Baseline accuracy', max(np.mean(y_fre), 1-np.mean(y_fre)))

Validate on the Mandarin set (all balanced Mandarin samples; none were used for training).

In [None]:
# Apply the scaler fitted on the English training split to the Mandarin features.
X_man_scaled = scaler.transform(X_man)
y_man_pred = selector.predict(X_man_scaled)
print(
    classification_report(
        y_man,
        y_man_pred,
        target_names=['Control', 'Aphasia'],
        digits=4,
    )
)
# Majority-class baseline: share of the more frequent label.
print('Baseline accuracy', max(np.mean(y_man), 1-np.mean(y_man)))