# Statistics (work in progress)
v0.2.0
By Stephen Karl Larroque
License: All rights reserved (in the future will be converted to MIT)

In [None]:
# Forcefully autoreload all python modules
# (autoreload mode 2: imported modules are reloaded before each cell execution,
# so edits to the local helper libraries are picked up without restarting the kernel)
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS

import os, sys

# Resolved absolute path of the current working directory
cur_path = os.path.realpath('.')
# Make the bundled helper libraries importable by their bare module name
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re

from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, compute_best_diag, reorder_cols_df, find_columns_matching, cleanup_name, replace_buggy_accents, convert_to_datetype, df_drop_duplicated_index, df_to_unicode, df_to_unicode_fast, cleanup_name_df, df_literal_eval, compute_best_diag, df_unify, df_translate, df_filter_nan_str, concat_vals_unique, reorder_cols_df, sort_and_deduplicate


In [None]:
# Nice plots! Use matplotlib's 'ggplot' style sheet for the figures drawn in this notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# PARAMETERS

# Folder holding the unified databases and receiving the generated figures and tables
output_dir = 'databases_output'

# Unified database, not yet postprocessed
# Paths are built with os.path.join so that they resolve on any OS (a hard-coded backslash only works on Windows)
unified_csv = os.path.join(output_dir, 'merged_fmp_steph_manon_sarah_dicom_ecg_reports_unifiedall_nifti.csv')
unifiedpersubj_csv = os.path.join(output_dir, 'merged_fmp_steph_manon_sarah_dicom_ecg_reports_unifiedall.csv')

# Hide null values in plots?
plot_hide_nan = True

# Diagnosis labels in ranking order: this list is passed as the category order when picking the
# minimum (worst) / maximum (best) diagnosis, and is reused to order the transition matrix rows and columns
diagorder_doc = ['', 'na', 'impossible', 'braindead', 'coma', 'vs/uws', 'mcs', 'mcs-', 'mcs+', 'srmcs', 'emcs', 'lis', 'lis_incomplete', 'partial lis']


---------------
## PREPARE DATASET (AND ONLYDOC DATASET)

In [None]:
# Import the csv dbs as dataframes
import pandas as pd
import numpy as np

# Load the unified database (semicolon-separated csv)
cf_unified = pd.read_csv(unified_csv, sep=';', low_memory=False)
# Drop the lines where every field is empty
cf_unified = cf_unified.dropna(axis=0, how='all')
# Replace the remaining missing values by empty strings
cf_unified = cf_unified.fillna('')
cf_unified

In [None]:
# List the distinct values found in the best diagnosis column
cf_unified['unified.diagnosis_best'].unique()

In [None]:
# Keep only the rows whose best diagnosis is a DOC diagnosis (patients susceptible to being sedated)
cf_unified_onlydoc = cf_unified.loc[
    cf_unified['unified.diagnosis_best'].isin([
        'vs/uws', 'mcs', 'mcs+', 'mcs-', 'emcs', 'srmcs',
        'coma', 'lis', 'lis_incomplete', 'partial lis',
        'conflict', 'braindead',
    ])
]


In [None]:
# One row per patient: group the rows by name and merge each column with concat_vals_unique
cf_unified_onlydoc_byname = (
    cf_unified_onlydoc
    .groupby(by='name')
    .agg(concat_vals_unique)
)

In [None]:
# Sanity check: show the number of diagnoses next to each patient name
cf_unified_onlydoc_byname.reset_index()[['name', 'unified.diagnoses_count']]

In [None]:
# Export the per-patient DOC table with the save_df_as_csv helper (imported from csg_fileutil_libs.aux_funcs)
# NOTE(review): 'onlydoc.csv' is a bare filename, so it is presumably written to the working directory and not to output_dir - confirm
save_df_as_csv(cf_unified_onlydoc_byname, 'onlydoc.csv', fields_order=False)

----------------------
## FOR MURIELLE (MRI SEDATION STATS)

In [None]:
# Number of non-null values per column of the per-patient table: saved as text, then displayed
byname_counts = cf_unified_onlydoc_byname.count()
with open('bynamecounts.txt', 'w') as f:
    f.write(byname_counts.to_string())
byname_counts

In [None]:
# Per-patient column counts, restricted to rows whose 'nifti.func OK' flag is one of O, M, M2, N
(
    cf_unified_onlydoc
    .loc[cf_unified_onlydoc['nifti.func OK'].isin(['O', 'M', 'M2', 'N'])]
    .groupby('name')
    .agg(concat_vals_unique)
    .count()
)

In [None]:
# Per-patient column counts, restricted to rows whose 'nifti.struct OK (for fmri)' flag is one of O, M, M2, N, W
(
    cf_unified_onlydoc
    .loc[cf_unified_onlydoc['nifti.struct OK (for fmri)'].isin(['O', 'M', 'M2', 'N', 'W'])]
    .groupby('name')
    .agg(concat_vals_unique)
    .count()
)

In [None]:
# Aggregate per MRI session: keep the rows having a non-empty StudyDate,
# then merge the rows sharing the same (name, StudyDate) pair
cf_unified_onlydoc_sess = (
    cf_unified_onlydoc
    .loc[cf_unified_onlydoc['StudyDate'].notnull() & (cf_unified_onlydoc['StudyDate'] != '')]
    .groupby(['name', 'StudyDate'])
    .agg(concat_vals_unique)
)
cf_unified_onlydoc_sess

In [None]:
# Per-session column counts, excluding the sessions whose 'nifti.func OK' flag is 'X' or empty
cf_unified_onlydoc_sess.loc[
    ~cf_unified_onlydoc_sess['nifti.func OK'].isin(['X', ''])
].count()

In [None]:
# Per-session column counts, excluding the sessions whose 'nifti.struct OK (for fmri)' flag is 'X' or empty
cf_unified_onlydoc_sess.loc[
    ~cf_unified_onlydoc_sess['nifti.struct OK (for fmri)'].isin(['X', ''])
].count()

In [None]:
def saveepisedat(cf, appendtext=''):
    """Count each distinct 'unified.episedation' value of cf and export the counts to an Excel file.

    cf: dataframe holding a 'unified.episedation' column.
    appendtext: suffix inserted in the output filename, just before the extension.

    The table (columns 'sedation' and 'count') is written to
    <unified_csv without its extension>_episedationcount<appendtext>.xls
    Always returns True.
    """
    # Cast to str before counting, so that every cell is counted through its text representation
    counts = cf['unified.episedation'].astype('str').value_counts()
    # Name the two columns by position: the names produced by reset_index() after value_counts()
    # differ between pandas versions, so renaming them by their old name is not reliable
    table = counts.reset_index()
    table.columns = ['sedation', 'count']
    # splitext strips the extension whatever its length (instead of assuming 4 characters)
    outpath = os.path.splitext(unified_csv)[0] + '_episedationcount%s.xls' % appendtext
    df_to_unicode_fast(table).to_excel(outpath)
    return True
saveepisedat(cf_unified_onlydoc_sess, '_persess')
saveepisedat(cf_unified_onlydoc_byname, '_persubject')

In [None]:
# Pie chart of the etiologies of the DOC patients, saved as an image together with the counts as text
etiology_counts = cf_unified_onlydoc_byname['unified.etiology'].value_counts()
n_patients = cf_unified_onlydoc_byname.shape[0]
fig = plt.figure()
#toplot = cf_unified_perdiag[cf_unified_perdiag['unified.diagnosis_worst'] == diag]['unified.diagnosis_best'].astype('str').value_counts(dropna=plot_hide_nan)
etiology_counts.plot(fig=fig, kind='pie', title='Etiology of DOC patients\n%i patients' % n_patients, autopct='%.1f%%', figsize=(15,15))
plt.axis('off')
fig.savefig(os.path.join(output_dir, 'fig_docetio.png'), bbox_inches='tight', dpi=600)
with open(os.path.join(output_dir, 'fig_docetio.txt'), 'w') as f:
    f.write(etiology_counts.to_string())

In [None]:
import codecs
# Relabel the 'srmcs' sessions as 'mcs+' before plotting
is_srmcs = cf_unified_onlydoc_sess['unified.diagnosis_best'] == 'srmcs'
cf_unified_onlydoc_sess.loc[is_srmcs, 'unified.diagnosis_best'] = 'mcs+'
# One sedation pie chart (plus a text dump of the plotted column) per best diagnosis
for diag in cf_unified_onlydoc_sess['unified.diagnosis_best'].unique():
    fig = plt.figure()
    toplot = cf_unified_onlydoc_sess.loc[cf_unified_onlydoc_sess['unified.diagnosis_best'] == diag, 'unified.episedation']
    # '/' cannot appear in a filename, so 'vs/uws' becomes 'vs-uws'
    diag_label = diag.replace('/', '-')
    pie_title = 'Sedation for diag %s\n%i sessions' % (diag_label, toplot.shape[0])
    toplot.value_counts().plot(fig=fig, kind='pie', title=pie_title, autopct='%.1f%%', figsize=(15,15))
    plt.axis('off')
    png_path = os.path.join(output_dir, 'fig_sedat_%s.png' % diag_label)
    fig.savefig(png_path, bbox_inches='tight', dpi=600)
    txt_path = os.path.join(output_dir, 'fig_sedat_%s.txt' % diag_label)
    with codecs.open(txt_path, 'w', 'utf-8-sig') as f:
        f.write(toplot.to_string())

--------------------
## MARKOV CHAIN

In [None]:
# Import the csv dbs as dataframes
import pandas as pd
import numpy as np

# Load the per-subject unified database (semicolon-separated csv)
cf_unifiedsubj = pd.read_csv(unifiedpersubj_csv, sep=';', low_memory=False)
# Drop the lines where every field is empty
cf_unifiedsubj = cf_unifiedsubj.dropna(axis=0, how='all')
# Replace the remaining missing values by empty strings
cf_unifiedsubj = cf_unifiedsubj.fillna('')
cf_unifiedsubj

In [None]:
# List the distinct values found in the best diagnosis column of the per-subject database
cf_unifiedsubj['unified.diagnosis_best'].unique()

In [None]:
# Keep only the subjects whose best diagnosis is a DOC diagnosis (patients susceptible to being sedated)
cf_unifiedsubj_onlydoc = cf_unifiedsubj.loc[
    cf_unifiedsubj['unified.diagnosis_best'].isin([
        'vs/uws', 'mcs', 'mcs+', 'mcs-', 'emcs', 'srmcs',
        'coma', 'lis', 'lis_incomplete', 'partial lis',
        'conflict', 'braindead',
    ])
]
cf_unifiedsubj_onlydoc


In [None]:
# Display the per-patient DOC table built in the PREPARE DATASET section
# NOTE(review): the cells below keep working on this table, not on cf_unifiedsubj_onlydoc loaded just above - confirm this is intended
cf_unified_onlydoc_byname

In [None]:
# Show the number of diagnoses next to each patient name
cf_unified_onlydoc_byname.reset_index().loc[:, ['name', 'unified.diagnoses_count']]

In [None]:
# Look up the columns related to 'count' with the find_columns_matching helper
# (presumably a match on the column names - verify against csg_fileutil_libs.aux_funcs)
find_columns_matching(cf_unified_onlydoc_byname, ['count'])

In [None]:
# Extract max crsr count: cells holding a list of counts are reduced to their maximum, other cells are kept as is
cf_unified_onlydoc_byname['unified.diagnoses_count'] = cf_unified_onlydoc_byname['unified.diagnoses_count'].apply(
    lambda counts: counts if not isinstance(counts, list) else max(counts)
)
cf_unified_onlydoc_byname['unified.diagnoses_count']

In [None]:
# Select only patients with at least 2 CRS-Rs (else can't see any transition)
# .copy() makes the selection an independent dataframe: the cells below assign columns on it,
# which on a bare .loc slice can trigger SettingWithCopyWarning
cf_unified_onlydoc_byname_min2diag = cf_unified_onlydoc_byname.loc[cf_unified_onlydoc_byname['unified.diagnoses_count'] >= 2, :].copy()
# Drop the 'test test' patient
# errors='ignore' keeps this cell working when that dummy entry is absent (or when the cell is re-run),
# and reassigning the result avoids an in-place drop
cf_unified_onlydoc_byname_min2diag = cf_unified_onlydoc_byname_min2diag.drop('test test', errors='ignore')
# Show
cf_unified_onlydoc_byname_min2diag

In [None]:
# Histogram of the number of diagnoses per patient, using as many bins as the largest count
cf_unified_onlydoc_byname_min2diag['unified.diagnoses_count'].plot.hist(
    bins=max(cf_unified_onlydoc_byname_min2diag['unified.diagnoses_count'])
)

In [None]:
# Show cases where there are multiple best or worst diagnoses (which should not happen)
# A cell holding a list means several distinct values were found for this patient.
# Both the worst and the best diagnosis columns are checked, so that a patient with
# several best diagnoses but a single worst one is listed too.
multi_worst = cf_unified_onlydoc_byname_min2diag['unified.diagnosis_worst'].apply(lambda x: isinstance(x, list))
multi_best = cf_unified_onlydoc_byname_min2diag['unified.diagnosis_best'].apply(lambda x: isinstance(x, list))
conflictdiags = cf_unified_onlydoc_byname_min2diag.loc[multi_worst | multi_best, :].index
cf_unified_onlydoc_byname_min2diag.loc[conflictdiags, find_columns_matching(cf_unified_onlydoc_byname_min2diag, 'unified')]

In [None]:
# Fix cases where there are multiple best/worst diagnoses, by selecting the best/worst diagnosis respectively

# Cells that are already a single string are kept unchanged; any other cell is passed to compute_best_diag
# with the diagorder_doc ordering, and the minimum (worst column) or maximum (best column) of its result is kept
cf_unified_onlydoc_byname_min2diag.loc[:, 'unified.diagnosis_worst'] = cf_unified_onlydoc_byname_min2diag['unified.diagnosis_worst'].apply(
    lambda diags: diags if isinstance(diags, str) else compute_best_diag(diags, diag_order=diagorder_doc, persubject=None).min()
)
cf_unified_onlydoc_byname_min2diag.loc[:, 'unified.diagnosis_best'] = cf_unified_onlydoc_byname_min2diag['unified.diagnosis_best'].apply(
    lambda diags: diags if isinstance(diags, str) else compute_best_diag(diags, diag_order=diagorder_doc, persubject=None).max()
)


In [None]:
# Sanity check: display again the patients previously listed in conflictdiags,
# to verify that their diagnosis columns now hold a single value
cf_unified_onlydoc_byname_min2diag.loc[conflictdiags, find_columns_matching(cf_unified_onlydoc_byname_min2diag, 'unified')]

In [None]:
#a = cf_unified_onlydoc_byname_min2diag.loc[cf_unified_onlydoc_byname_min2diag['unified.diagnosis_worst'].apply(lambda x: isinstance(x, list)), find_columns_matching(cf_unified_onlydoc_byname_min2diag, 'unified')]
# correct:
#print(a['unified.diagnosis_worst'].apply(lambda x: compute_best_diag(x, diag_order=['', 'na', 'impossible'] + diagorder_doc + ['lis'], persubject=None).min()))
#print(a['unified.diagnosis_worst'].apply(lambda x: compute_best_diag(x, diag_order=['', 'na', 'impossible'] + diagorder_doc + ['lis'], persubject=None).max()))
# wrong:
#print(a['unified.diagnosis_worst'].apply(lambda x: min(compute_best_diag(x, diag_order=['', 'na', 'impossible'] + diagorder_doc + ['lis'], persubject=None)))
#print(a['unified.diagnosis_worst'].apply(lambda x: max(compute_best_diag(x, diag_order=['', 'na', 'impossible'] + diagorder_doc + ['lis'], persubject=None)))

In [None]:
#valsorder = ['AAA', 'BBBBBBBBB', 'CCCCC', 'DD', 'EEE']
#s = pd.Series(valsorder[1:4])
#s = s.astype(pd.api.types.CategoricalDtype(categories=valsorder, ordered=True))
#min(s)

In [None]:
def calc_transition_matrix(df, col1, col2, proba=True):
    """Count the transitions from the values of col1 (rows) to the values of col2 (columns).

    df: dataframe with one observation per row.
    col1: name of the column holding the source state (becomes the index of the result).
    col2: name of the column holding the target state (becomes the columns of the result).
    proba: True to return probabilities (each row is divided by its sum), or False to return counts.

    Returns a dataframe indexed by the unique values of col1, with the unique values of col2 as columns.
    When the states cannot be hashed (e.g. lists), both columns are converted to str
    and the matrix is labelled with these string representations.
    """
    sources, targets = df[col1], df[col2]
    try:
        row_labels, col_labels = sources.unique(), targets.unique()
    except TypeError:
        # Unhashable states: switch BOTH the labels and the values used for counting to str,
        # otherwise the raw values would not be found among the string labels
        sources, targets = sources.astype('str'), targets.astype('str')
        row_labels, col_labels = sources.unique(), targets.unique()
    tmat = pd.DataFrame(0, index=row_labels, columns=col_labels)
    # Iterate over the two columns only (no need to build a full row object per observation)
    for state1, state2 in zip(sources, targets):
        tmat.loc[state1, state2] += 1
    if proba:
        # Normalize each row so that it sums to 1
        tmat = tmat.div(tmat.sum(axis=1), axis=0)
    return tmat

# Transition probabilities from the worst diagnosis (rows) to the best diagnosis (columns),
# each patient with at least 2 diagnoses counting as one observation
tmat = calc_transition_matrix(cf_unified_onlydoc_byname_min2diag, 'unified.diagnosis_worst', 'unified.diagnosis_best', proba=True)
tmat

In [None]:
# Quick look at the transition matrix as an image (rows and columns are not reordered yet)
plt.matshow(tmat)

In [None]:
# Reorder columns and indices
# Easiest way: walk through the whole ordered list of diagnoses and keep only the labels present in the matrix
# (labels that are not listed in diagorder_doc are therefore left out)
tmat = tmat.loc[
    [x for x in diagorder_doc if x in tmat.index],
    [x for x in diagorder_doc if x in tmat.columns],
]

In [None]:
# LIMITATIONS OF THIS STUDY:
# * does not account for temporality between worst and best diagnosis, thus worst diagnosis may well be an evolution happening later than the best diagnosis. Here we show the possible transitions between both, should be considered bidirectional. Thus interpretation is not necessarily of an evolution but a possible transition between both states.
# we could change that but what criterion should we use? And what timeframe, if it's a daytoday fluctuation, should we consider this is ...? Or simply restrict analysis to all crs-r timeframe under 3 months, so we consider it's not evolution, only fluctuation or short term evolution.

# RESULTS
#* Most patients change diagnosis. Don't be fooled by the heatmap: both the heatmap and the graph are poor visualizations, because they do not show this main result. The problem with the heatmap is its colors: how can one add up the colors to see that the most salient cells do not in fact account for the majority of the changes?
#* SOLUTION: add 3 columns: worsening, no change and improvement, each being the sum of the corresponding transitions. Simple to calculate: same position in columns and index = no change, lower position in columns than in index = worsening, the opposite = improvement.
#* srmcs has a 50% chance of changing to emcs. We question the pertinence of requiring 2 consecutive fulfillments of the tasks.

# TODO:
#* account for bidirectionality by detecting order of worst and best diag?

In [None]:
def plotheatmap(df):
    """Draw df as a heatmap, labelling the y axis with its index and the x axis with its columns.

    Cells equal to 0 are left blank. The dataframe given by the caller is not modified.
    """
    # Blank out the 0 values on a masked copy: assigning NaN in place would silently
    # alter the caller's matrix, which is reused after this plot
    masked = df.mask(df == 0)
    plt.pcolor(masked, cmap=plt.get_cmap('viridis'))
    # Ticks are shifted by 0.5 to sit in the middle of each cell
    plt.yticks(np.arange(0.5, len(masked.index), 1), masked.index)
    plt.xticks(np.arange(0.5, len(masked.columns), 1), masked.columns)
    plt.show()
plotheatmap(tmat)

In [None]:
from __future__ import division  # Only for how I'm writing the transition matrix
import networkx as nx  # For the magic
import matplotlib.pyplot as plt  # For plotting

# Install pydot and graphviz beforehand, and change the path below on Windows to your graphviz folder
graphviz_bin = 'C:/Program Files (x86)/Graphviz2.38/bin/'
# Append the folder only once, so that re-running this cell does not keep growing PATH
if graphviz_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + graphviz_bin

# and the following code block is not needed
# but we want to see which module is used and
# if and why it fails
try:
    import pygraphviz
    from networkx.drawing.nx_agraph import write_dot
    print("using package pygraphviz")
except ImportError:
    try:
        import pydot
        from networkx.drawing.nx_pydot import write_dot
        print("using package pydot")
    except ImportError:
        # print("") outputs an empty line on both Python 2 and 3 (a bare print() shows "()" on Python 2)
        print("")
        print("Both pygraphviz and pydot were not found ")
        print("see  https://networkx.github.io/documentation/latest/reference/drawing.html")
        print("")
        raise

# to_pydot is needed by the rendering code whichever package provides write_dot,
# so it is imported outside of the pydot-only branch
from networkx.drawing.nx_pydot import to_pydot

def transition_to_graph(df):
    """Build a directed multigraph from a transition matrix.

    df: dataframe whose index holds the source states and whose columns hold the target states,
        each cell being the weight of the transition.

    Returns a networkx MultiDiGraph with one edge per strictly positive cell. Each edge carries
    the attributes `weight`, `penwidth` (weight * 10) and `label` (weight formatted with 2 decimals).
    Cells that are 0, negative or NaN produce no edge.
    """
    # Adapted from https://vknight.org/unpeudemath/code/2015/11/15/Visualising-markov-chains.html
    G = nx.MultiDiGraph(directed=True)

    for state1 in df.index:
        for state2 in df.columns:
            weight = df.loc[state1, state2]
            if weight > 0:
                G.add_edge(state1,
                           state2,
                           weight=weight,
                           penwidth=weight*10,
                           label="{:.02f}".format(weight))
    return G

def plot_transition_graph(G, pos=None):
    """Draw the transition graph G with matplotlib, labelling each edge with its weight.

    G: networkx graph whose edges carry a `weight` attribute.
    pos: optional dict of node positions. When None, the `pos` node attributes of G are used,
         and if G has none, a spectral layout is computed.
    """
    # https://stackoverflow.com/questions/20133479/how-to-draw-directed-graphs-using-networkx-in-python
    plt.figure(figsize=(14,7))
    #node_size = 200
    #pos = {state:list(state) for state in states}
    #nx.draw_networkx_edges(G,pos,width=1.0,alpha=0.5)
    #nx.draw_networkx_labels(G, pos, font_weight=2)
    options = {
        'node_color': 'cyan',
        'node_size': 2000,
        'width': 1,
        'arrowstyle': '-|>',
        'arrowsize': 30,
    }
    if pos is None:
        # Get the layout defined manually in G
        pos = nx.get_node_attributes(G,'pos')
        if not pos:
            # Else calculate a layout automatically
            #pos = nx.nx_pydot.graphviz_layout(G, prog='neato')
            pos = nx.drawing.layout.spectral_layout(G)
    nx.draw(G, pos, arrows=True, with_labels=True, **options)
    # Key the labels by (source, target) pairs: on a multigraph, get_edge_attributes returns
    # (source, target, key) triples, which draw_networkx_edge_labels cannot unpack
    labels = {(u, v): data['weight'] for u, v, data in G.edges(data=True) if 'weight' in data}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)
    plt.axis('off')

def plot_transition_graph2(G, pos=None):
    """Render the transition graph G with graphviz (through pydot) and show the image with matplotlib.

    G: networkx graph to draw. Its graph-level 'edge' and 'graph' attributes are overwritten
       with the graphviz layout options set below.
    pos: not used by this function.
    """
    # Imported locally, so that the function does not depend on names that the notebook only
    # imports after this definition, or only in one branch of the backend detection
    from io import BytesIO
    import matplotlib.image as mpimg
    from networkx.drawing.nx_pydot import to_pydot

    # add graphviz layout options (see https://stackoverflow.com/a/39662097)
    G.graph['edge'] = {'arrowsize': '0.6', 'splines': 'curved', 'rankdir':'LR'}
    G.graph['graph'] = {'scale': '3'}

    # adding attributes to edges in multigraphs is more complicated but see
    # https://stackoverflow.com/a/26694158
    #G[1][1][0]['color']='red'

    # From: https://stackoverflow.com/questions/4596962/display-graph-without-saving-using-pydot
    # convert from networkx -> pydot
    pydot_graph = to_pydot(G)

    # render pydot by calling dot, no file saved to disk
    png_str = pydot_graph.create_png(prog='dot') # can change to dot or twopi, but not neato because the latter is only for non directed graphs

    # treat the dot output as an image file
    # (PNG data is binary: BytesIO holds it on both Python 2 and Python 3)
    sio = BytesIO()
    sio.write(png_str)
    sio.seek(0)
    img = mpimg.imread(sio)

    # plot the image
    plt.figure(figsize=(40,15))
    imgplot = plt.imshow(img, aspect='equal')
    plt.axis('off')
    plt.show(block=False)

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    # cStringIO does not exist on Python 3; the buffer receives binary PNG data,
    # so BytesIO is the matching in-memory file there
    from io import BytesIO as StringIO
import matplotlib.image as mpimg

# Build the graph from the transition matrix, save it in dot format and display it
G = transition_to_graph(tmat)
# set position manually
#for i, n in enumerate(G):
#    G.node[n]['pos'] = '"%d,%d"' % (i, 1)
write_dot(G, 'mc.dot')
plot_transition_graph2(G)

In [None]:
# Same rendering steps as plot_transition_graph2, done by hand in order to tune the graph attributes
# convert from networkx -> pydot
pydot_graph = to_pydot(G)
# Graph attributes set through pydot before rendering
pydot_graph.set_concentrate(True)
pydot_graph.set_layout('dot')
pydot_graph.set_dpi(300)
pydot_graph.set_pack(True)
#pydot_graph.set_rank('same')
pydot_graph.set_splines('line')

# render pydot by calling dot, no file saved to disk
png_str = pydot_graph.create_png(prog='dot') # can change to dot or twopi, but not neato because the latter is only for non directed graphs

# treat the dot output string as an image file:
# write it into an in-memory buffer, then rewind so that imread reads it from the start
sio = StringIO()
sio.write(png_str)
sio.seek(0)
img = mpimg.imread(sio)

# plot the image
plt.figure(figsize=(40,15))
imgplot = plt.imshow(img, aspect='equal')
plt.axis('off')
plt.show(block=False)

In [None]:
# List the distinct best diagnoses among the patients with at least 2 diagnoses (values compared as str)
cf_unified_onlydoc_byname_min2diag['unified.diagnosis_best'].astype('str').unique()

In [None]:
# Look up the columns related to 'unified' with the find_columns_matching helper
# (presumably a match on the column names - verify against csg_fileutil_libs.aux_funcs)
find_columns_matching(cf_unified_onlydoc_byname_min2diag, 'unified')