In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import subprocess
from datetime import date
import re
# Raise pandas' display limits so wide/long QC tables are shown in full.
# Every option is addressed by its full 'display.*' key: abbreviated keys only
# work while they match exactly one registered option.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from importlib import reload

## EDIT THESE VARIABLES
analysis_version = "2017_10_19"
project_dir = Path('/Users/rodgersleejg/data/hpc/NNDSP') # needs to be pathlib.Path object

# Fail early with a clear message instead of building a directory tree under a
# mistyped project root.
if not project_dir.is_dir():
    raise FileNotFoundError('project_dir does not exist: {}'.format(project_dir))

bids_dir = project_dir.joinpath('bids_2017_07_14')

# conf_script = mriqc_dir.joinpath('conf' + analysis_version + '.sh')

# Directories used by this analysis, all located below project_dir.
mriqc_dir  = project_dir.joinpath('anal/mriqc_files/other_files')
output_folder =  project_dir / 'derivatives' / 'mriqc'
classifier_output =  output_folder.joinpath('classifier')
base_work_dir = output_folder.joinpath('work')
log_dir = mriqc_dir.joinpath('swarm_output_' +  analysis_version)

# Create whichever directories are missing. parents=True also creates missing
# intermediate directories (mriqc_dir and output_folder sit several levels
# below project_dir, where a bare mkdir() raises FileNotFoundError), and
# exist_ok=True makes re-running this cell a no-op.
for directory in (mriqc_dir, output_folder, classifier_output, base_work_dir, log_dir):
    directory.mkdir(parents=True, exist_ok=True)

manual_qc = output_folder.joinpath('manual_qc_round_2.tsv')
# swarm_path = mriqc_dir.joinpath('mriqc_' + analysis_version + '.cmd')

# NOTE: these three paths are relative, so they are resolved against the
# current working directory at the time they are used.
mriqc_with_predictions = Path('derivatives/mriqc/with_mriqc_predictions.csv')
plottable_data = Path('derivatives/mriqc/classifer_plot_data.pklz')
qc_data = Path('anal/mriqc_files/other_files/qc_pickle_for_v2_exploration.pklz')

# Switch the kernel's working directory to the project root. The relative
# paths defined in this cell are resolved against the working directory.
%pwd
%cd {project_dir}
%pwd

# Project-local helper module, bound to the name pd_custom.
# NOTE(review): presumably importable only because the cwd is now the project
# root -- confirm.
import anal.python_modules.inner_merge_and_report as pd_custom

# Dependencies

In [None]:
# Bring the QC data up to date by executing the exploration notebook inside
# this kernel. The notebook path is relative to the current working directory.
print('Running mriqc_exploration.ipynb from mriqc_performance.ipynb:')
%run 'anal/mriqc_files/analysis_notebooks/mriqc_exploration.ipynb'
# NOTE: the exploration notebook run above is itself dependent on merge_qc_file.

### Get performance metrics for the classifier 

In [None]:
# Load the pickled QC table stored at `qc_data` and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by this project.
df_qc_full = pd.read_pickle(qc_data)
df_qc_full.head()

In [None]:
# Import the project's classification helpers, then reload the module so that
# edits made to its source after the first import are picked up without
# restarting the kernel.
from anal.python_modules import classification
reload(classification)

In [None]:
# Debugger handle kept for interactive troubleshooting of the scoring call,
# e.g. ipdb.runcall(classification.get_classification_scores, df, col_true, col_prob)
from IPython.core.debugger import Pdb; ipdb=Pdb()

manual_metrics = ['Freesurfer_avg_ext_rating', 'Freesurfer_avg_int_rating', 'MPRAGE']
classifier_metrics = ['tpr','fpr','fdr','fp','tp','fn','tn']
col_prob = 'prob_y'  # column holding the classifier's predicted probability
threshold = 3        # a manual rating >= threshold is binarized to True

# Work on a copy so df_qc_full stays untouched.
df_qc_temp = df_qc_full.copy()
for metric in manual_metrics:
    # Binarize the manual rating. NOTE: rows with a missing rating compare as
    # False here.
    col_true = metric + '_thresholded'
    df_qc_temp[col_true] = df_qc_temp[metric] >= threshold

    df_scores = classification.get_classification_scores(df_qc_temp, col_true, col_prob)

    # Fold this metric's scores back into the accumulating frame so that the
    # scores of *every* metric survive the loop (previously each pass replaced
    # the result and only the last metric's scores were kept). Columns that
    # the scores frame shares with the accumulator are dropped first, so the
    # concat cannot create duplicate column names.
    # NOTE(review): assumes the returned frame is row-aligned with its input
    # and that its score columns are metric-specific -- confirm against
    # classification.get_classification_scores.
    df_qc_temp = pd.concat(
        [df_qc_temp.drop(columns=df_scores.columns, errors='ignore'), df_scores],
        axis=1,
    )

df_performance = df_qc_temp
del df_qc_temp
df_performance.head()

### Gather value columns together using melt and create labels

Value columns must all have the same dtype so that they can be melted into a single column.

In [None]:
# Cast the binarized '*_thresholded' columns to float so that every value
# column shares one dtype before melting. astype() with a column -> dtype
# mapping returns a frame whose columns really carry the new dtype; assigning
# through .loc[:, cols] writes into the existing columns instead and does not
# reliably change their dtype on recent pandas versions.
t_cols = df_performance.filter(regex = '^(Free|MP).*(' + 'thresholded' + ')').columns
df_performance = df_performance.astype({col: float for col in t_cols})

# Value columns: manual-QC columns (prefix Free*/MP*) whose name contains one
# of the classifier metrics or 'thresholded'.
tail_of_regex = '|'.join(classifier_metrics) + '|thresholded'
cols_regex = '^(Free|MP).*(' + tail_of_regex + ')'
value_cols = df_performance.filter(regex= cols_regex, axis=1).columns
# Identifier columns that are carried along unchanged when melting.
ids_to_keep = pd.Index(['MASKID','run','prob_y', 'pred_y','threshold'])

print('Regex for value columns to be melted,separated and pivoted: ', cols_regex)
print('ids: ',ids_to_keep,'\n\n\nvalues: ',value_cols)

In [None]:
# Long format: one row per (id columns, value column) pair.
df_long = df_performance.melt(
    id_vars=ids_to_keep,
    value_vars=value_cols,
    var_name='binarized_manual_qc_scores',
    value_name='value',
)

# Split each melted column name into the manual QC type and the kind of value
# it holds, using two named groups that become two new columns.
name_pattern = '(?P<manual_qc_type>.*)_(?P<value_type>' + tail_of_regex  + ')'
name_parts = df_long['binarized_manual_qc_scores'].str.extract(name_pattern, expand=True)

df_melted = pd.concat([df_long, name_parts], axis=1)
df_melted.head()

### Create a column each for the tpr and fpr variables

In [None]:
# Index columns for the pivot; the last one ('value_type') is moved into the
# column axis by unstack().
cols = ['MASKID','run', 'manual_qc_type','prob_y','pred_y','value_type']
df_roc = (
    df_melted.loc[:, cols + ['value']]
    .set_index(cols)
    .unstack()
    .reset_index()
)
# Second column level, minus its last entry.
# NOTE(review): this relies on the last entry being the empty label that
# reset_index() adds for the former index columns -- confirm.
cols_from_pivot = df_roc.columns.levels[1][:-1]
df_roc.head()

In [None]:
# Replace the two-level column index by flat names: the id columns (all of
# `cols` but the last) followed by the pivoted value types.
flat_names = cols[:-1] + list(cols_from_pivot)
df_roc.columns = flat_names

# Derived columns built from the false-positive and true-positive columns.
false_pos, true_pos = df_roc['fp'], df_roc['tp']
df_roc['fpratio'] = false_pos / true_pos
df_roc['positive'] = false_pos + true_pos

# Persist the table to `plottable_data`, then preview it.
df_roc.to_pickle(plottable_data)
df_roc.head()

In [None]:
# Non-null counts per column for the MPRAGE rows, grouped by the 'thresholded' column.
mprage_rows = df_roc.query('manual_qc_type =="MPRAGE"')
mprage_rows.groupby('thresholded').count()

In [None]:
# Report the saved file's path together with the number of MPRAGE rows.
n_mprage_rows = len(df_roc.query('manual_qc_type == "MPRAGE"'))
print(plottable_data, 'Size:', n_mprage_rows)