In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
# Global plotting defaults for this notebook: colour palette, a white
# background with a dotted grid, no top/right spines, outward-pointing ticks.
sns.set_palette('Dark2')
sns.set_style({'axes.axisbelow': True, 'axes.edgecolor': '.15', 'axes.facecolor': 'white',
               'axes.grid': True, 'axes.labelcolor': '.15', 'axes.linewidth': 1.25, 
               'figure.facecolor': 'white', 'font.family': ['sans-serif'], 'grid.color': '.15',
               'grid.linestyle': ':', 'grid.alpha': .5, 'image.cmap': 'Greys', 
               'legend.frameon': False, 'legend.numpoints': 1, 'legend.scatterpoints': 1,
               'lines.solid_capstyle': 'round', 'axes.spines.right': False, 'axes.spines.top': False,  
               'text.color': '.15',  'xtick.top': False, 'ytick.right': False, 'xtick.color': '.15',
               'xtick.direction': 'out', 'xtick.major.size': 6, 'xtick.minor.size': 3,
               'ytick.color': '.15', 'ytick.direction': 'out', 'ytick.major.size': 6,'ytick.minor.size': 3})
# 'talk' is seaborn's preset with enlarged fonts and line widths.
sns.set_context('talk')

# Font type 42 embeds TrueType fonts in PDF/PS output so text stays editable
# in vector graphics editors; background: http://phyletica.org/matplotlib-fonts/
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:
from snapanalysis.config import OUTPUT_DIRECTORY as OUTPUT_DIRECTORY_MAIN

# All files produced by this notebook go into a 'preprocessing' sub-directory
# of the project-wide output directory.
OUTPUT_DIRECTORY = os.path.join(OUTPUT_DIRECTORY_MAIN, 'preprocessing')
# exist_ok=True creates the directory only when needed, without the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)

In [26]:
# Formatted workbook.
OUTPUT_EXCEL = os.path.join(
    OUTPUT_DIRECTORY,
    'table-heatmap.xlsx',
)
# Machine-readable companion files, one gzipped TSV per exported table.
# The template is filled in with `sheet_no` (zero-padded to two digits) and
# `sheet_name` at write time.
OUTPUT_EXCEL_SHEET_TSV_GZ = os.path.join(
    OUTPUT_DIRECTORY,
    'table-heatmap.sheet.{sheet_no:02}.{sheet_name}.tsv.gz',
)

In [4]:
from snapanalysis.preprocessing.cleanup.main import OUTPUT_FILE as ENRICHMENT_MODEL_INPUT
from snapanalysis.models.enrichment.generate import RATIO_COLUMN
from snapanalysis.models.enrichment.generate import rotate, EnrichmentDecoposition
from snapanalysis.models.enrichment.generate import OUTPUT_FILE as ENRICHMENT_OUTPUT

from snapanalysis.visualisation.heatmaps.pattern_heatmap import PULLDOWN_ORDER


In [5]:
# Re-import the helper module so that edits made to it on disk are picked up
# without restarting the kernel.
import importlib
import snapanalysis.visualisation.heatmaps.excel_pattern_heatmap
importlib.reload(snapanalysis.visualisation.heatmaps.excel_pattern_heatmap)
# NOTE(review): the star import hides where names come from; within this
# notebook only `excel_predictors_header` appears to be used from this module
# -- consider importing it explicitly once that is confirmed.
from snapanalysis.visualisation.heatmaps.excel_pattern_heatmap import *

In [6]:
from snapanalysis.preprocessing.pulldown_metadata import OUTPUT_FILE as PD_META_OUTPUT

In [7]:
from snapanalysis.preprocessing.protein_metadata import get_generic_metadata, get_complex_memberships

In [8]:
# Inputs: the enrichment table plus the protein annotation stored next to the
# enrichment model input.
enrichment_data = pd.read_hdf(ENRICHMENT_OUTPUT, 'enrichment_data')
protein_meta = pd.read_hdf(ENRICHMENT_MODEL_INPUT, 'protein_meta')
gm = get_generic_metadata()

# One '/'-separated, alphabetically sorted string of memberships per gene label.
complex_memberships = (
    get_complex_memberships()
    .groupby(level='Gene label')
    .apply(lambda members: '/'.join(sorted(members)))
)
complex_memberships.name = 'Complex memberships'

# Columns shown next to the heatmap, and the longer list used for the
# stand-alone protein sheet (the essential columns come first in both).
essential_columns = [
    'Gene names',
    'Protein names',
    'Complex memberships',
]
excel_columns = essential_columns + [
    'Gene names (alternative)',
    'Majority protein IDs',
    'Protein IDs',
]

protein_meta_for_excel = protein_meta.join(gm).join(complex_memberships)
protein_meta_for_excel = protein_meta_for_excel[excel_columns]

protein_meta_essential = protein_meta_for_excel[essential_columns]

In [9]:
# Sanity check: how many data points fall into each imputation category.
enrichment_data['Imputation type'].value_counts()

zero-fill           25836
ratio projection     9619
max enrichment          6
Name: Imputation type, dtype: int64

In [10]:
# Integer code written to the spreadsheet for each imputation category.
imputation_mapping = {
    'zero-fill': 1,
    'ratio projection': 2,
    'max enrichment': 3
}

# Translate the category labels to their codes, then pivot to a
# (Gene label) x (Pull-Down ID) table.
imputation_codes = enrichment_data['Imputation type'].apply(imputation_mapping.get)
numeric_imputation = imputation_codes.unstack('Pull-Down ID')

In [11]:
# Preview the pivoted imputation codes (rows: gene labels, columns: pull-downs).
numeric_imputation.head()

Pull-Down ID,H01,H01M,H02,H03,H03M,H04,H04M,H05,H06,H07,...,H40,H41,H42,H43,H44,H45,H46,H46M,H47,H47M
Gene label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A087X222,,1.0,,,1.0,,1.0,1.0,,,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
A0A0C4DFX4,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,...,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
A0A0C4DGP2,,,,,,,,,,,...,,,,,,,,,,
A0A0C4DGP5,,,,,,,,2.0,2.0,,...,2.0,,,,,,,,,
AAAS,,1.0,,,1.0,,1.0,,,,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
forward_column = 'Ratio H/L normalized (log2) (adjusted, imputed, forward)'
reverse_column = 'Ratio H/L normalized (log2) (adjusted, imputed, reverse)'

# Keep only the two imputed ratio columns and give them short direction labels.
ratios = enrichment_data[[forward_column, reverse_column]]
ratios.columns = pd.Index(['Forward', 'Reverse'], name='Direction')

# Pivot pull-downs into the columns, ordered as (Pull-Down ID, Direction).
ratios = ratios.unstack('Pull-Down ID')
ratios.columns = ratios.columns.swaplevel()
ratios = ratios.sort_index(axis=1)

# Masking zeros first means rows that are zero (or missing) everywhere are dropped.
has_signal = ratios[ratios != 0].any(axis=1)
ratios = ratios.loc[has_signal]

# Copy in which the reverse ratios have their sign flipped, so that both
# directions point the same way.
ratios_dedirectionalised = ratios.copy()
ratios_dedirectionalised.loc(axis=1)[:, 'Reverse'] *= -1

In [13]:
from scipy.cluster import hierarchy

# Cluster the rows of the sign-aligned ratio table: complete linkage on
# correlation distance, with leaves re-ordered optimally.
linkage = hierarchy.linkage(
    ratios_dedirectionalised,
    method='complete',
    metric='correlation',
    optimal_ordering=True,
)

In [14]:
# Positions of the rows in dendrogram (left-to-right leaf) order ...
leaf_positions = hierarchy.dendrogram(linkage, no_plot=True)['leaves']
# ... translated into the corresponding row labels.
order = ratios_dedirectionalised.index[leaf_positions]

In [15]:
# Put every row-aligned table into the clustered row order.
ratios, protein_meta_essential, protein_meta_for_excel, numeric_imputation = (
    table.loc[order]
    for table in (ratios, protein_meta_essential, protein_meta_for_excel, numeric_imputation)
)

In [16]:
# Arrange the pull-down columns in the order given by PULLDOWN_ORDER; the
# reindex matches on the 'Pull-Down ID' level of the column MultiIndex.
ratios = ratios.reindex(PULLDOWN_ORDER, level='Pull-Down ID', axis=1)

In [17]:
# Give the imputation table the same (Pull-Down ID, Direction) columns, in the
# same order, as `ratios`; values are matched on level 0 of those columns.
numeric_imputation = numeric_imputation.reindex(ratios.columns, axis=1, level=0)

In [18]:
# Inspect the imputation codes after row clustering and column re-indexing.
numeric_imputation

Pull-Down ID,H27M,H27M,H39M,H39M,H39,H39,H07M,H07M,H07,H07,...,H38,H38,H44,H44,H37,H37,H45,H45,H36,H36
Direction,Forward,Reverse,Forward,Reverse,Forward,Reverse,Forward,Reverse,Forward,Reverse,...,Forward,Reverse,Forward,Reverse,Forward,Reverse,Forward,Reverse,Forward,Reverse
Gene label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
CTNNBL1,,,,,2.0,2.0,2.0,2.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CLK4,,,,,1.0,1.0,,,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0
RCC1 (2),1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ZNF395,1.0,1.0,1.0,1.0,,,1.0,1.0,,,...,1.0,1.0,1.0,1.0,,,1.0,1.0,,
LIN9,1.0,1.0,1.0,1.0,,,1.0,1.0,,,...,1.0,1.0,1.0,1.0,,,1.0,1.0,,
MYBL2,1.0,1.0,1.0,1.0,,,1.0,1.0,,,...,1.0,1.0,1.0,1.0,,,1.0,1.0,,
LIN54,1.0,1.0,1.0,1.0,,,1.0,1.0,,,...,1.0,1.0,1.0,1.0,,,1.0,1.0,,
LIN37,1.0,1.0,1.0,1.0,,,1.0,1.0,,,...,1.0,1.0,1.0,1.0,,,1.0,1.0,,
LIN52,1.0,1.0,1.0,1.0,,,1.0,1.0,,,...,1.0,1.0,1.0,1.0,,,1.0,1.0,,
ZNF66,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
color_red = '#7E3E4D'
color_blue = '#41597E'
color_gray = '#F9F9F9'
color_white = '#FFFFFF'


def conditional_format_dict(absmax=4,
                            reverse=False,
                            color_max=color_red,
                            color_min=color_blue,
                            color_mid=color_white):
    """Build the options dict for a symmetric three-colour scale.

    The scale runs from ``-absmax`` through ``0`` to ``+absmax``.

    Parameters
    ----------
    absmax : number
        Magnitude at which the scale saturates at either end.
    reverse : bool
        When true, the colours of the two ends are exchanged.
    color_max, color_min, color_mid : str
        Colours for the upper end, the lower end and the midpoint.

    Returns
    -------
    dict
        Options with the keys ``type``, ``*_type``, ``*_color`` and ``*_value``.
    """
    low_color, high_color = (color_max, color_min) if reverse else (color_min, color_max)

    return dict(
        type='3_color_scale',
        min_type='num',
        mid_type='num',
        max_type='num',
        min_color=low_color,
        mid_color=color_mid,
        max_color=high_color,
        min_value=-absmax,
        mid_value=0,
        max_value=absmax,
    )

In [20]:
def conditional_format_formula(formula, format_):
    """Build the options dict for a formula-driven conditional format.

    Parameters
    ----------
    formula : str
        Formula stored under the ``criteria`` key.
    format_ : object
        Format stored under the ``format`` key.

    Returns
    -------
    dict
        ``{'type': 'formula', 'criteria': formula, 'format': format_}``.
    """
    return dict(type='formula', criteria=formula, format=format_)

In [21]:
import re
def lowercase_alphanumeric(filename):
    """Return `filename` lower-cased, with every run of characters other than
    ``a-z`` and ``0-9`` replaced by a single underscore."""
    return re.sub('[^a-z0-9]+', '_', filename.lower())

In [22]:
# Quick check: upper case is folded and runs of punctuation/spaces collapse to '_'.
lowercase_alphanumeric('this is a TEST!!!1')

'this_is_a_test_1'

In [27]:
def _excel_cell_ref(row, col):
    """Return the A1-style reference (e.g. ``'G16'``) of a zero-indexed cell."""
    letters = ''
    col += 1
    while col:
        col, remainder = divmod(col - 1, 26)
        letters = chr(ord('A') + remainder) + letters
    return '{}{}'.format(letters, row + 1)


# Writes three sheets ('Heatmap', 'List of proteins', 'Imputation type') to
# OUTPUT_EXCEL and a gzipped TSV copy of each exported table.
writer = pd.ExcelWriter(OUTPUT_EXCEL, engine='xlsxwriter')
workbook = writer.book

# -- Cell formats --

format_bold = workbook.add_format({
    'bold': True
})

border_type = 4
light_border_type = 4

format_right_border = workbook.add_format({
    'right': border_type,
})

format_bold_right_border = workbook.add_format({
    'bold': True,
    'right': border_type,
})

format_bold_right_bottom_border = workbook.add_format({
    'bold': True,
    'right': border_type,
    'bottom': border_type,
})

format_ptm_header = workbook.add_format({
    'bold': True,
    'shrink': True,
    'valign': 'vcenter',
    'right': border_type,
    'align': 'right'})

format_top_border = workbook.add_format({
    'top': border_type,
})

format_bold_bottom_border = workbook.add_format({
    'bottom': border_type,
    'bold': True,
})

format_top_border_format_numeric = workbook.add_format({
    'top': border_type,
    'num_format': '0.00',
    'font_size': 8,
})

format_top_border_format_numeric_light_border = workbook.add_format({
    'top': border_type,
    'num_format': '0.00',
    'font_size': 8,
    'right': light_border_type,
})

format_numeric = workbook.add_format({
    'num_format': '0.00',
    'font_size': 8,
})

format_numeric_light_right_border = workbook.add_format({
    'num_format': '0.00',
    'font_size': 8,
    'right': light_border_type,
})

format_ratio_imputed = workbook.add_format({
    'bg_color': '#9EDEA8'
})

format_max_imputed = workbook.add_format({
    'bg_color': '#F0CF8F'
})

format_zero_imputed = workbook.add_format({
    'bg_color': '#B099E2'
})

format_imputed_heatmap_cell = workbook.add_format({
    'pattern': 18,
})

format_imputed_heatmap_cell_legend = workbook.add_format({
    'pattern': 18,
    'fg_color': 'black',
    'bg_color': 'white',
})

format_bold_centered = workbook.add_format({
    'bold': True,
    'align': 'center'
})

# -- Sheet layout (all indices are zero-based; `end_*` values are inclusive) --

sheet_name = 'Heatmap'

start_row_header = 0
start_col_metadata = 2
start_col_heatmap = len(protein_meta_essential.columns) + 3
end_col_heatmap = start_col_heatmap + len(ratios.columns) - 1
start_row_data = 15
end_row_data = start_row_data + len(ratios) - 1

# Top-left data cell in A1 notation. Formula-based conditional formats are
# written relative to the top-left cell of the range they apply to, so this
# has to follow the computed anchors rather than be hard-coded.
first_data_cell = _excel_cell_ref(start_row_data, start_col_heatmap)

# -- Heatmap header --
excel_predictors_header(ratios.columns, writer,
                        sheet_name=sheet_name,
                        start_row=start_row_header,
                        start_col=start_col_heatmap-1,
                        format_ptm_header=format_ptm_header,
                        write_direction=False)

worksheet = writer.sheets[sheet_name]

# -- Heatmap data --

ratios_flat = ratios.copy()
# Flatten the (Pull-Down ID, Direction) column MultiIndex to plain strings
# before exporting.
ratios_flat.columns = ['-'.join(c) for c in ratios_flat.columns]

# The first data row is written by hand below because it needs a top border.
ratios_flat.iloc[1:].to_excel(writer, sheet_name=sheet_name,
                              header=False,
                              index=False,
                              startcol=start_col_heatmap,
                              startrow=start_row_data+1)

ratios_flat.to_csv(OUTPUT_EXCEL_SHEET_TSV_GZ.format(sheet_no=1,
                                                    sheet_name=lowercase_alphanumeric(sheet_name)),
                   sep='\t', index=True)

# First data row: top border everywhere, plus a right border on every second
# column so that each pair of columns is visually grouped.
for offset, value in enumerate(ratios_flat.iloc[0]):
    if offset % 2 == 0:
        format_ = format_top_border_format_numeric
    else:
        format_ = format_top_border_format_numeric_light_border

    worksheet.write(start_row_data, start_col_heatmap + offset, value, format_)

# Colour scale, one column at a time: reverse columns use the flipped scale.
for col_number, col in enumerate(ratios.columns, start=start_col_heatmap):
    reverse = col[1] == 'Reverse'

    worksheet.conditional_format(
        start_row_data, col_number,
        end_row_data, col_number,
        conditional_format_dict(reverse=reverse)
    )

# -- Essential metadata --

# Move the index ('Gene label') behind the metadata columns so that it ends up
# directly left of the heatmap.
pme = protein_meta_essential.reset_index()
pme = pme[list(protein_meta_essential.columns) + [protein_meta_essential.index.name]]
pme.to_excel(writer, sheet_name=sheet_name,
             startcol=start_col_metadata, startrow=start_row_data,
             header=False, index=False)

pme.to_csv(OUTPUT_EXCEL_SHEET_TSV_GZ.format(sheet_no=1,
                                            sheet_name=lowercase_alphanumeric(sheet_name + '_metadata')),
           sep='\t', index=False)

# Metadata header: only the last metadata column gets the right border that
# separates the metadata from the heatmap. The comparison must account for the
# column offset of the metadata block.
last_metadata_col = start_col_metadata + len(pme.columns) - 1
for i, col in enumerate(pme.columns, start=start_col_metadata):
    if i == last_metadata_col:
        format_ = format_bold_right_bottom_border
    else:
        format_ = format_bold_bottom_border

    worksheet.write(start_row_data-1, i, col, format_)

# -- Legend --

worksheet.merge_range(2, 1, 2, 2, 'Ratio H/L (log2)', format_bold)

ratio_legend = [
    (-4, 'Strong exclusion'),
    (-2, 'Exclusion'),
    (0, 'No response'),
    (2, 'Recruitment'),
    (4, 'Strong recruitment'),
]
for legend_row, (value, label) in enumerate(ratio_legend, start=3):
    worksheet.write(legend_row, 1, value)
    worksheet.write(legend_row, 2, label)

worksheet.conditional_format(
    3, 1,
    7, 1,
    conditional_format_dict()
)

worksheet.merge_range(9, 1, 9, 2, 'Other', format_bold)
worksheet.write(10, 1, '', format_imputed_heatmap_cell_legend)
worksheet.write(10, 2, 'Imputed datapoint')

# -- Column widths --

# Legend
worksheet.set_column(0, 0, 1.5, None)
worksheet.set_column(1, 1, 3, None)

# Gene name
worksheet.set_column(2, 2, 15, None)
# Protein name
worksheet.set_column(3, 3, 25, None)
# Complex name
worksheet.set_column(4, 4, 15, None)
# Gene label
worksheet.set_column(5, 5, 15, format_right_border)

# Heatmap columns: one set_column call per column, so that each column keeps
# its own format (every second column carries the right border).
for offset, col_i in enumerate(range(start_col_heatmap, end_col_heatmap + 1)):
    if offset % 2 == 0:
        format_ = format_numeric
    else:
        format_ = format_numeric_light_right_border

    worksheet.set_column(col_i, col_i, 2.5, format_)

worksheet.freeze_panes(start_row_data, start_col_heatmap)

# -- Protein metadata --

pmfe = protein_meta_for_excel.reset_index().sort_values(by='Gene label')
pmfe.to_excel(writer,
              sheet_name='List of proteins',
              startrow=1, header=False, index=False)

pmfe.to_csv(OUTPUT_EXCEL_SHEET_TSV_GZ.format(sheet_no=2,
                                             sheet_name=lowercase_alphanumeric('List of proteins')),
            sep='\t', index=False)

sheet_protein = writer.sheets['List of proteins']
for i, col in enumerate(pmfe.columns):
    if i == 0:
        format_ = format_bold_right_bottom_border
    else:
        format_ = format_bold_bottom_border

    sheet_protein.write(0, i, col, format_)

# Gene label
sheet_protein.set_column(0, 0, 15, format_bold)
# Gene names
sheet_protein.set_column(1, 1, 30)
# Protein names
sheet_protein.set_column(2, 2, 50)
# Complex memberships
sheet_protein.set_column(3, 3, 30)
# Gene names (alternative)
sheet_protein.set_column(4, 4, 30)

# IDs
sheet_protein.set_column(5, 6, 50)

sheet_protein.freeze_panes(1, 1)

# -- Imputation --
sheet_name_imputation = 'Imputation type'

excel_predictors_header(numeric_imputation.columns, writer,
                        sheet_name=sheet_name_imputation,
                        start_row=start_row_header,
                        start_col=start_col_heatmap-1,
                        format_ptm_header=format_ptm_header,
                        write_direction=True)

worksheet_imputation = writer.sheets[sheet_name_imputation]

imputation_flat = numeric_imputation.copy()
# Flatten the column MultiIndex to plain strings before exporting.
imputation_flat.columns = ['-'.join(c) for c in imputation_flat.columns]

imputation_flat.to_excel(writer, sheet_name=sheet_name_imputation,
                         header=False,
                         index=False,
                         startcol=start_col_heatmap,
                         float_format='%.0f',
                         startrow=start_row_data)

imputation_flat.to_csv(OUTPUT_EXCEL_SHEET_TSV_GZ.format(sheet_no=3,
                                                        sheet_name=lowercase_alphanumeric(sheet_name_imputation)),
                       sep='\t', index=True)

# Row labels go into the column directly left of the data.
for row_number, value in enumerate(imputation_flat.index, start=start_row_data):
    worksheet_imputation.write(row_number, start_col_heatmap-1, value, format_bold)

# Legend
legend_first_row = 3
legend_last_row = 6
legend_col = 1

worksheet_imputation.merge_range(1, 1, 1, 2, 'Legend', format_bold_centered)

worksheet_imputation.write(2, 1, 'Value', format_bold)
worksheet_imputation.write(2, 2, 'Imputation type', format_bold)

# The codes shown in the legend are read from `imputation_mapping`, so they
# cannot drift from the codes stored in the data cells.
imputation_legend = [
    ('<empty>', 'No imputation'),
    (imputation_mapping['zero-fill'], 'Imputation with zero'),
    (imputation_mapping['ratio projection'], 'Imputation using other H/L ratio'),
    (imputation_mapping['max enrichment'], 'Imputation using maximum value for PD'),
]
for legend_row, (value, label) in enumerate(imputation_legend, start=legend_first_row):
    worksheet_imputation.write(legend_row, legend_col, value)
    worksheet_imputation.write(legend_row, legend_col + 1, label)

# Background colour per imputation code, for the legend and for the data cells.
imputation_formats = [
    (imputation_mapping['ratio projection'], format_ratio_imputed),
    (imputation_mapping['max enrichment'], format_max_imputed),
    (imputation_mapping['zero-fill'], format_zero_imputed),
]

# -- legend --
legend_first_cell = _excel_cell_ref(legend_first_row, legend_col)
for code, format_ in imputation_formats:
    worksheet_imputation.conditional_format(
        legend_first_row, legend_col, legend_last_row, legend_col,
        conditional_format_formula(
            formula='={}={}'.format(legend_first_cell, code),
            format_=format_,
        )
    )

# -- data --
for code, format_ in imputation_formats:
    worksheet_imputation.conditional_format(
        start_row_data, start_col_heatmap, end_row_data, end_col_heatmap,
        conditional_format_formula(
            formula='={}={}'.format(first_data_cell, code),
            format_=format_,
        )
    )

worksheet_imputation.set_column(0, 0, 1.5)
worksheet_imputation.set_column(1, 1, 5)
worksheet_imputation.set_column(2, 2, 30)
worksheet_imputation.set_column(3, 4, None, None, {'hidden': True})

worksheet_imputation.set_column(5, 5, 15, None)
worksheet_imputation.set_column(start_col_heatmap, end_col_heatmap, 2.5, None)

worksheet_imputation.freeze_panes(start_row_data, start_col_heatmap)

# Mark every heatmap cell whose counterpart on the imputation sheet holds a
# code (> 0) with the pattern fill.
worksheet.conditional_format(
    start_row_data, start_col_heatmap, end_row_data, end_col_heatmap,
    conditional_format_formula(
        formula="='{}'!{}>0".format(sheet_name_imputation, first_data_cell),
        format_=format_imputed_heatmap_cell,
    )
)

# close() writes the workbook to disk; unlike ExcelWriter.save(), it is
# available in both old and current pandas releases.
writer.close()

In [24]:
# Completion marker: reaching this cell means the workbook cell above ran through.
print('Done, whew')

Done, whew
