In [1]:
# you need to download and extract the SHARPR package for this to work
# https://ernstlab.biolchem.ucla.edu/SHARPR/SHARPR.zip
# read the manual if something doesn't work

import tempfile
import os
import numpy as np
import pandas as pd

def compute_sharpr_from_tiles(tile_fn, varprior_1=1, varprior_2=None):
    """ Computes SHARPR scores from tile matrix.

        Format of tile_fn is the same as SHARPR tile matrix format, tab-separated,
        rows are regions and columns are tiles. 
    """
    tilelength = 145
    stepsize = 5
    numtilepos = 31

    if varprior_2 is None:
        varprior_2 = varprior_1
    
    with tempfile.TemporaryDirectory() as tmpdir:
        inputtablefile = tile_fn
        inferenceoutputfile_1 = f'{tmpdir}/inference_out.varprior_{varprior_1}.tsv'
        
        cmd_ = f"java -jar /u/home/m/mardren/scratch/SHARPR/SHARPR.jar Infer {inputtablefile} {inferenceoutputfile_1} {varprior_1} {tilelength} {stepsize} {numtilepos}"
        os.system(cmd_)
        
        inferenceoutputfile_2 = f'{tmpdir}/inference_out.varprior_{varprior_2}.tsv'
        
        cmd_ = f"java -jar /u/home/m/mardren/scratch/SHARPR/SHARPR.jar Infer {inputtablefile} {inferenceoutputfile_2} {varprior_2} {tilelength} {stepsize} {numtilepos}"
        os.system(cmd_)
        
        fileset1 = f"{inferenceoutputfile_1}"
        fileset2 = f"{inferenceoutputfile_2}"
        
        combineoutputfile = f'{tmpdir}/combineoutput.tsv'
        
        cmd_ = f"java -jar /u/home/m/mardren/scratch/SHARPR/SHARPR.jar Combine -c {fileset2} {fileset1} {combineoutputfile}"
        os.system(cmd_)
        
        interpolateinputfile = combineoutputfile
        interpolateoutputfile = f'{tmpdir}/interpolate_out.tsv'
        
        cmd_ = f"java -jar /u/home/m/mardren/scratch/SHARPR/SHARPR.jar Interpolate {interpolateinputfile} {interpolateoutputfile} {stepsize}"
        os.system(cmd_)
        
        df = pd.read_csv(interpolateoutputfile,header=None,sep = '\t',index_col=0)
        df.columns = df.columns.map(int) - 1
        df.index = df.index.rename('region_id')
        
        return df
    
if __name__ == "__main__":
    df = compute_sharpr_from_tiles('/u/home/m/mardren/scratch/SHARPR/sequence_predictions.tsv')
    
    

In [10]:
import pandas as pd
df = pd.read_csv('/u/home/m/mardren/scratch/SHARPR/conv_even.tsv',header=None,index_col=0,sep = '\t')

In [2]:
df 

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,285,286,287,288,289,290,291,292,293,294
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H1hesc_10_11_chr8_41583315,0.226,0.226,0.226,0.253,0.280,0.306,0.333,0.360,0.353,0.346,...,-0.223,-0.211,-0.199,-0.177,-0.155,-0.132,-0.110,-0.088,-0.088,-0.088
H1hesc_10_3_chr8_142524515,-0.009,-0.009,-0.009,-0.006,-0.004,-0.001,0.001,0.004,-0.003,-0.010,...,0.131,0.113,0.095,0.078,0.062,0.045,0.029,0.012,0.012,0.012
H1hesc_12_102_chr8_123689875,0.196,0.196,0.196,0.224,0.252,0.279,0.307,0.335,0.349,0.363,...,-0.060,-0.058,-0.056,-0.059,-0.062,-0.064,-0.067,-0.070,-0.070,-0.070
H1hesc_12_107_chr8_23331975,0.492,0.492,0.492,0.640,0.788,0.935,1.083,1.231,1.299,1.366,...,-0.206,-0.197,-0.188,-0.168,-0.147,-0.127,-0.106,-0.086,-0.086,-0.086
H1hesc_12_108_chr8_27445875,-0.102,-0.102,-0.102,-0.122,-0.143,-0.163,-0.184,-0.204,-0.228,-0.251,...,0.133,0.137,0.141,0.138,0.135,0.131,0.128,0.125,0.125,0.125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K562_9_395_chr18_48395095,-0.048,-0.048,-0.048,-0.047,-0.047,-0.046,-0.046,-0.045,-0.045,-0.045,...,0.045,0.003,-0.039,-0.031,-0.023,-0.014,-0.006,0.002,0.002,0.002
K562_9_66_chr18_68095895,-0.089,-0.089,-0.089,-0.084,-0.080,-0.075,-0.071,-0.066,-0.079,-0.092,...,0.116,0.107,0.098,0.108,0.118,0.127,0.137,0.147,0.147,0.147
K562_9_67_chr18_2222715,-0.081,-0.081,-0.081,-0.083,-0.085,-0.086,-0.088,-0.090,-0.083,-0.075,...,-0.094,-0.084,-0.074,-0.062,-0.050,-0.037,-0.025,-0.013,-0.013,-0.013
K562_9_81_chr18_32905495,0.091,0.091,0.091,0.095,0.099,0.102,0.106,0.110,0.119,0.128,...,-0.215,-0.202,-0.189,-0.164,-0.139,-0.113,-0.088,-0.063,-0.063,-0.063


In [8]:
df = df.reset_index()
df['chrom'] = df['region_id'].str.split('_').str[3]
test_chroms = [f'chr{i}' for i in range(2,23,2)]
df = df[df["chrom"].isin(test_chroms)]

In [10]:
df = df.set_index('region_id',drop=True)

In [3]:
df.to_csv('/u/home/m/mardren/scratch/SHARPR/sharprscores_sequence_predictions.tsv',sep='\t', header=False)

In [25]:
import matplotlib.pyplot as plt
#generate heatmap
start = 1
matrix = df[start:start+295:1]
# matrix = matrix.sort_values(by = [15])
plt.figure(figsize = (200,200))
plt.imshow(matrix)
plt.title("Predicted SHARPR Scores")
plt.xlabel("Tile")
plt.xticks(np.arange(295),matrix.columns)
plt.ylabel("Region")
plt.yticks(np.arange(295),matrix.index)
plt.show()

MemoryError: Unable to allocate 902. MiB for an array with shape (10872, 10872) and data type float64

<Figure size 14400x14400 with 1 Axes>