![title](static/wcar.png)

# Visualising Overexpression Library Hits

In [None]:
# Reload imported modules automatically whenever their source files change
%load_ext autoreload
%autoreload 2

In [None]:
import svist4get as sv4g
import pandas as pd
from IPython.display import Image, display
import os

In [None]:
# Create a svist4get Parameters object at module level
# (init_pa, defined further down, builds its own fresh instance on every call).
pa = sv4g.manager.Parameters()

## Load tracks in bedGraph format

In [None]:
# Experiment folder and FASTQ prefix. The brace-wrapped values look like
# template placeholders -- presumably substituted before the notebook is run.
_EXPERIMENT = '{_EXPERIMENT}'
_FASTQ_HEADER = '{_FASTQ_HEADER}'
# Gene ids of the ten rows of res.csv with the highest 'counts' value,
# ordered from the highest count downwards.
_TOP_10 = list(
    pd.read_csv(os.path.join(_EXPERIMENT,'res.csv'))
    .sort_values('counts',ascending=False)
    .head(10)['gene_id']
)
_TOP_10

In [None]:
# Absolute project folder; the GTF, FASTA and bedGraph paths below are joined onto it.
_ABS_PATH = os.path.join(os.sep,'homes','mtinti','RNAseq','viper-test','rit-seq')
# Relative locations of the genome annotation (GFF / GTF) and sequence (FASTA) files.
# The brace-wrapped parts look like template placeholders -- presumably
# substituted before the notebook is run.
_GFF = os.path.join('genomes','{genome}','{gff}')
_GTF = os.path.join('genomes','{genome}','{gtf}')
_GENOME = os.path.join('genomes','{genome}','{fasta}')


# One entry per experiment: the svist4get config file, the annotation and
# sequence files, and the coverage tracks with the label shown for each track.
exp_dict = {
    _EXPERIMENT: {
        'path_to_config': 'svist4get/svist4get_data/A4_p2.cfg',
        'gtf_file': os.path.join(_ABS_PATH,_GTF),
        'fasta_file': os.path.join(_ABS_PATH,_GENOME),
        'bedgraph_label': ['ALL', 'FF', 'FR', 'RF', 'RR'],
        # same order as the labels above: one bedGraph file per label
        'paths_to_bedgraphs': [
            os.path.join(_ABS_PATH,_EXPERIMENT,'data',_FASTQ_HEADER,_FASTQ_HEADER+suffix)
            for suffix in (
                '_coverage_bg.bed',
                'ff_barcode_coverage_bg.bed',
                'fr_barcode_coverage_bg.bed',
                'rf_barcode_coverage_bg.bed',
                'rr_barcode_coverage_bg.bed',
            )
        ],
    },
}
exp_dict

## Extract interesting region
We extract the gene of interest together with a couple of neighbouring genes on its 5' and 3' sides.

In [None]:
#set up the main params
def init_pa(exp='RNAseqInVivo_dh'):
    """Build a svist4get Parameters object configured for one experiment.

    Parameters
    ----------
    exp : str
        Key of the experiment entry in the module-level ``exp_dict``.

    Returns
    -------
    The Parameters object, initialised from the experiment's config file and
    with the annotation, sequence and bedGraph settings copied from
    ``exp_dict``.

    Raises
    ------
    KeyError
        If ``exp`` (or one of the expected settings) is missing from
        ``exp_dict``.
    """
    params = sv4g.manager.Parameters()
    params.initialize(exp_dict[exp]['path_to_config'])

    # (svist4get config key, key of the exp_dict entry that supplies its value)
    copied_settings = (
        ('gtf_file', 'gtf_file'),
        ('fasta_file', 'fasta_file'),
        ('bedgraph', 'paths_to_bedgraphs'),
        ('bedgraph_label', 'bedgraph_label'),
    )
    for config_key, source_key in copied_settings:
        params.config[config_key] = exp_dict[exp][source_key]

    # track labels are always drawn on the left-hand side
    params.config['bedgraph_label_position'] = 'left'
    return params

#parse the gtf file so that it can be searched
def parse_gtf(pa):
    """Load the GTF file named in ``pa.config['gtf_file']`` as a DataFrame.

    The nine tab-separated columns are named chro, source, ftype, start, end,
    score, strand, frame and info. An extra ``gene`` column holds an id taken
    from the first ``;``-separated attribute of ``info``: its last
    space-separated token, cut at the first ``:`` and with surrounding double
    quotes removed.

    Parameters
    ----------
    pa : object exposing a ``config`` mapping with a ``'gtf_file'`` path.

    Returns
    -------
    pandas.DataFrame
        One row per GTF record.
    """
    def _leading_id(attributes):
        # first attribute -> its last token -> drop any ':suffix' -> unquote
        first_attribute = attributes.partition(';')[0]
        last_token = first_attribute.rpartition(' ')[2]
        return last_token.partition(':')[0].strip('\"')

    column_names = ['chro','source','ftype','start','end','score','strand','frame','info']
    table = pd.read_csv(pa.config['gtf_file'], sep='\t', header=None)
    table.columns = column_names
    table['gene'] = [_leading_id(attributes) for attributes in table['info']]
    return table

#extract the gene of interest +- n genes
#at 5' and 3' of the gene of interest
def get_region(in_gene, pa, extend=2):
    """Return the plotting window around a gene and its neighbours.

    The window covers the gene of interest plus ``extend`` genes on each side.
    One further gene on each side is used only as a boundary: the window runs
    from the end of the gene before the block to the start of the gene after
    it. When the chromosome has no such boundary gene on a side, the window
    instead starts at the start of the first available gene (or ends at the
    end of the last available gene), so the gene of interest is never cut off.

    Parameters
    ----------
    in_gene : str
        Gene id to look up. An exact match on the parsed ``gene`` column is
        preferred; if there is none, the first gene whose id contains
        ``in_gene`` as a literal substring is used.
    pa : object exposing ``config['gtf_file']``
        Passed on to ``parse_gtf``.
    extend : int
        Number of neighbouring genes to include on each side.

    Returns
    -------
    tuple
        ``(chrom, start, end, strand)`` of the window; ``strand`` is the
        strand of the gene of interest.

    Raises
    ------
    IndexError
        If no transcript record matches ``in_gene``.
    """
    gtf = parse_gtf(pa)

    # Filter to transcript records first and only then collapse to one row per
    # (chromosome, gene): de-duplicating first would discard every gene whose
    # first GTF record is not a transcript line.
    transcripts = gtf[gtf['ftype'] == 'transcript']
    transcripts = transcripts.drop_duplicates(subset=['chro', 'gene'])

    # Exact id match first; a regex/substring search would let 'X.1.100'
    # silently select 'X.1.1000'. The literal-substring fallback keeps partial
    # ids working.
    matches = transcripts.index[transcripts['gene'] == in_gene]
    if len(matches) == 0:
        matches = transcripts.index[
            transcripts['gene'].str.contains(in_gene, regex=False)
        ]
    if len(matches) == 0:
        raise IndexError(
            'gene %r has no transcript record in %s'
            % (in_gene, pa.config['gtf_file'])
        )
    selection = matches[0]

    chrom = transcripts.loc[selection, 'chro']
    strand = transcripts.loc[selection, 'strand']

    # genes of the same chromosome, ordered by start coordinate
    on_chrom = transcripts[transcripts['chro'] == chrom].sort_values('start')
    new_pos = on_chrom.index.get_loc(selection)
    last_pos = len(on_chrom) - 1

    # extend genes on each side plus one boundary gene on each side
    from_index = new_pos - extend - 1
    to_index = new_pos + extend + 1

    # True when the chromosome runs out before a boundary gene is reached
    no_gene_before = from_index < 0
    no_gene_after = to_index > last_pos

    window = on_chrom.iloc[max(from_index, 0):min(to_index, last_pos) + 1]
    gene_starts = window['start'].values
    gene_ends = window['end'].values

    # Normal case: start at the end of the boundary gene on the left and stop
    # at the start of the boundary gene on the right. At a chromosome edge
    # there is no boundary gene, so keep the outermost gene in full.
    start = gene_starts[0] if no_gene_before else gene_ends[0]
    end = gene_ends[-1] if no_gene_after else gene_starts[-1]
    return chrom, start, end, strand

#set up the gene-dependent parameters
def add_gene(gene_id, desc,  pa, extend=2):
    """Point a Parameters object at one gene and its neighbourhood.

    Parameters
    ----------
    gene_id : str
        Gene to plot; also used for the image title and the output file name.
    desc : str
        Description appended to the gene id in the image title.
    pa : svist4get Parameters object
        Updated in place and returned.
    extend : int
        Number of neighbouring genes on each side, passed to ``get_region``.

    Returns
    -------
    The same ``pa`` object with ``window``, ``image_title``,
    ``output_filename`` and ``revcomp_transform`` set.
    """
    # use the gene_id argument, not whatever module-level name happens to exist
    chrom, start, end, strand = get_region(gene_id, pa, extend=extend)
    print(chrom, start, end, strand)
    pa.config['window'] = [chrom, start, end]
    pa.config['image_title'] = gene_id+' '+desc
    # the image is written inside the experiment folder
    pa.config['output_filename'] = os.path.join(_EXPERIMENT, gene_id)
    #determine the orientation of the plot:
    #anything other than the '+' strand is drawn reverse-complemented
    pa.config['revcomp_transform'] = 0 if strand == '+' else 1
    return pa
 
# make the figure
def make_image(pa):
    """Draw the figure described by ``pa`` and show it in the notebook.

    Reads the transcripts inside ``pa.config['window']`` from the GTF file,
    builds the title, axis, grid, amino-acid, transcript-structure and
    bedGraph tracks, draws the image, converts the resulting PDF to PNG and
    displays ``<output_filename>.png`` inline.
    """
    gtf_helper = sv4g.data_processing.Gtf_helper(pa.config['gtf_file'])
    # method name spelled exactly as in the original call -- presumably the
    # spelling used by the svist4get API; verify before "correcting" it
    window_transcripts = gtf_helper.extract_transcripts_from_widnow(*pa.config['window'])
    pa.add_gtf_data(gtf_helper.extract_data_about_transcripts(window_transcripts))

    # track makers, in the order their tracks are added to the figure
    maker_names = (
        'Title_tracks_maker',
        'Axis_tics_tracks_maker',
        'Vgrid_tracks_maker',
        'Aa_seq_tracks_maker',
        'Transcript_struct_tracks_maker',
        'Bedgraph_tracks_maker',
    )
    tracks = []
    for maker_name in maker_names:
        maker_class = getattr(sv4g.manager, maker_name)
        tracks += maker_class(pa).create_tracks()

    sv4g.manager.Image(tracks, pa).draw()
    # converting the resulting pdf to a png file
    sv4g.methods.pdf_page_to_png(pa)
    png_path = os.path.join(pa.config['output_filename']+'.png')
    display(Image(filename=png_path))



## Description Dict

In [None]:
#create a dictionary of gene to desc
#from the gff file
def make_desc(gff_path=None):
    """Map every gene ID in a GFF file to its description.

    Only records whose third column is ``gene`` are used. The attribute column
    (the last one) is split on ``;`` into ``key=value`` pairs after replacing
    the URL-encoded comma ``%2C`` with a space.

    Parameters
    ----------
    gff_path : str, optional
        GFF file to read. Defaults to the module-level ``_GFF``.

    Returns
    -------
    dict
        ``ID`` -> ``description``. Genes without a ``description`` attribute
        map to an empty string; records without an ``ID`` are skipped.
    """
    if gff_path is None:
        gff_path = _GFF
    gff = pd.read_csv(gff_path, sep='\t', header=None, comment='#')
    gene_rows = gff[gff.iloc[:, 2] == 'gene']

    desc = {}
    for attributes in gene_rows.iloc[:, -1]:
        if not isinstance(attributes, str):
            # missing attribute column (read as NaN): nothing to parse
            continue
        fields = {}
        for item in attributes.replace('%2C', ' ').split(';'):
            # partition keeps any '=' inside the value and lets us skip
            # fragments without '=' (e.g. the empty one after a trailing ';')
            key, separator, value = item.partition('=')
            if not separator:
                continue
            fields[key.strip()] = value.strip()
        gene_id = fields.get('ID')
        if gene_id is None:
            continue
        # a gene without description must not abort the whole lookup table
        desc[gene_id] = fields.get('description', '')
    return desc

# Build the gene-id -> description lookup once; later cells index into it.
desc_dict = make_desc()

In [None]:
# Quick check: show the first (highest-count) gene together with its description.
gene = _TOP_10[0]
gene, desc_dict[gene]

## Plots

In [None]:
# Plot every top hit. A failure on one gene is reported together with its
# cause and the loop carries on with the next gene.
for gene in _TOP_10:
    # a gene without an annotated description is still plotted, with an empty one
    description = desc_dict.get(gene, '')
    print(gene,description)
    try:
        desc = ' '+description#' Succinate dehydrogenase'
        pa = init_pa(exp=_EXPERIMENT)
        pa = add_gene(gene, desc, pa, extend=2)
        make_image(pa)
    except Exception as err:
        # Exception (not a bare except) so that Ctrl-C / kernel interrupt still stops the run
        print(gene,'--------- ERROR ----------', repr(err))
        