In [None]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
def get_attributes(att=''):
    """Parse a GFF attribute column (``key=value;key=value``) into a dict.

    Parameters
    ----------
    att : str
        Content of the ninth GFF column, e.g. ``'ID=gene1;Name=foo'``.

    Returns
    -------
    dict
        Mapping of attribute key to attribute value (both strings). Empty
        fields (empty input, trailing ``;``) are skipped. A field without
        ``=`` is stored with an empty-string value.
    """
    attributes = {}
    for field in att.split(';'):
        if not field.strip():
            # '' (the default), 'a=b;' and 'a=b;;c=d' all produce empty fields
            continue
        # partition splits on the first '=' only, so a value that itself
        # contains '=' is kept whole instead of being truncated
        key, _, value = field.partition('=')
        attributes[key] = value
    return attributes

# polycistrons

In [None]:
infile = os.path.join('TriTrypDB-51_TbruceiTREU927.gff')
# Read the GFF file; lines starting with '#' are skipped.
gff = pd.read_table(infile, header=None, comment='#',
                    names=['seqname', 'source',
                           'feature', 'start',
                           'end', 'score',
                           'strand', 'frame', 'attribute'])
# Keep only gene features. Copy so the columns added below are written to an
# independent frame rather than to a slice of the raw table.
gff = gff[gff.feature == 'gene'].copy()
# Extract the gene identifier from the attribute column.
gff['gene_id'] = [get_attributes(n)['ID'] for n in gff.attribute]
# Drop the columns that are not needed and sort by chromosome, then position;
# everything below relies on this order.
gff = (gff.drop(columns=['attribute', 'score', 'frame', 'source', 'feature'])
          .sort_values(['seqname', 'start']))

# Strand of the previous gene on the same chromosome. The first gene of each
# chromosome has no predecessor, so it is compared with its own strand.
gff['strand_shift'] = (gff.groupby('seqname')['strand'].shift(1)
                          .fillna(gff['strand']))
# NB: 'change' is True when the strand is the SAME as for the previous gene.
gff['change'] = gff['strand_shift'] == gff['strand']

# Polycistron id: a running counter that goes up by one at every strand switch
# and by one more each time a new chromosome starts, so ids never repeat
# across chromosomes. The first chromosome starts at 0.
chrom_rank = gff.groupby('seqname', sort=False).ngroup()
gff['polycistron_id'] = (~gff['change']).cumsum() + chrom_rank

gff.head()

HBox(children=(FloatProgress(value=0.0, max=111.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=111.0), HTML(value='')))




Unnamed: 0,seqname,start,end,strand,gene_id,strand_shift,change,polycistron_id
42748,11L3_v3,23858,29262,-,Tb07.11L3.90,-,True,0
42946,11L3_v3,30224,48500,-,Tb07.11L3.100,-,True,0
16895,5K5_v5.1,3,97,-,Tb05.5K5.10,-,True,1
5795,5K5_v5.1,150,2405,-,Tb05.5K5.20,-,True,1
34797,5K5_v5.1,3300,7701,-,Tb05.5K5.30,-,True,1


In [None]:
# Names of the eleven Tb927_01 .. Tb927_11 sequences the analysis is restricted to.
main_chro = {
    'Tb927_01_v5.1', 'Tb927_02_v5.1',
    'Tb927_03_v5.1', 'Tb927_04_v5.1',
    'Tb927_05_v5.1', 'Tb927_06_v5.1',
    'Tb927_07_v5.1', 'Tb927_08_v5.1',
    'Tb927_09_v5.1', 'Tb927_10_v5.1',
    'Tb927_11_v5.1',
}
# Copy: later cells overwrite start/end values of this frame in place, which
# must not be done on a slice of the unfiltered table.
gff = gff[gff['seqname'].isin(main_chro)].copy()

In [None]:
# Smallest gene start coordinate found on each chromosome.
gff.groupby('seqname')['start'].agg('min')

seqname
Tb927_01_v5.1     381
Tb927_02_v5.1      10
Tb927_03_v5.1      27
Tb927_04_v5.1    1439
Tb927_05_v5.1     965
Tb927_06_v5.1       1
Tb927_07_v5.1       2
Tb927_08_v5.1       2
Tb927_09_v5.1    4758
Tb927_10_v5.1    1486
Tb927_11_v5.1    2911
Name: start, dtype: int64

In [None]:
# Set the smallest gene start on every chromosome to 1
# (the value written is 1, not 0).
for chro in main_chro:
    chro_starts = gff.loc[gff['seqname'] == chro, 'start']
    gff.loc[[chro_starts.idxmin()], 'start'] = 1

In [None]:
# Collect chromosome lengths from the '#' header lines at the top of the GFF.
# For each header line the second whitespace-separated token is taken as the
# sequence name and the last token as its length; only main chromosomes are kept.
max_dict = {}
with open(infile) as gff_handle:  # context manager: the handle is closed afterwards
    for line in gff_handle:
        if not line.startswith('#'):
            # first non-header line: stop scanning
            break
        tokens = line.split()
        # header lines with a single token carry no sequence name
        if len(tokens) >= 2 and tokens[1] in main_chro:
            max_dict[tokens[1]] = int(tokens[-1])
max_dict

{'Tb927_01_v5.1': 1064672,
 'Tb927_02_v5.1': 1193948,
 'Tb927_03_v5.1': 1653225,
 'Tb927_04_v5.1': 1590432,
 'Tb927_05_v5.1': 1802303,
 'Tb927_06_v5.1': 1618915,
 'Tb927_07_v5.1': 2205233,
 'Tb927_08_v5.1': 2481190,
 'Tb927_09_v5.1': 3542885,
 'Tb927_10_v5.1': 4144375,
 'Tb927_11_v5.1': 5223313}

In [None]:
# Stretch the largest gene end on every chromosome to the chromosome length
# read from the GFF header (max_dict).
for chro in main_chro:
    chro_ends = gff.loc[gff['seqname'] == chro, 'end']
    gff.loc[[chro_ends.idxmax()], 'end'] = max_dict[chro]

In [None]:
# Largest gene end coordinate on each chromosome.
gff.groupby('seqname')['end'].agg('max')

seqname
Tb927_01_v5.1    1064672
Tb927_02_v5.1    1193948
Tb927_03_v5.1    1653225
Tb927_04_v5.1    1590432
Tb927_05_v5.1    1802303
Tb927_06_v5.1    1618915
Tb927_07_v5.1    2205233
Tb927_08_v5.1    2481190
Tb927_09_v5.1    3542885
Tb927_10_v5.1    4144375
Tb927_11_v5.1    5223313
Name: end, dtype: int64

In [None]:
# Collapse genes into polycistrons: one row per (seqname, polycistron_id, strand)
# running from the smallest gene start to the largest gene end of the group.
polycistrons = (
    gff.groupby(['seqname', 'polycistron_id', 'strand'])
       .agg({'start': 'min', 'end': 'max'})
       .reset_index()
)

In [None]:
# Display the polycistron table (one row per seqname / polycistron_id / strand).
polycistrons

Unnamed: 0,seqname,polycistron_id,strand,start,end
0,Tb927_01_v5.1,20,-,1,1983
1,Tb927_01_v5.1,21,+,6871,7203
2,Tb927_01_v5.1,22,-,7554,8566
3,Tb927_01_v5.1,23,+,11158,11865
4,Tb927_01_v5.1,24,-,12407,59047
...,...,...,...,...,...
358,Tb927_11_v5.1,607,+,5059560,5074772
359,Tb927_11_v5.1,608,-,5082133,5093502
360,Tb927_11_v5.1,609,+,5089938,5130296
361,Tb927_11_v5.1,610,-,5130638,5133342


In [None]:
# Make the polycistrons contiguous: each one ends one base before the next
# polycistron on the same chromosome starts; the last one on a chromosome
# ends at the chromosome length taken from max_dict.
list_df = []
# sorted(): main_chro is a set, so its iteration order is arbitrary. The row
# order built here determines the polycistron numbering assigned afterwards,
# so it has to be reproducible between runs.
for chro in sorted(main_chro):
    # copy: a column is overwritten below, which must not happen on a slice
    temp_df = polycistrons[polycistrons['seqname'] == chro].copy()
    next_start = temp_df['start'].shift(-1)
    # only the last row has no successor (NaN); fill just that column
    temp_df['end'] = (next_start - 1).fillna(max_dict[chro])
    list_df.append(temp_df)

polycistrons = pd.concat(list_df)
polycistrons.head()


Unnamed: 0,seqname,polycistron_id,strand,start,end
25,Tb927_02_v5.1,45,-,1,9689.0
26,Tb927_02_v5.1,46,+,9690,23834.0
27,Tb927_02_v5.1,47,-,23835,118322.0
28,Tb927_02_v5.1,48,+,118323,140724.0
29,Tb927_02_v5.1,49,-,140725,151836.0


In [None]:
# Cast 'end' to integers and renumber the polycistrons 1..N in their
# current row order.
polycistrons = polycistrons.assign(
    end=polycistrons['end'].astype(int),
    polycistron_id=range(1, len(polycistrons) + 1),
)
polycistrons.head()

Unnamed: 0,seqname,polycistron_id,strand,start,end
25,Tb927_02_v5.1,1,-,1,9689
26,Tb927_02_v5.1,2,+,9690,23834
27,Tb927_02_v5.1,3,-,23835,118322
28,Tb927_02_v5.1,4,+,118323,140724
29,Tb927_02_v5.1,5,-,140725,151836


In [None]:
# format_circos: shift both coordinates down by one.
# NB: re-running this cell shifts the coordinates again.
polycistrons[['start', 'end']] = polycistrons[['start', 'end']] - 1
polycistrons.head()

Unnamed: 0,seqname,polycistron_id,strand,start,end
25,Tb927_02_v5.1,1,-,0,9688
26,Tb927_02_v5.1,2,+,9689,23833
27,Tb927_02_v5.1,3,-,23834,118321
28,Tb927_02_v5.1,4,+,118322,140723
29,Tb927_02_v5.1,5,-,140724,151835


# karyotype BANDS

In [None]:
# One karyotype band per polycistron: black for '-' strand, white otherwise.
# Target layout (Circos karyotype band lines), e.g.
#band hs1 p36.33 p36.33 0 2300000 gneg
#band hs1 p36.32 p36.32 2300000 5300000 gpos25
band_table = pd.DataFrame(
    {
        'band': 'band',
        'chro': polycistrons['seqname'],
        'band_id': polycistrons['seqname'] + '_' + 'band',
        'start': polycistrons['start'],
        'end': polycistrons['end'],
        'color': np.where(polycistrons['strand'] == '-', 'black', 'white'),
    },
    index=polycistrons.index,
)
# band_id is written twice on purpose: the band line carries two name fields.
circos_bands = band_table[['band', 'chro', 'band_id', 'band_id', 'start', 'end', 'color']]
circos_bands.to_csv('karyotype.tryp_2.txt', index=False, header=False, sep=' ')
circos_bands.head()

Unnamed: 0,band,chro,band_id,band_id.1,start,end,color
25,band,Tb927_02_v5.1,Tb927_02_v5.1_band,Tb927_02_v5.1_band,0,9688,black
26,band,Tb927_02_v5.1,Tb927_02_v5.1_band,Tb927_02_v5.1_band,9689,23833,white
27,band,Tb927_02_v5.1,Tb927_02_v5.1_band,Tb927_02_v5.1_band,23834,118321,black
28,band,Tb927_02_v5.1,Tb927_02_v5.1_band,Tb927_02_v5.1_band,118322,140723,white
29,band,Tb927_02_v5.1,Tb927_02_v5.1_band,Tb927_02_v5.1_band,140724,151835,black



# karyotype IDEOGRAM

In [None]:
# One karyotype 'chr' line per chromosome in max_dict:
#   chr - <name> <label> 0 <length-1> grey
# The label is the number in the chromosome name without leading zeros
# ('Tb927_01_v5.1' -> '1').
circos_chro = pd.DataFrame({
    'type': 'chr',
    'tag': '-',
    'chro': list(max_dict),
    'id': [name.split('_')[1].lstrip('0') for name in max_dict],
    'start': 0,
    'end': [length - 1 for length in max_dict.values()],
    'color': 'grey',
})

circos_chro.to_csv('karyotype.tryp_1.txt', index=False, header=False, sep=' ')

In [None]:
# Build the final karyotype file: chromosome lines first, then band lines.
# Every input is opened in its own 'with' block so no file handle is left open.
with open('karyotype.tryp.txt', 'w') as out:
    for part in ('karyotype.tryp_1.txt', 'karyotype.tryp_2.txt'):
        with open(part) as src:
            out.write(src.read())

# Sense

In [None]:
# Scatter track for final_df_B: log10(sum of the BSF_03-05 count columns + 1)
# per peak, restricted to peaks with a '+'/'-' orientation on the main chromosomes.
plus_cols = [
    'BSF_03_F', 'BSF_03_RR',
    'BSF_04_F', 'BSF_04_RR',
    'BSF_05_F', 'BSF_05_RR',
]

# NOTE(review): minus_cols is defined but never used in this cell. 'sum' is
# computed from plus_cols for every peak, including '-' oriented ones.
# Confirm whether '-' peaks were meant to use minus_cols.
minus_cols = [
    'BSF_03_R', 'BSF_03_FR',
    'BSF_04_R', 'BSF_04_FR',
    'BSF_05_R', 'BSF_05_FR',
]

bla_df = pd.read_csv('../final_df_B.csv')
print(bla_df.shape)
bla_df['sum'] = bla_df[plus_cols].sum(axis=1)
temp_plus = bla_df[bla_df['orientation'] == '+']
temp_minus = bla_df[bla_df['orientation'] == '-']

scatter_df = pd.concat([temp_plus, temp_minus])
# copy: the 'sum' column is overwritten below, which must not happen on a slice
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)].copy()
# NOTE(review): a "get_region != 'none'" filter used to be evaluated here, but
# its result was discarded, so it never removed a row. The dead statement is
# dropped; re-add it with an assignment if the filter was intended.
scatter_df['sum'] = np.log10(scatter_df['sum'] + 1)
scatter_df[['Chr', 'Start', 'End', 'sum']].to_csv('bla_scatter.txt', index=False, header=False, sep=' ')

(4978, 52)


In [None]:
# Summary statistics of the log10-transformed values written to the scatter track.
scatter_df['sum'].describe()

count    3541.000000
mean        1.455910
std         0.941091
min         0.000000
25%         0.698970
50%         1.477121
75%         2.187521
max         3.852785
Name: sum, dtype: float64

In [None]:
# Scatter track for final_df_G: log10(sum of the BSF_06-08 count columns + 1)
# per peak, restricted to peaks with a '+'/'-' orientation on the main chromosomes.
plus_cols = [
    'BSF_06_F', 'BSF_06_RR',
    'BSF_07_F', 'BSF_07_RR',
    'BSF_08_F', 'BSF_08_RR',
]

# NOTE(review): minus_cols is defined but never used in this cell. 'sum' is
# computed from plus_cols for every peak, including '-' oriented ones.
# Confirm whether '-' peaks were meant to use minus_cols.
minus_cols = [
    'BSF_06_R', 'BSF_06_FR',
    'BSF_07_R', 'BSF_07_FR',
    'BSF_08_R', 'BSF_08_FR',
]

bla_df = pd.read_csv('../final_df_G.csv')
print(bla_df.shape)
bla_df['sum'] = bla_df[plus_cols].sum(axis=1)
temp_plus = bla_df[bla_df['orientation'] == '+']
temp_minus = bla_df[bla_df['orientation'] == '-']

scatter_df = pd.concat([temp_plus, temp_minus])
# copy: the 'sum' column is overwritten below, which must not happen on a slice
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)].copy()
scatter_df['sum'] = np.log10(scatter_df['sum'] + 1)
# NOTE(review): a "get_region != 'none'" filter used to be evaluated here, but
# its result was discarded, so it never removed a row. The dead statement is
# dropped; re-add it with an assignment if the filter was intended.
scatter_df[['Chr', 'Start', 'End', 'sum']].to_csv('gla_scatter.txt', index=False, header=False, sep=' ')
scatter_df['sum'].describe()

(8272, 52)


count    5984.000000
mean        1.706137
std         0.804876
min         0.000000
25%         1.197113
50%         1.799341
75%         2.296665
max         3.916138
Name: sum, dtype: float64

## bla_scatter_cds

In [None]:

# Sense track, final_df_B, peaks listed in final_df_B_only_cds.bed:
# log2( (sum of the BSF_03-05 strand columns / 3) / (sum of BSF_01 control columns + 1) ),
# keeping only peaks with a positive value.
plus_cols = [
    'BSF_03_F', 'BSF_03_RR',
    'BSF_04_F', 'BSF_04_RR',
    'BSF_05_F', 'BSF_05_RR',
]
control_plus = ['BSF_01_F', 'BSF_01_RR']

minus_cols = [
    'BSF_03_R', 'BSF_03_FR',
    'BSF_04_R', 'BSF_04_FR',
    'BSF_05_R', 'BSF_05_FR',
]
# NOTE(review): control_minus lists the same columns as control_plus. Going by
# the naming of minus_cols it presumably should be BSF_01_R / BSF_01_FR - confirm.
control_minus = ['BSF_01_F', 'BSF_01_RR']

bla_df = pd.read_csv('../final_df_B.csv')
# The last BED column (used as index) holds the Geneid of each retained peak.
temp_df = pd.read_csv('final_df_B_only_cds.bed', sep='\t', header=None, index_col=-1)

print(1, bla_df.shape)
bla_df = bla_df[bla_df['Geneid'].isin(temp_df.index.values)]
print(2, bla_df.shape)

# '+' oriented peaks use the plus-strand columns.
# copy: 'fc' is added below, which must not be done on a slice of bla_df
temp_plus = bla_df[bla_df['orientation'] == '+'].copy()
temp_plus['fc'] = np.log2(
    (temp_plus[plus_cols].sum(axis=1) / 3)
    / (temp_plus[control_plus].sum(axis=1) + 1)
)
temp_plus = temp_plus[temp_plus['fc'] > 0]

# '-' oriented peaks use the minus-strand columns.
temp_minus = bla_df[bla_df['orientation'] == '-'].copy()
temp_minus['fc'] = np.log2(
    (temp_minus[minus_cols].sum(axis=1) / 3)
    / (temp_minus[control_minus].sum(axis=1) + 1)
)
temp_minus = temp_minus[temp_minus['fc'] > 0]

scatter_df = pd.concat([temp_plus, temp_minus])
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)]
# NOTE(review): a "get_region != 'none'" filter used to be evaluated here, but
# its result was discarded, so it never removed a row. The dead statement is dropped.

# NOTE(review): fc > 0 has already removed -inf / NaN fold changes, so the rows
# dropped here are those with a missing value in any other column. Confirm this
# is intended; otherwise use dropna(subset=['fc']).
scatter_df = scatter_df.replace(-np.inf, np.nan).dropna()

scatter_df[['Chr', 'Start', 'End', 'fc']].to_csv('bla_scatter_cds.txt', index=False, header=False, sep=' ')
scatter_df['fc'].describe()

1 (4978, 52)
(411, 52)
2 (411, 52)


count    80.000000
mean      1.748196
std       1.378365
min       0.032421
25%       0.614277
50%       1.289507
75%       2.790945
max       5.415037
Name: fc, dtype: float64

In [None]:
# Quick look at the first two plus-strand column names.
plus_cols[:2]

['BSF_03_F', 'BSF_03_RR']

## bla_scatter_utr

In [None]:

# Sense track, final_df_B, peaks NOT listed in final_df_B_only_cds.bed:
# log2( (sum of the BSF_03-05 strand columns / 3) / (sum of BSF_01 control columns + 1) ),
# keeping only peaks with a positive value.
plus_cols = [
    'BSF_03_F', 'BSF_03_RR',
    'BSF_04_F', 'BSF_04_RR',
    'BSF_05_F', 'BSF_05_RR',
]
control_plus = ['BSF_01_F', 'BSF_01_RR']

minus_cols = [
    'BSF_03_R', 'BSF_03_FR',
    'BSF_04_R', 'BSF_04_FR',
    'BSF_05_R', 'BSF_05_FR',
]
# NOTE(review): control_minus lists the same columns as control_plus. Going by
# the naming of minus_cols it presumably should be BSF_01_R / BSF_01_FR - confirm.
control_minus = ['BSF_01_F', 'BSF_01_RR']

bla_df = pd.read_csv('../final_df_B.csv')
# The last BED column (used as index) holds the Geneid of each CDS peak.
temp_df = pd.read_csv('final_df_B_only_cds.bed', sep='\t', header=None, index_col=-1)

print(1, bla_df.shape)
# '~' keeps the complement: every peak that is not in the CDS-only list
bla_df = bla_df[~bla_df['Geneid'].isin(temp_df.index.values)]
print(1, bla_df.shape)

# '+' oriented peaks use the plus-strand columns.
# copy: 'fc' is added below, which must not be done on a slice of bla_df
temp_plus = bla_df[bla_df['orientation'] == '+'].copy()
temp_plus['fc'] = np.log2(
    (temp_plus[plus_cols].sum(axis=1) / 3)
    / (temp_plus[control_plus].sum(axis=1) + 1)
)
temp_plus = temp_plus[temp_plus['fc'] > 0]

# '-' oriented peaks use the minus-strand columns.
temp_minus = bla_df[bla_df['orientation'] == '-'].copy()
temp_minus['fc'] = np.log2(
    (temp_minus[minus_cols].sum(axis=1) / 3)
    / (temp_minus[control_minus].sum(axis=1) + 1)
)
temp_minus = temp_minus[temp_minus['fc'] > 0]

scatter_df = pd.concat([temp_plus, temp_minus])
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)]
# NOTE(review): a "get_region != 'none'" filter used to be evaluated here, but
# its result was discarded, so it never removed a row. The dead statement is dropped.

# NOTE(review): fc > 0 has already removed -inf / NaN fold changes, so the rows
# dropped here are those with a missing value in any other column. Confirm this
# is intended; otherwise use dropna(subset=['fc']).
scatter_df = scatter_df.replace(-np.inf, np.nan).dropna()

scatter_df[['Chr', 'Start', 'End', 'fc']].to_csv('bla_scatter_utr.txt', index=False, header=False, sep=' ')
scatter_df['fc'].describe()

1 (4978, 52)
1 (4567, 52)


count    1827.000000
mean        1.982071
std         1.447791
min         0.006232
25%         0.827699
50%         1.689518
75%         2.807355
max         8.196397
Name: fc, dtype: float64

## gla_scatter_utr

In [None]:


# Sense track, final_df_G, peaks NOT listed in final_df_G_only_cds.bed:
# log2( (sum of the BSF_06-08 strand columns / 3) / (sum of BSF_01 control columns + 1) ),
# keeping only peaks with a positive value.
control_plus = ['BSF_01_F', 'BSF_01_RR']

plus_cols = [
    'BSF_06_F', 'BSF_06_RR',
    'BSF_07_F', 'BSF_07_RR',
    'BSF_08_F', 'BSF_08_RR',
]

minus_cols = [
    'BSF_06_R', 'BSF_06_FR',
    'BSF_07_R', 'BSF_07_FR',
    'BSF_08_R', 'BSF_08_FR',
]
# NOTE(review): control_minus lists the same columns as control_plus. Going by
# the naming of minus_cols it presumably should be BSF_01_R / BSF_01_FR - confirm.
control_minus = ['BSF_01_F', 'BSF_01_RR']

bla_df = pd.read_csv('../final_df_G.csv')
# The last BED column (used as index) holds the Geneid of each CDS peak.
temp_df = pd.read_csv('final_df_G_only_cds.bed', sep='\t', header=None, index_col=-1)

print(1, bla_df.shape)
# '~' keeps the complement: every peak that is not in the CDS-only list
bla_df = bla_df[~bla_df['Geneid'].isin(temp_df.index.values)]
print(1, bla_df.shape)

# '+' oriented peaks use the plus-strand columns.
# copy: 'fc' is added below, which must not be done on a slice of bla_df
temp_plus = bla_df[bla_df['orientation'] == '+'].copy()
temp_plus['fc'] = np.log2(
    (temp_plus[plus_cols].sum(axis=1) / 3)
    / (temp_plus[control_plus].sum(axis=1) + 1)
)
temp_plus = temp_plus[temp_plus['fc'] > 0]

# '-' oriented peaks use the minus-strand columns.
temp_minus = bla_df[bla_df['orientation'] == '-'].copy()
temp_minus['fc'] = np.log2(
    (temp_minus[minus_cols].sum(axis=1) / 3)
    / (temp_minus[control_minus].sum(axis=1) + 1)
)
temp_minus = temp_minus[temp_minus['fc'] > 0]

scatter_df = pd.concat([temp_plus, temp_minus])
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)]

print(scatter_df.shape)
# NOTE(review): fc > 0 has already removed -inf / NaN fold changes, so the rows
# dropped here are those with a missing value in any other column. Confirm this
# is intended; otherwise use dropna(subset=['fc']).
scatter_df = scatter_df.replace(-np.inf, np.nan).dropna()

scatter_df[['Chr', 'Start', 'End', 'fc']].to_csv('gla_scatter_utr.txt', index=False, header=False, sep=' ')
scatter_df['fc'].describe()

1 (8272, 52)
1 (6331, 52)
(1992, 53)


count    1915.000000
mean        1.326739
std         1.120133
min         0.004140
25%         0.530515
50%         1.056487
75%         1.807355
max         9.046215
Name: fc, dtype: float64

## gla_scatter_cds

In [None]:


# Sense track, final_df_G, peaks listed in final_df_G_only_cds.bed:
# log2( (sum of the BSF_06-08 strand columns / 3) / (sum of BSF_01 control columns + 1) ),
# keeping only peaks with a positive value.
control_plus = ['BSF_01_F', 'BSF_01_RR']

plus_cols = [
    'BSF_06_F', 'BSF_06_RR',
    'BSF_07_F', 'BSF_07_RR',
    'BSF_08_F', 'BSF_08_RR',
]

minus_cols = [
    'BSF_06_R', 'BSF_06_FR',
    'BSF_07_R', 'BSF_07_FR',
    'BSF_08_R', 'BSF_08_FR',
]
# NOTE(review): control_minus lists the same columns as control_plus. Going by
# the naming of minus_cols it presumably should be BSF_01_R / BSF_01_FR - confirm.
control_minus = ['BSF_01_F', 'BSF_01_RR']

bla_df = pd.read_csv('../final_df_G.csv')
# The last BED column (used as index) holds the Geneid of each retained peak.
temp_df = pd.read_csv('final_df_G_only_cds.bed', sep='\t', header=None, index_col=-1)

print(1, bla_df.shape)
bla_df = bla_df[bla_df['Geneid'].isin(temp_df.index.values)]
print(2, bla_df.shape)

# '+' oriented peaks use the plus-strand columns.
# copy: 'fc' is added below, which must not be done on a slice of bla_df
temp_plus = bla_df[bla_df['orientation'] == '+'].copy()
temp_plus['fc'] = np.log2(
    (temp_plus[plus_cols].sum(axis=1) / 3)
    / (temp_plus[control_plus].sum(axis=1) + 1)
)
temp_plus = temp_plus[temp_plus['fc'] > 0]

# '-' oriented peaks use the minus-strand columns.
temp_minus = bla_df[bla_df['orientation'] == '-'].copy()
temp_minus['fc'] = np.log2(
    (temp_minus[minus_cols].sum(axis=1) / 3)
    / (temp_minus[control_minus].sum(axis=1) + 1)
)
temp_minus = temp_minus[temp_minus['fc'] > 0]

scatter_df = pd.concat([temp_plus, temp_minus])
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)]

print(scatter_df.shape)
# NOTE(review): fc > 0 has already removed -inf / NaN fold changes, so the rows
# dropped here are those with a missing value in any other column. Confirm this
# is intended; otherwise use dropna(subset=['fc']).
scatter_df = scatter_df.replace(-np.inf, np.nan).dropna()

scatter_df[['Chr', 'Start', 'End', 'fc']].to_csv('gla_scatter_cds.txt', index=False, header=False, sep=' ')
scatter_df['fc'].describe()

1 (8272, 52)
2 (1941, 52)
(982, 53)


count    918.000000
mean       1.572313
std        1.326708
min        0.005714
25%        0.615846
50%        1.246714
75%        2.064503
max        8.921841
Name: fc, dtype: float64

In [None]:
# Export every CDS feature of the annotation (seqname, start, end) as a
# tab-separated file for the bedops step.
infile = os.path.join('TriTrypDB-51_TbruceiTREU927.gff')
gff = pd.read_table(
    infile, header=None, comment='#',
    names=['seqname', 'source', 'feature', 'start', 'end',
           'score', 'strand', 'frame', 'attribute'],
)
gff = gff.loc[gff['feature'] == 'CDS']
gff.to_csv('cds_927.bed', columns=['seqname', 'start', 'end'],
           sep='\t', index=False, header=False)

In [None]:
# Write the final_df_B peaks as a 4-column tab-separated file (Chr, Start, End, Geneid).
bla_df = pd.read_csv('../final_df_B.csv')
bla_df.to_csv('final_df_B.bed', columns=['Chr', 'Start', 'End', 'Geneid'],
              sep='\t', index=False, header=False)

In [None]:
# Write the final_df_G peaks as a 4-column tab-separated file (Chr, Start, End, Geneid).
bla_df = pd.read_csv('../final_df_G.csv')
bla_df.to_csv('final_df_G.bed', columns=['Chr', 'Start', 'End', 'Geneid'],
              sep='\t', index=False, header=False)

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,BSF_01,BSF_06,BSF_07,BSF_08,...,BSF_08_enrich,sum_reads,belongs_to,is_main_chr,desc,CF_median,enrichment_median,get_region,ks_overlap,orient_change
0,NA_peak_20,5K5_v5.1,38453,39045,.,593,969,1312,1048,1622,...,0.592398,3217.0,Tb05.5K5.170,False,elongation of very long chain fatty acids prot...,0.836066,0.550145,5K5_v5.1:38453..39045:f,none,True
1,NA_peak_21,5K5_v5.1,41304,42911,.,1608,2840,9890,8518,9592,...,0.444142,633.0,Tb05.5K5.170,False,elongation of very long chain fatty acids prot...,0.017007,0.284211,5K5_v5.1:41304..42034:f,none,True
2,NA_peak_22,5K5_v5.1,46294,47615,.,1322,928,2218,1336,1558,...,0.597737,3842.0,Tb05.5K5.180,False,"receptor-type adenylate cyclase GRESAG 4, puta...",0.783333,0.643255,5K5_v5.1:47179..47615:f,none,True
3,NA_peak_23,5K5_v5.1,48944,49452,.,509,758,1822,1667,1880,...,0.089494,157.0,Tb05.5K5.190,False,"receptor-type adenylate cyclase GRESAG 4, puta...",0.01227,0.089494,none,none,False
4,NA_peak_24,5K5_v5.1,52835,53476,.,642,449,1065,609,705,...,0.658038,1670.0,Tb05.5K5.190,False,"receptor-type adenylate cyclase GRESAG 4, puta...",0.7,0.658038,none,none,False


## CODE TO RUN WITH CIRCOS ENV

In [None]:
# Split the peak BED files against the CDS intervals in cds_927.bed:
#   --difference       -> final_df_B_no_cds.bed
#   --element-of 100%  -> final_df_*_only_cds.bed
# NOTE(review): the stored output shows this failed ("bedops: command not found"),
# so it must be run in the environment that provides bedops.
# NOTE(review): these calls use the unsorted .bed files. bedops expects
# sort-bed-sorted input, and the sorted invocations in a later cell write the
# same *_only_cds.bed outputs - this cell looks superseded; confirm.
bedops --difference final_df_B.bed cds_927.bed > final_df_B_no_cds.bed
bedops --element-of 100% final_df_B.bed cds_927.bed > final_df_B_only_cds.bed
bedops --element-of 100% final_df_G.bed cds_927.bed > final_df_G_only_cds.bed

/bin/sh: bedops: command not found


In [None]:
# Sort the peak and CDS BED files with sort-bed; the sorted_* copies are the
# inputs of the bedops calls in the next cell.
sort-bed final_df_B.bed > sorted_final_df_B.bed
sort-bed final_df_G.bed > sorted_final_df_G.bed
sort-bed cds_927.bed > sorted_cds_927.bed

In [None]:
# Select peaks against the sorted CDS intervals with --element-of 100%.
# The *_only_cds.bed outputs are read by the scatter cells, which use the last
# column (Geneid) to tell CDS peaks from the rest.
# NOTE(review): those scatter cells appear earlier in the notebook than this
# cell, so a fresh top-to-bottom run needs these files to exist already.
bedops --element-of 100% sorted_final_df_B.bed sorted_cds_927.bed > final_df_B_only_cds.bed
bedops --element-of 100% sorted_final_df_G.bed sorted_cds_927.bed > final_df_G_only_cds.bed

In [None]:
# Render the sense figure with Circos. Per the stored log this writes
# ./sense.png and ./sense.svg.
!circos -conf fig_sense.conf -param file=sense

debuggroup summary 0.13s welcome to circos v0.69-8 15 Jun 2019 on Perl 5.032001
debuggroup summary 0.13s current working directory /Users/MTinti/git_projects/motif_dhorn/circos_plot
debuggroup summary 0.13s command /Users/MTinti/miniconda3/envs/work3/bin/circos -conf fig_sense.conf -param file=sense
debuggroup summary 0.13s loading configuration from file fig_sense.conf
debuggroup summary 0.13s found conf file fig_sense.conf
debuggroup summary 0.23s debug will appear for these features: output,summary
debuggroup summary 0.23s bitmap output image ./sense.png
debuggroup summary 0.23s SVG output image ./sense.svg
debuggroup summary 0.23s parsing karyotype and organizing ideograms
debuggroup summary 0.26s karyotype has 11 chromosomes of total size 26,520,491
debuggroup summary 0.26s applying global and local scaling
debuggroup summary 0.27s allocating image, colors and brushes
debuggroup summary 1.35s drawing 11 ideograms of total size 26,520,491
debuggroup summary 1.35s drawing highlights

# Anti-sense

## bla_scatter_cds R

In [None]:

# Anti-sense track, final_df_B, peaks listed in final_df_B_only_cds.bed:
# same log2 ratio as the sense track but with the strand columns swapped
# ('-' peaks read the plus-strand columns and vice versa), clipped to [0, 4].
plus_cols = [
    'BSF_03_F', 'BSF_03_RR',
    'BSF_04_F', 'BSF_04_RR',
    'BSF_05_F', 'BSF_05_RR',
]
control_plus = ['BSF_01_F', 'BSF_01_RR']

minus_cols = [
    'BSF_03_R', 'BSF_03_FR',
    'BSF_04_R', 'BSF_04_FR',
    'BSF_05_R', 'BSF_05_FR',
]
# NOTE(review): control_minus lists the same columns as control_plus. Going by
# the naming of minus_cols it presumably should be BSF_01_R / BSF_01_FR - confirm.
control_minus = ['BSF_01_F', 'BSF_01_RR']

bla_df = pd.read_csv('../final_df_B.csv')
# The last BED column (used as index) holds the Geneid of each retained peak.
temp_df = pd.read_csv('final_df_B_only_cds.bed', sep='\t', header=None, index_col=-1)

bla_df = bla_df[bla_df['Geneid'].isin(temp_df.index.values)]
print(bla_df.shape)

# Anti-sense: '-' oriented peaks are scored with the PLUS-strand columns.
# copy: 'fc' is added below, which must not be done on a slice of bla_df
temp_plus = bla_df[bla_df['orientation'] == '-'].copy()
temp_plus['fc'] = np.log2(
    (temp_plus[plus_cols].sum(axis=1) / 3)
    / (temp_plus[control_plus].sum(axis=1) + 1)
)
temp_plus = temp_plus[temp_plus['fc'] > 0]

# Anti-sense: '+' oriented peaks are scored with the MINUS-strand columns.
temp_minus = bla_df[bla_df['orientation'] == '+'].copy()
temp_minus['fc'] = np.log2(
    (temp_minus[minus_cols].sum(axis=1) / 3)
    / (temp_minus[control_minus].sum(axis=1) + 1)
)
temp_minus = temp_minus[temp_minus['fc'] > 0]

scatter_df = pd.concat([temp_plus, temp_minus])
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)]
# NOTE(review): a "get_region != 'none'" filter used to be evaluated here, but
# its result was discarded, so it never removed a row. The dead statement is dropped.

# NOTE(review): fc > 0 has already removed -inf / NaN fold changes, so the rows
# dropped here are those with a missing value in any other column. Confirm this
# is intended; otherwise use dropna(subset=['fc']).
scatter_df = scatter_df.replace(-np.inf, np.nan).dropna()
# cap the plotted range at 4
scatter_df['fc'] = scatter_df['fc'].clip(0, 4)
scatter_df[['Chr', 'Start', 'End', 'fc']].to_csv('bla_scatter_cds_R.txt', index=False, header=False, sep=' ')
scatter_df['fc'].describe()

(411, 52)
(411, 52)


count    48.000000
mean      1.514707
std       1.225799
min       0.028014
25%       0.535155
50%       1.255950
75%       2.380685
max       4.000000
Name: fc, dtype: float64

## bla_scatter_utr R

In [None]:

# Anti-sense track, final_df_B, peaks NOT listed in final_df_B_only_cds.bed:
# same log2 ratio as the sense track but with the strand columns swapped
# ('-' peaks read the plus-strand columns and vice versa), clipped to [0, 4].
plus_cols = [
    'BSF_03_F', 'BSF_03_RR',
    'BSF_04_F', 'BSF_04_RR',
    'BSF_05_F', 'BSF_05_RR',
]
control_plus = ['BSF_01_F', 'BSF_01_RR']

minus_cols = [
    'BSF_03_R', 'BSF_03_FR',
    'BSF_04_R', 'BSF_04_FR',
    'BSF_05_R', 'BSF_05_FR',
]
# NOTE(review): control_minus lists the same columns as control_plus. Going by
# the naming of minus_cols it presumably should be BSF_01_R / BSF_01_FR - confirm.
control_minus = ['BSF_01_F', 'BSF_01_RR']

bla_df = pd.read_csv('../final_df_B.csv')
# The last BED column (used as index) holds the Geneid of each CDS peak.
temp_df = pd.read_csv('final_df_B_only_cds.bed', sep='\t', header=None, index_col=-1)

# '~' keeps the complement: every peak that is not in the CDS-only list
bla_df = bla_df[~bla_df['Geneid'].isin(temp_df.index.values)]

print(bla_df.shape)

# Anti-sense: '-' oriented peaks are scored with the PLUS-strand columns.
# copy: 'fc' is added below, which must not be done on a slice of bla_df
temp_plus = bla_df[bla_df['orientation'] == '-'].copy()
temp_plus['fc'] = np.log2(
    (temp_plus[plus_cols].sum(axis=1) / 3)
    / (temp_plus[control_plus].sum(axis=1) + 1)
)
temp_plus = temp_plus[temp_plus['fc'] > 0]

# Anti-sense: '+' oriented peaks are scored with the MINUS-strand columns.
temp_minus = bla_df[bla_df['orientation'] == '+'].copy()
temp_minus['fc'] = np.log2(
    (temp_minus[minus_cols].sum(axis=1) / 3)
    / (temp_minus[control_minus].sum(axis=1) + 1)
)
temp_minus = temp_minus[temp_minus['fc'] > 0]

scatter_df = pd.concat([temp_plus, temp_minus])
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)]
# NOTE(review): a "get_region != 'none'" filter used to be evaluated here, but
# its result was discarded, so it never removed a row. The dead statement is dropped.

# NOTE(review): fc > 0 has already removed -inf / NaN fold changes, so the rows
# dropped here are those with a missing value in any other column. Confirm this
# is intended; otherwise use dropna(subset=['fc']).
scatter_df = scatter_df.replace(-np.inf, np.nan).dropna()
# cap the plotted range at 4
scatter_df['fc'] = scatter_df['fc'].clip(0, 4)
scatter_df[['Chr', 'Start', 'End', 'fc']].to_csv('bla_scatter_utr_R.txt', index=False, header=False, sep=' ')
scatter_df['fc'].describe()

(4567, 52)


count    891.000000
mean       1.615359
std        1.186219
min        0.004899
25%        0.610308
50%        1.372923
75%        2.433331
max        4.000000
Name: fc, dtype: float64

## gla_scatter_utr R

In [None]:


# Anti-sense track, final_df_G, peaks NOT listed in final_df_G_only_cds.bed:
# same log2 ratio as the sense track but with the strand columns swapped
# ('-' peaks read the plus-strand columns and vice versa), clipped to [0, 4].
control_plus = ['BSF_01_F', 'BSF_01_RR']

plus_cols = [
    'BSF_06_F', 'BSF_06_RR',
    'BSF_07_F', 'BSF_07_RR',
    'BSF_08_F', 'BSF_08_RR',
]

minus_cols = [
    'BSF_06_R', 'BSF_06_FR',
    'BSF_07_R', 'BSF_07_FR',
    'BSF_08_R', 'BSF_08_FR',
]
# NOTE(review): control_minus lists the same columns as control_plus. Going by
# the naming of minus_cols it presumably should be BSF_01_R / BSF_01_FR - confirm.
control_minus = ['BSF_01_F', 'BSF_01_RR']

bla_df = pd.read_csv('../final_df_G.csv')
# The last BED column (used as index) holds the Geneid of each CDS peak.
temp_df = pd.read_csv('final_df_G_only_cds.bed', sep='\t', header=None, index_col=-1)

# '~' keeps the complement: every peak that is not in the CDS-only list
bla_df = bla_df[~bla_df['Geneid'].isin(temp_df.index.values)]

print(bla_df.shape)

# Anti-sense: '-' oriented peaks are scored with the PLUS-strand columns.
# copy: 'fc' is added below, which must not be done on a slice of bla_df
temp_plus = bla_df[bla_df['orientation'] == '-'].copy()
temp_plus['fc'] = np.log2(
    (temp_plus[plus_cols].sum(axis=1) / 3)
    / (temp_plus[control_plus].sum(axis=1) + 1)
)
temp_plus = temp_plus[temp_plus['fc'] > 0]

# Anti-sense: '+' oriented peaks are scored with the MINUS-strand columns.
temp_minus = bla_df[bla_df['orientation'] == '+'].copy()
temp_minus['fc'] = np.log2(
    (temp_minus[minus_cols].sum(axis=1) / 3)
    / (temp_minus[control_minus].sum(axis=1) + 1)
)
temp_minus = temp_minus[temp_minus['fc'] > 0]

scatter_df = pd.concat([temp_plus, temp_minus])
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)]

print(scatter_df.shape)
# NOTE(review): fc > 0 has already removed -inf / NaN fold changes, so the rows
# dropped here are those with a missing value in any other column. Confirm this
# is intended; otherwise use dropna(subset=['fc']).
scatter_df = scatter_df.replace(-np.inf, np.nan).dropna()
# cap the plotted range at 4
scatter_df['fc'] = scatter_df['fc'].clip(0, 4)
scatter_df[['Chr', 'Start', 'End', 'fc']].to_csv('gla_scatter_utr_R.txt', index=False, header=False, sep=' ')
scatter_df['fc'].describe()

(6331, 52)
(2730, 53)


count    2566.000000
mean        1.399552
std         1.008188
min         0.005782
25%         0.660072
50%         1.199418
75%         1.874469
max         4.000000
Name: fc, dtype: float64

## gla_scatter_cds R

In [None]:


# Anti-sense track, final_df_G, peaks listed in final_df_G_only_cds.bed:
# same log2 ratio as the sense track but with the strand columns swapped
# ('-' peaks read the plus-strand columns and vice versa), clipped to [0, 4].
control_plus = ['BSF_01_F', 'BSF_01_RR']

plus_cols = [
    'BSF_06_F', 'BSF_06_RR',
    'BSF_07_F', 'BSF_07_RR',
    'BSF_08_F', 'BSF_08_RR',
]

minus_cols = [
    'BSF_06_R', 'BSF_06_FR',
    'BSF_07_R', 'BSF_07_FR',
    'BSF_08_R', 'BSF_08_FR',
]
# NOTE(review): control_minus lists the same columns as control_plus. Going by
# the naming of minus_cols it presumably should be BSF_01_R / BSF_01_FR - confirm.
control_minus = ['BSF_01_F', 'BSF_01_RR']

bla_df = pd.read_csv('../final_df_G.csv')
# The last BED column (used as index) holds the Geneid of each retained peak.
temp_df = pd.read_csv('final_df_G_only_cds.bed', sep='\t', header=None, index_col=-1)

print(bla_df.shape)
bla_df = bla_df[bla_df['Geneid'].isin(temp_df.index.values)]
print(bla_df.shape)

# Anti-sense: '-' oriented peaks are scored with the PLUS-strand columns.
# copy: 'fc' is added below, which must not be done on a slice of bla_df
temp_plus = bla_df[bla_df['orientation'] == '-'].copy()
temp_plus['fc'] = np.log2(
    (temp_plus[plus_cols].sum(axis=1) / 3)
    / (temp_plus[control_plus].sum(axis=1) + 1)
)
temp_plus = temp_plus[temp_plus['fc'] > 0]

# Anti-sense: '+' oriented peaks are scored with the MINUS-strand columns.
temp_minus = bla_df[bla_df['orientation'] == '+'].copy()
temp_minus['fc'] = np.log2(
    (temp_minus[minus_cols].sum(axis=1) / 3)
    / (temp_minus[control_minus].sum(axis=1) + 1)
)
temp_minus = temp_minus[temp_minus['fc'] > 0]

scatter_df = pd.concat([temp_plus, temp_minus])
scatter_df = scatter_df[scatter_df['Chr'].isin(main_chro)]

print(scatter_df.shape)
# NOTE(review): fc > 0 has already removed -inf / NaN fold changes, so the rows
# dropped here are those with a missing value in any other column. Confirm this
# is intended; otherwise use dropna(subset=['fc']).
scatter_df = scatter_df.replace(-np.inf, np.nan).dropna()
# cap the plotted range at 4
scatter_df['fc'] = scatter_df['fc'].clip(0, 4)
scatter_df[['Chr', 'Start', 'End', 'fc']].to_csv('gla_scatter_cds_R.txt', index=False, header=False, sep=' ')
scatter_df['fc'].describe()

(8272, 52)
(1941, 52)
(1016, 53)


count    930.000000
mean       1.507967
std        1.076744
min        0.004212
25%        0.652077
50%        1.267292
75%        2.078392
max        4.000000
Name: fc, dtype: float64

## CODE TO RUN WITH CIRCOS ENV

In [None]:
# Render the anti-sense figure with Circos, using fig_anti_sense.conf and
# passing file=anti_sense as a configuration parameter.
circos -conf fig_anti_sense.conf -param file=anti_sense

# Gene track

In [None]:
# Gene track: CDS features on the main chromosomes -> genes.txt
infile = os.path.join('TriTrypDB-51_TbruceiTREU927.gff')
gff = pd.read_table(
    infile, header=None, comment='#',
    names=['seqname', 'source', 'feature', 'start', 'end',
           'score', 'strand', 'frame', 'attribute'],
)
# keep only CDS rows that sit on a main chromosome
is_cds = gff['feature'] == 'CDS'
on_main_chro = gff['seqname'].isin(main_chro)
gff = gff[is_cds & on_main_chro]
gff.to_csv('genes.txt', columns=['seqname', 'start', 'end'],
           index=False, header=False, sep='\t')

In [None]:
# rRNA track: rRNA features on the main chromosomes -> rRNA.txt
infile = os.path.join('TriTrypDB-51_TbruceiTREU927.gff')
gff = pd.read_table(
    infile, header=None, comment='#',
    names=['seqname', 'source', 'feature', 'start', 'end',
           'score', 'strand', 'frame', 'attribute'],
)
# keep only rRNA rows that sit on a main chromosome
is_rrna = gff['feature'] == 'rRNA'
on_main_chro = gff['seqname'].isin(main_chro)
gff = gff[is_rrna & on_main_chro]
gff.to_csv('rRNA.txt', columns=['seqname', 'start', 'end'],
           index=False, header=False, sep='\t')

In [None]:
# RHSP track: genes on the main chromosomes whose attribute text mentions
# 'retrotransposon hot spot' -> RHSP.txt
infile = os.path.join('TriTrypDB-51_TbruceiTREU927.gff')
gff = pd.read_table(
    infile, header=None, comment='#',
    names=['seqname', 'source', 'feature', 'start', 'end',
           'score', 'strand', 'frame', 'attribute'],
)
# gene rows on a main chromosome first ...
gff = gff[(gff['feature'] == 'gene') & gff['seqname'].isin(main_chro)]
# ... then the text match, evaluated on the gene rows only
mentions_rhs = gff['attribute'].str.contains('retrotransposon hot spot')
gff = gff[mentions_rhs]
gff.to_csv('RHSP.txt', columns=['seqname', 'start', 'end'],
           index=False, header=False, sep='\t')

In [None]:
# ESAG track: genes on the main chromosomes whose attribute text mentions
# 'expression site-associated gene' -> ESAG.txt
infile = os.path.join('TriTrypDB-51_TbruceiTREU927.gff')
gff = pd.read_table(
    infile, header=None, comment='#',
    names=['seqname', 'source', 'feature', 'start', 'end',
           'score', 'strand', 'frame', 'attribute'],
)
# gene rows on a main chromosome first ...
gff = gff[(gff['feature'] == 'gene') & gff['seqname'].isin(main_chro)]
# ... then the text match, evaluated on the gene rows only
mentions_esag = gff['attribute'].str.contains('expression site-associated gene')
gff = gff[mentions_esag]
gff.to_csv('ESAG.txt', columns=['seqname', 'start', 'end'],
           index=False, header=False, sep='\t')


In [None]:
# Number of rows per chromosome in the current gff selection
# (after the preceding cell: the ESAG genes).
gff.groupby('seqname').size()

seqname
Tb927_01_v5.1    22
Tb927_02_v5.1     8
Tb927_03_v5.1    20
Tb927_04_v5.1     6
Tb927_05_v5.1    14
Tb927_06_v5.1    13
Tb927_07_v5.1     6
Tb927_08_v5.1     5
Tb927_09_v5.1    36
Tb927_10_v5.1    10
Tb927_11_v5.1    26
dtype: int64

In [None]:
# VSG track: genes on the main chromosomes whose attribute text contains
# 'VSG' -> VSG.txt
infile = os.path.join('TriTrypDB-51_TbruceiTREU927.gff')
# read the GFF file
gff = pd.read_table(infile, header=None, comment='#',
                    names=['seqname', 'source',
                           'feature', 'start',
                           'end', 'score',
                           'strand', 'frame', 'attribute'])
# keep only gene features on the main chromosomes
gff = gff[gff.feature == 'gene']
gff = gff[gff['seqname'].isin(main_chro)]
# The previous pattern '(VSG)' was interpreted as a regular expression, where
# the parentheses form a capture group and do NOT match literal brackets; it
# therefore selected every attribute containing 'VSG'. The plain substring
# match below selects exactly the same rows, without the regex ambiguity and
# without the pandas "match groups" warning.
# NOTE(review): if only the literal text '(VSG)' was wanted, search for
# '(VSG)' with regex=False instead - this will change the selection.
gff = gff[gff['attribute'].str.contains('VSG', regex=False)]
gff[['seqname', 'start', 'end']].to_csv('VSG.txt', index=False, header=False, sep='\t')


In [None]:
# Number of rows per chromosome in the current gff selection
# (after the preceding cell: the VSG genes).
gff.groupby('seqname').size()

seqname
Tb927_01_v5.1      6
Tb927_02_v5.1      4
Tb927_03_v5.1     16
Tb927_04_v5.1     13
Tb927_05_v5.1     17
Tb927_06_v5.1     11
Tb927_07_v5.1      4
Tb927_08_v5.1      6
Tb927_09_v5.1    237
Tb927_10_v5.1     27
Tb927_11_v5.1    136
dtype: int64