In [1]:
import pandas as pd
import numpy as np

In [2]:
# genome index file was obtained from 'https://www.ncbi.nlm.nih.gov/genome/152?genome_assembly_id=364693'
# The file is read as tab-separated with no header row, so columns are labelled 0..N-1.
# comment='#' skips lines that start with '#'; pandas also stops parsing a line at any later '#'.
gff = pd.read_table(
    's_enterica_genome.gff',
    sep='\t',
    comment='#',
    header=None,
)

In [3]:
# show every column when a frame is displayed, then preview the first two rows
pd.options.display.max_columns = None
gff.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,NC_003197.2,RefSeq,region,1,4857450,.,+,.,ID=NC_003197.2:1..4857450;Dbxref=taxon:99287;I...
1,NC_003197.2,RefSeq,gene,190,255,.,+,.,ID=gene-STM0001;Dbxref=GeneID:1251519;Name=thr...


In [4]:
# keep only the rows whose feature-type column (2) is "gene"
genes = gff[gff[2].eq("gene")]

In [5]:
# preview the first five gene rows
genes.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
1,NC_003197.2,RefSeq,gene,190,255,.,+,.,ID=gene-STM0001;Dbxref=GeneID:1251519;Name=thr...
3,NC_003197.2,RefSeq,gene,325,2799,.,+,.,ID=gene-STM0002;Dbxref=GeneID:1251520;Name=thr...
6,NC_003197.2,RefSeq,gene,2789,3730,.,+,.,ID=gene-STM0003;Dbxref=GeneID:1251521;Name=thr...
10,NC_003197.2,RefSeq,gene,3722,5020,.,+,.,ID=gene-STM0004;Dbxref=GeneID:1251522;Name=thr...
12,NC_003197.2,RefSeq,gene,5114,5898,.,-,.,ID=gene-STM0005;Dbxref=GeneID:1251523;Name=yaa...


In [6]:
# column at position 8 holds the ID details (semicolon-separated key=value attributes)
ids = genes.iloc[:, 8]

In [9]:
# preview the first five attribute strings
ids.head(5)

1     ID=gene-STM0001;Dbxref=GeneID:1251519;Name=thr...
3     ID=gene-STM0002;Dbxref=GeneID:1251520;Name=thr...
6     ID=gene-STM0003;Dbxref=GeneID:1251521;Name=thr...
10    ID=gene-STM0004;Dbxref=GeneID:1251522;Name=thr...
12    ID=gene-STM0005;Dbxref=GeneID:1251523;Name=yaa...
Name: 8, dtype: object

In [9]:
# function to return gene list
def func1(item):
    """Return the gene name held in the ``Name=`` attribute of a GFF attribute string.

    Parameters
    ----------
    item : str
        Semicolon-separated ``key=value`` attribute string, e.g.
        ``"ID=gene-STM0001;Dbxref=GeneID:1251519;Name=thrL"``.

    Returns
    -------
    str or None
        The text following ``Name=``, or ``None`` when no field starts with ``Name=``.
    """
    prefix = 'Name='
    # Look the attribute up by key instead of assuming it is always the third field:
    # a row without ``Dbxref`` (or with reordered attributes) would otherwise yield
    # a slice of an unrelated field, or an IndexError when fewer than three fields exist.
    for field in item.split(';'):
        if field.startswith(prefix):
            return field[len(prefix):]
    return None

In [10]:
# obtain the gene names from the ID details column derived from the gff file
gene_lst = ids.map(func1)

In [11]:
# preview the first five extracted gene names
gene_lst.head(5)

1     thrL
3     thrA
6     thrB
10    thrC
12    yaaA
Name: 8, dtype: object

In [13]:
# re-select the rows annotated as genes (same filter as the earlier `genes` cell)
genes = gff[gff[2].eq("gene")]

In [14]:
# function to obtain gene IDs from ID details column of gff file
def func2(n):
    """Return the first ';'-separated field of ``n`` with its first 8 characters removed.

    For a field such as ``ID=gene-STM0001`` the 8 removed characters are the
    ``ID=gene-`` prefix, leaving ``STM0001``.
    """
    leading_field, _, _ = n.partition(';')
    return leading_field[8:]

In [15]:
# take the attribute column (position 8) of the gene rows again
ids = genes.iloc[:, 8]

In [16]:
# apply func2 to every attribute string to get the gene IDs
id_lst = ids.map(func2)

In [17]:
# preview the first five gene IDs
id_lst.head(5)

1     STM0001
3     STM0002
6     STM0003
10    STM0004
12    STM0005
Name: 8, dtype: object

In [18]:
# discard columns 1, 5, 6 and 7; columns 0, 2, 3, 4 and 8 remain
df = genes.drop(columns=[1, 5, 6, 7])

In [19]:
# show every column when a frame is displayed, then preview the trimmed frame
pd.options.display.max_columns = None
df.head(5)

Unnamed: 0,0,2,3,4,8
1,NC_003197.2,gene,190,255,ID=gene-STM0001;Dbxref=GeneID:1251519;Name=thr...
3,NC_003197.2,gene,325,2799,ID=gene-STM0002;Dbxref=GeneID:1251520;Name=thr...
6,NC_003197.2,gene,2789,3730,ID=gene-STM0003;Dbxref=GeneID:1251521;Name=thr...
10,NC_003197.2,gene,3722,5020,ID=gene-STM0004;Dbxref=GeneID:1251522;Name=thr...
12,NC_003197.2,gene,5114,5898,ID=gene-STM0005;Dbxref=GeneID:1251523;Name=yaa...


In [80]:
# The S. enterica genome has a chromosome and a plasmid; each appears in column 0
# under its own sequence ID. List the distinct IDs present.
df.loc[:, 0].unique()

array(['NC_003197.2', 'NC_003277.2'], dtype=object)

In [20]:
# relabel the chromosome sequence ID as "chr1"; every other value is kept as is
df[0] = np.where(df[0].eq("NC_003197.2"), "chr1", df[0])

In [85]:
# check the relabelled chromosome rows
df.head(5)

Unnamed: 0,0,2,3,4,8
1,chr1,gene,190,255,ID=gene-STM0001;Dbxref=GeneID:1251519;Name=thr...
3,chr1,gene,325,2799,ID=gene-STM0002;Dbxref=GeneID:1251520;Name=thr...
6,chr1,gene,2789,3730,ID=gene-STM0003;Dbxref=GeneID:1251521;Name=thr...
10,chr1,gene,3722,5020,ID=gene-STM0004;Dbxref=GeneID:1251522;Name=thr...
12,chr1,gene,5114,5898,ID=gene-STM0005;Dbxref=GeneID:1251523;Name=yaa...


In [21]:
# relabel the plasmid sequence ID as "pSLT"; every other value is kept as is
df[0] = np.where(df[0].eq("NC_003277.2"), "pSLT", df[0])

In [22]:
# confirm that only the two new labels remain in column 0
df.loc[:, 0].unique()

array(['chr1', 'pSLT'], dtype=object)

In [34]:
# dataframe merged with concise gene list
# Columns are selected explicitly (0 = replicon label, 3 = start, 4 = end, 8 = ID details)
# so the merged frame always has the layout 0, 3, 4, 8_x, 8_y. Merging all of `df` would
# carry the feature-type column 2 along on a fresh run, and the later assignment of four
# column names would then fail with a length mismatch.
final_df = pd.merge(
    df[[0, 3, 4, 8]],
    gene_lst,
    left_index=True,
    right_index=True,
    validate="1:1",
)

In [36]:
# preview the merged frame
final_df.head(5)

Unnamed: 0,0,3,4,8_x,8_y
1,chr1,190,255,ID=gene-STM0001;Dbxref=GeneID:1251519;Name=thr...,thrL
3,chr1,325,2799,ID=gene-STM0002;Dbxref=GeneID:1251520;Name=thr...,thrA
6,chr1,2789,3730,ID=gene-STM0003;Dbxref=GeneID:1251521;Name=thr...,thrB
10,chr1,3722,5020,ID=gene-STM0004;Dbxref=GeneID:1251522;Name=thr...,thrC
12,chr1,5114,5898,ID=gene-STM0005;Dbxref=GeneID:1251523;Name=yaa...,yaaA


In [37]:
# remove the raw ID details column ('8_x'); the extracted gene name ('8_y') stays
final_df.drop(columns=['8_x'], inplace=True)
final_df.head(5)

Unnamed: 0,0,3,4,8_y
1,chr1,190,255,thrL
3,chr1,325,2799,thrA
6,chr1,2789,3730,thrB
10,chr1,3722,5020,thrC
12,chr1,5114,5898,yaaA


In [38]:
# give the remaining columns descriptive names
final_df.columns = ['chrm', 'start', 'end', 'gene']


Unnamed: 0,chrm,start,end,gene
1,chr1,190,255,thrL
3,chr1,325,2799,thrA
6,chr1,2789,3730,thrB
10,chr1,3722,5020,thrC
12,chr1,5114,5898,yaaA


In [40]:
# first rows located on the chromosome
final_df.loc[final_df["chrm"] == "chr1"].head(5)

Unnamed: 0,chrm,start,end,gene
1,chr1,190,255,thrL
3,chr1,325,2799,thrA
6,chr1,2789,3730,thrB
10,chr1,3722,5020,thrC
12,chr1,5114,5898,yaaA


In [39]:
# first rows located on the plasmid
final_df.loc[final_df["chrm"] == "pSLT"].head(5)

Unnamed: 0,chrm,start,end,gene
14164,pSLT,94,378,PSLT001
14166,pSLT,712,1155,PSLT002
14168,pSLT,1334,1600,repC
14170,pSLT,1723,1845,repA3
14172,pSLT,1850,1927,tap


In [42]:
# write the gene position table to CSV without the index column
final_df.to_csv(
    "250520_salmonella_genes_position.csv",
    index=False,
)

In [26]:
# gene name together with its start and end coordinates
crs_mtx = final_df.loc[:, ["gene", "start", "end"]]
crs_mtx.head(5)

Unnamed: 0,gene,start,end
1,thrL,190,255
3,thrA,325,2799
6,thrB,2789,3730
10,thrC,3722,5020
12,yaaA,5114,5898


In [46]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so any later change made through `df_for_circos` is also visible through `df`.
df_for_circos = df

In [47]:
# relabel the chromosome ("chr1") as "sal1" for CIRCOS annotation
df_for_circos[0] = np.where(df_for_circos[0].eq("chr1"), "sal1", df_for_circos[0])

In [48]:
# relabel the plasmid ("pSLT") as "sal2" for CIRCOS annotation
df_for_circos[0] = np.where(df_for_circos[0].eq("pSLT"), "sal2", df_for_circos[0])

In [49]:
# preview the frame carrying the CIRCOS labels
df_for_circos.head(5)

Unnamed: 0,0,3,4,8
1,sal1,190,255,ID=gene-STM0001;Dbxref=GeneID:1251519;Name=thr...
3,sal1,325,2799,ID=gene-STM0002;Dbxref=GeneID:1251520;Name=thr...
6,sal1,2789,3730,ID=gene-STM0003;Dbxref=GeneID:1251521;Name=thr...
10,sal1,3722,5020,ID=gene-STM0004;Dbxref=GeneID:1251522;Name=thr...
12,sal1,5114,5898,ID=gene-STM0005;Dbxref=GeneID:1251523;Name=yaa...


In [50]:
# dataframe merged with concise gene list
# Only columns 0 (CIRCOS label), 3 (start) and 4 (end) are taken from `df_for_circos`, so the
# merge yields exactly four columns (0, 3, 4 and the gene names). Merging the whole frame
# would also carry the raw ID details column (and possibly the feature-type column), and the
# following assignment of four column names would fail with a length mismatch.
dfc = pd.merge(
    df_for_circos[[0, 3, 4]],
    gene_lst,
    left_index=True,
    right_index=True,
    validate="1:1",
)

In [63]:
# give the merged CIRCOS frame descriptive column names
dfc.columns = ['chrm', 'start', 'end', 'gene']

In [65]:
# keep only the label and the coordinates; the gene name is dropped
circos_gene_list = dfc.drop(columns='gene')

In [68]:
# tab-separated file with the CIRCOS annotation and gene positions (no header, no index)
circos_gene_list.to_csv(
    'genes_sal.txt',
    index=False,
    sep='\t',
    header=False,
)