## Script to extract the sequences of the genes within the candidate regions

#### The gff file of pearl millet used can be found here: https://www.ebi.ac.uk/ena/browser/view/ERZ15184682 
#### The assembly file can be found here: https://www.ebi.ac.uk/ena/browser/view/GCA_947561735


In [4]:
from Bio.Seq import Seq
from Bio import SeqIO
import os
import argparse

# Parse the assembly FASTA and index its records in a dictionary so that a
# chromosome/contig can be looked up by name later on.
assembly_records = SeqIO.parse("pearl_millet_23DB_ONT_assembly.fasta", 'fasta')
dict_contigs = SeqIO.to_dict(assembly_records)

In [5]:
# Annotation file (GFF) and the candidate region to scan.
gff = "pearl_millet_23DB_ONT_assembly_annotation_transfer.gff"

chromosome = "chr3"
start = 101238580
end = 188834466

# dico_genes maps gene ID -> {"posB": ..., "posE": ...}, where posB/posE are
# the start/end columns of the GFF "gene" line, kept as the strings read
# from the file.
dico_genes = {}
with open(gff, "r") as f:
    for line in f:
        # Drop the line terminator first so it can never end up inside the
        # last column (the attributes column the gene ID is parsed from).
        line = line.rstrip("\r\n")
        # Skip blank lines and comment/directive lines.
        if not line or line[0] == "#":
            continue
        e = line.split("\t")
        # The code reads up to the 9th tab-separated column; shorter lines
        # (stray text, embedded sequence lines, ...) are ignored instead of
        # raising IndexError.
        if len(e) < 9:
            continue
        chrms, feature, posB, posE, ID_search = e[0], e[2], e[3], e[4], e[8]
        if feature != "gene" or chrms != str(chromosome):
            continue
        # The gene ID is rebuilt from the 2nd and 3rd "_"-separated pieces of
        # the attributes column.
        # NOTE(review): this assumes the attribute layout of this particular
        # annotation file -- confirm before reusing with another GFF.
        ID_search_s = ID_search.split("_")
        ID_true = ID_search_s[1] + "_" + ID_search_s[2]
        # A gene is kept when its start position lies inside [start, end]
        # (its end position is not tested); only the first line seen for a
        # given ID is stored.
        if ID_true not in dico_genes and int(start) <= int(posB) <= int(end):
            dico_genes[ID_true] = {"posB": posB, "posE": posE}

### Generates a file listing the genes (coordinates and IDs) found within the given region:

In [6]:
# Write one tab-separated line per gene stored in dico_genes:
# chromosome, gene start, gene end, gene ID.
# The output name is built from `chromosome` so that running the notebook on
# another chromosome does not overwrite (or mislabel) the chr3 list; for
# chromosome="chr3" the name is unchanged.
with open("candidate_region_" + str(chromosome) + "_list_of_genes.txt", 'w') as f:
    f.write("chromosome\tstart\tend\tID_genes\n")
    for gene, positions in dico_genes.items():
        f.write(f"{chromosome}\t{positions['posB']}\t{positions['posE']}\t{gene}\n")

In [8]:
# Report how many genes were collected in dico_genes.
gene_count = len(dico_genes)
print("Number of genes within the region:", gene_count)

Number of genes within the region: 842


### Extracts the sequences of the genes and writes them in a file: 

#### Uses the sequences loaded from the FASTA file (stored in dict_contigs)
#### and the start and end positions of the genes held in dico_genes (the same positions written to candidate_region_chrX_list_of_genes.txt above).

In [11]:
# Fetch the sequence of the selected chromosome directly by its record name.
# A missing chromosome now fails here with a KeyError instead of leaving
# sequence_chr undefined (or silently reusing the value left over from a
# previous run of this cell with another chromosome).
sequence_chr = dict_contigs[str(chromosome)].seq

# Write one FASTA record per gene of dico_genes.
# NOTE(review): the sequence is sliced from the chromosome as-is; no strand
# information is used here, so nothing is reverse-complemented -- confirm
# this is what is wanted for genes annotated on the minus strand.
with open(f"genes_sequences_{chromosome}_{start}_{end}.fasta", 'w') as fR:
    for gene in dico_genes:
        name = str(gene)
        # GFF coordinates are 1-based and inclusive on both ends, so the
        # 0-based indices of the first and last base are posB-1 and posE-1.
        B = int(dico_genes[gene]["posB"]) - 1
        E = int(dico_genes[gene]["posE"]) - 1
        # A Python slice excludes its stop index: E + 1 is needed to keep the
        # last base of the gene (slicing [B:E] dropped it).
        seq_gene = sequence_chr[B:E + 1]
        # Header fields stay the 0-based first/last indices, as before.
        fR.write(">" + name + "_" + str(B) + "_" + str(E) + "\n" + str(seq_gene) + "\n")
