In [3]:
from analyse import *

## Initialize Objects

In [2]:
# Create the two helper objects used throughout the notebook.
# NOTE(review): Ensemble and OrthoDB are presumably provided by the star import
# from `analyse` at the top of the notebook — confirm.
# ENS_Horse is constructed from the "horse_headers.txt" file name; ODB takes no arguments.
ENS_Horse = Ensemble("horse_headers.txt")
ODB = OrthoDB()

# <b>Processing ENSEMBL</b>

### Read horse gene symbols

In [4]:
# Used later in the notebook as a mapping: it is iterated with .items() as
# (transcript_id, gene_symbol) pairs and its .values() are treated as gene symbols.
horse_genes_symbols = ENS_Horse.transcriptome_gene_symbols()

34410 Genes symbols found (34410 uniq) in 42747 reads


### Horse Genes Description

In [5]:
# Used later in the notebook as a lookup keyed by transcript id
# (membership test followed by indexing) that yields a description string.
horse_genes_descs = ENS_Horse.transcriptome_gene_description()

37430 Genes description found (37430 uniq) in 42747 reads


# <b>Processing ODB</b>

## ODB Protein IDs

In [6]:
# File passed as `path` to ODB.odb_genes_info() in the following cells.
odb_genes_path = "horse_odb10v0_genes.tab"

In [7]:
# Extract the "prot_seq_id" field for tax id 9796 from the OrthoDB genes file.
ortho_prot_ids = ODB.odb_genes_info(
    path=odb_genes_path,
    tax_id=9796,
    info="prot_seq_id",
)

(Tax_ID 9796) 21263 prot_seq_id found (20890 uniq) in 21292 record


## ODB Gene Description

In [8]:
# Extract the "description" field for tax id 9796 from the same OrthoDB genes file.
ortho_gene_desc = ODB.odb_genes_info(
    path=odb_genes_path,
    tax_id=9796,
    info="description",
)

(Tax_ID 9796) 21292 description found (19360 uniq) in 21292 record


## <b>Intersection between Gene Symbols and prot_ids</b>

In [9]:
# Values shared by the Ensembl gene-symbol mapping and the OrthoDB prot_seq_id mapping.
horseGenes_with_ortoProtID = set(horse_genes_symbols.values()) & set(ortho_prot_ids.values())
print("There are %d common genes between Ensembl Genes and OrthoDB" % len(horseGenes_with_ortoProtID))
# Share of the distinct Ensembl gene symbols that also appear on the OrthoDB side.
print(
    "%.2f%% of Horse Genes Covered by OrthoDB"
    % (len(horseGenes_with_ortoProtID) / len(set(horse_genes_symbols.values())) * 100)
)

There are 14916 common genes between Ensembl Genes and OrthoDB
94.78% of Horse Genes Covered by OrthoDB


# <b>Analyse by description</b>

In [10]:
from difflib import SequenceMatcher
def similar(a, b):
    """Return the difflib similarity ratio between sequences ``a`` and ``b``.

    The value is ``SequenceMatcher.ratio()`` with no junk filter: a float in
    [0, 1], where 1.0 means the sequences are identical. The result may depend
    on the order of the arguments.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [11]:
# Ensembl gene symbols that have no counterpart among the OrthoDB prot_seq_id values.
unmatched_horse_gene_symbols = set(horse_genes_symbols.values()).difference(
    ortho_prot_ids.values()
)

### <b>Unmatched Ensembl Transcript IDs</b>

In [12]:
# Transcript ids whose gene symbol was not found on the OrthoDB side.
unmatched_horse_transcripts = {
    ens_transcript
    for ens_transcript, symbol in horse_genes_symbols.items()
    if symbol in unmatched_horse_gene_symbols
}

### <b>Unmatched OrthoDB OG IDs</b>

In [13]:
# Keys of ortho_prot_ids (called OG ids in this notebook) whose prot_seq_id
# is not among the symbols shared with Ensembl.
unmatched_ortho_ogIds = {
    odb_key
    for odb_key, prot_seq_id in ortho_prot_ids.items()
    if prot_seq_id not in horseGenes_with_ortoProtID
}

### <b>Get Similarity Matching Results</b>

In [14]:
# Pair unmatched Ensembl transcripts with unmatched OrthoDB entries whose
# descriptions are near-identical (SequenceMatcher ratio above the threshold).
SIMILARITY_THRESHOLD = 0.98

# Resolve the OrthoDB descriptions once instead of once per transcript; ids
# without a description are skipped, mirroring the Ensembl-side guard below.
unmatched_ortho_descs = [
    (og_id, ortho_gene_desc[og_id])
    for og_id in unmatched_ortho_ogIds
    if og_id in ortho_gene_desc
]

similars = {}
for tr_id in unmatched_horse_transcripts:
    if tr_id not in horse_genes_descs:
        continue
    # Strip trailing whitespace only; slicing with [:-1] would also eat a real
    # character whenever the description has no trailing space.
    ensemble_description = horse_genes_descs[tr_id].rstrip()
    for og_id, odb_desc in unmatched_ortho_descs:
        # Argument order is kept (Ensembl first): ratio() may depend on it.
        matcher = SequenceMatcher(None, ensemble_description, odb_desc)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(), so rejecting on them first cannot change which pairs match.
        if (
            matcher.real_quick_ratio() > SIMILARITY_THRESHOLD
            and matcher.quick_ratio() > SIMILARITY_THRESHOLD
            and matcher.ratio() > SIMILARITY_THRESHOLD
        ):
            # Keyed by OrthoDB id: a later matching transcript overwrites an earlier one.
            similars[og_id] = [ensemble_description, odb_desc]
            

In [15]:
# Number of OrthoDB ids for which a near-identical Ensembl description was found.
print("%d Genes matched by description" % len(similars))

135 Genes matched by description
