In [69]:
import os
from Bio.Blast import NCBIWWW

In [152]:
def readData(filepath):
    """Read a text file and return its lines.

    Args:
        filepath: Path of the file to open in text mode.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(filepath, 'r') as handle:
        return handle.readlines()

def readGenome(folder):
    """Concatenate the lines of every genomic ``.fna`` file found in ``folder``.

    A directory entry is used when its name contains ``.fna`` and does not
    contain ``rna``, ``cds`` or ``all``. Skipping ``all`` keeps a previously
    written ``all.fna`` from being merged back into itself on a re-run.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A single list holding the lines of all selected files, file after
        file, with the files taken in sorted name order.
    """
    excluded = ('rna', 'cds', 'all')
    # os.listdir returns names in arbitrary order; sorting makes the merged
    # genome identical from run to run and machine to machine.
    files = [
        f'{folder}/' + ele for ele in sorted(os.listdir(folder))
        if '.fna' in ele and not any(tag in ele for tag in excluded)
    ]
    genome = []
    for file in files:
        genome.extend(readData(file))

    return genome

def writeAll(folder, genome):
    """Write ``genome`` to ``<folder>/all.fna``, replacing any existing file.

    Args:
        folder: Directory that receives ``all.fna``.
        genome: Iterable of strings written back to back; no separators or
            newlines are added.
    """
    target = f'{folder}/all.fna'
    with open(target, 'w') as out:
        out.writelines(genome)

# Merge every species' genomic FASTA files into a single <folder>/all.fna.
# Each folder's full contents are held in memory as a list of lines before
# being written out.
# NOTE(review): the folder names look like species + NCBI assembly accession
# and are resolved relative to the current working directory — confirm.
for folder in ['human_GCF_000001405.39', 'mouse_GCF_000001635.27', 'elephant_GCF_000001905.1', 'bluewhale_GCF_009873245.2', 'nakedmolerat_GCF_000247695.1']:
    genome = readGenome(folder)
    writeAll(folder, genome)

In [None]:
def openCDS(filepath):
    """Parse a FASTA file into a list of records.

    Args:
        filepath: Path of a FASTA-formatted text file.

    Returns:
        A list of dicts in file order, one per ``>`` header line. Each dict has
        ``'description'`` (the header line including the leading ``>``, without
        the newline) and ``'sequence'`` (the record's sequence lines joined
        together). A file with no content yields an empty list.
    """
    data = []
    description = ''
    chunks = []  # sequence lines of the record being read; joined once per record

    with open(filepath) as f:
        # Stream the file instead of loading every line up front.
        for line in f:
            # Strip only the newline: slicing off the last character would eat
            # a base when the final line has no trailing newline.
            line = line.rstrip('\n')
            if not line:
                # Blank lines carry no data (and have no first character to test).
                continue
            if line[0] == '>':
                if description:
                    data.append({'description': description, 'sequence': ''.join(chunks)})
                    chunks = []
                description = line
            else:
                chunks.append(line)

    # Flush the last record; skip it when the file held nothing at all.
    if description or chunks:
        data.append({'description': description, 'sequence': ''.join(chunks)})

    return data

# Load the coding sequences of each species into memory as lists of
# {'description', 'sequence'} records (see openCDS).
# NOTE(review): paths are relative to the working directory and assume each
# assembly folder contains a cds_from_genomic.fna file — confirm.
human_cds = openCDS('human_GCF_000001405.39/cds_from_genomic.fna')
mouse_cds = openCDS('mouse_GCF_000001635.27/cds_from_genomic.fna')
elephant_cds = openCDS('elephant_GCF_000001905.1/cds_from_genomic.fna')
bluewhale_cds = openCDS('bluewhale_GCF_009873245.2/cds_from_genomic.fna')
nakedmolerat_cds = openCDS('nakedmolerat_GCF_000247695.1/cds_from_genomic.fna')

In [44]:
def finder(query, data):
    """Return the records whose description contains ``query``.

    Args:
        query: Substring to look for (case-sensitive).
        data: Iterable of dicts that each have a ``'description'`` string.

    Returns:
        A new list with the matching records, in their original order.
    """
    return [record for record in data if query in record['description']]

In [63]:
# Spot check: list every human CDS record whose header carries the
# '[gene=TP53]' tag.
finder('[gene=TP53]', human_cds)

[{'description': '>lcl|NC_000017.11_cds_NP_000537.3_90427 [gene=TP53] [db_xref=CCDS:CCDS11118.1,Ensembl:ENSP00000269305.4,GeneID:7157] [protein=cellular tumor antigen p53 isoform a] [protein_id=NP_000537.3] [location=complement(join(7669609..7669690,7670609..7670715,7673535..7673608,7673701..7673837,7674181..7674290,7674859..7674971,7675053..7675236,7675994..7676272,7676382..7676403,7676521..7676594))] [gbkey=CDS]',
  'sequence': 'ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGG

In [162]:
import subprocess
from xml.dom.minidom import parse, parseString

def parseResults(results):
    """Count the ``<Hit>`` elements in an XML report.

    Args:
        results: XML document as ``str`` or ``bytes``, as accepted by
            ``xml.dom.minidom.parseString``.

    Returns:
        The number of ``Hit`` elements anywhere in the document.
    """
    doc = parseString(results)
    return len(doc.getElementsByTagName('Hit'))

def blastSeqDuplicates(sequence, genome):
    """Run ``blastn`` for one sequence against a genome database and count hits.

    The sequence is written to ``query<genome>.fasta`` in the current working
    directory, and that same file is passed to ``blastn`` as the query.

    Args:
        sequence: Nucleotide sequence text to search for.
        genome: Database name; the database is looked up at
            ``databases/<genome>`` relative to the working directory.

    Returns:
        The number of ``Hit`` elements in the XML report, as counted by
        ``parseResults``.

    Raises:
        subprocess.CalledProcessError: if ``blastn`` exits with a non-zero status.
    """
    # Use one path for both writing and querying. The query used to be read
    # from a hard-coded absolute user directory, which matched the written
    # file only when the notebook ran from that exact directory.
    queryPath = f'query{genome}.fasta'
    with open(queryPath, 'w') as file:
        file.write('> Query\n')
        file.write(sequence)
    results = subprocess.check_output([
        '/usr/local/ncbi/blast/bin/blastn',
        '-db',
        f'databases/{genome}',
        '-query',
        queryPath,
        '-evalue',
        '0.05',
        '-word_size',
        '28',
        '-outfmt',
        '5'  # XML report, which parseResults parses
    ])
    return parseResults(results)

# BLAST every human CDS against the `refGenome` database and log the hit count.
# One line per CDS is appended to <refGenome>.txt as `<description>===<count>`.
# The file is opened in append mode on every iteration, so re-running this cell
# (e.g. after an interrupt) adds rows after the ones already written rather
# than replacing them.
# NOTE(review): a later cell reads human/mouse/elephant/bluewhale/nakedmolerat
# .txt files, so this cell was presumably re-run once per value of refGenome —
# confirm.
refGenome = 'elephant'
for cds in human_cds:
    description = cds['description']
    sequence = cds['sequence']
    duplicates = blastSeqDuplicates(sequence, refGenome)
    with open(f'{refGenome}.txt', 'a') as file:
        file.write(f'{description}==={duplicates}\n')

KeyboardInterrupt: 

In [163]:
# Number of human CDS records loaded, i.e. how many BLAST queries a full run needs.
len(human_cds)

123410

In [192]:
# Index the human CDS records by their header line; each entry starts with just
# the sequence and receives per-organism hit counts afterwards.
cdsData = {}

for cds in human_cds:
    description, sequence = cds['description'], cds['sequence']
    cdsData[description] = {'sequence': sequence}

In [193]:
# Attach the per-organism hit counts from <organism>.txt to cdsData.
# Each row has the form `<description>===<count>`; int() ignores the trailing
# newline that readData keeps on each row. If a description occurs more than
# once in a file, the last row wins.
# A description that is not a key of cdsData raises KeyError.
for organism in ['human', 'mouse', 'elephant', 'bluewhale', 'nakedmolerat']:
    data = readData(f'{organism}.txt')
    for row in data:
        description, count = row.split('===')
        cdsData[description][organism] = int(count)

In [194]:
# Flatten cdsData into a list of rows. Each row is a shallow copy of the
# entry plus a 'description' field holding the key without its first
# character (the leading '>' of the FASTA header).
cdsArr = []
for key, val in cdsData.items():
    data = dict(val, description=key[1:])
    cdsArr.append(data)

In [195]:
# Keep only the rows that have a hit count for all five organisms.
cdsArr = [
    ele for ele in cdsArr
    if all(
        organism in ele
        for organism in ('human', 'mouse', 'elephant', 'bluewhale', 'nakedmolerat')
    )
]

In [207]:
def average(arr):
    """Return the arithmetic mean of the numbers in ``arr``.

    Raises:
        ZeroDivisionError: if ``arr`` is empty.
    """
    total = sum(arr)
    return total / len(arr)

def std(arr):
    """Return the population standard deviation of ``arr`` (divides by ``len(arr)``).

    Raises:
        ZeroDivisionError: if ``arr`` is empty.
    """
    count = len(arr)
    mean = sum(arr) / count
    squaredError = sum((value - mean) ** 2 for value in arr)
    return (squaredError / count) ** 0.5

In [210]:
# Score every row: the smallest hit count in the second group minus the
# largest hit count in the first group. A positive score therefore means that
# each of elephant/bluewhale/nakedmolerat has more hits than both human and mouse.
# NOTE(review): the group names suggest a high- vs low-cancer-incidence split —
# confirm the species assignment.
for row in cdsArr:
    human = row['human']
    mouse = row['mouse']
    elephant = row['elephant']
    bluewhale = row['bluewhale']
    nakedmolerat = row['nakedmolerat']

    highCancerGroup = [human, mouse]
    lowCancerGroup = [elephant, bluewhale, nakedmolerat]

    # Earlier mean / (1 + std) estimates were computed here but always
    # overwritten before use; only the max/min comparison ever reached the
    # score, so the dead computations were removed.
    highCancer = max(highCancerGroup)
    lowCancer = min(lowCancerGroup)

    score = lowCancer - highCancer

    row['score'] = score

In [212]:
# Order rows by ascending score, so the highest-scoring rows sit at the end.
cdsArr = sorted(
    cdsArr,
    key=lambda entry: entry['score'],
)

In [217]:
# The list is sorted by ascending score, so the last ten rows are the ten highest-scoring ones.
cdsArr[-10:]

[{'sequence': 'ATGCAGGAGGTAAGAAGAGGAGGGTCTGTGATACATCACAAGGAGGAAGAAGGAGAAGTGAGTCCAAGAAAGAAGGAAAGCAGTGTTGTGCTGTTGGATGAGAGTTCAGGACCACCAAGCCAGCTGCTTTGGACCCGCCAGGATACCCAGCTCCCTCAGGAAAGCGCTCTATTACCTGCTCCATATCCTGCCTTCACTAAAGATGGAAGCCAAGGAAACCTGCCGCAAGCAGATATCACACTAATGAGCCAGGCCCAAGATAGCAGGAGCATATTATTTCAGGAGTCAGTGACATTTGAGGATGTAGCTGTGAACTTCACTAACAGGGAGTGGCAGTGTCTGACCTACGCTCAAAGGCATCTCTATAAGGATGTGATGTTGGAAAATTATGGGAACATGGTATCACTTGGATTTCCATTTCCTAAACCTCCTTTAATCTCTCATCTGGAGCGAGAAGTAGACCCCTGTGTGCAGGATCCACAGGACAGGGAGTCCCTAAGCTGCTCCTACCCAGTGTCAGCTGACAAGATGTGGCCTGAGAATGAAAAGGCAAGTTCACAACAAGAGATTTTTGAAAATGGAGAAGCCTACTGGATGAAATTTAACAGTCTCCTAAAAGTTGATTCCCGGGATCCTAAGGTTAGAGAAGTTTGTGTTCAGGATGTCAAATTAGAGAATCAATGGGAAACATCTATAAGGGAGAAACTGAGAGAAGAGAAAGAAGGCTCTGAGGAAGTGACCTGCAAAAAAGGAAAGAACCAGAAAGTGCTTAGTAAAAACTTGAATCCAAACTCAAAACATAGTCAATGTAATAAAGTTCTTATAGCACAGAAACTCCATGAATGTGCCAGGTGTGGCAAAAACTTCAGTTGGCACTCAGATCTAATTCTCCATGAGCAAATTCATTCTGGTGAGAAACCCCATGTGTGTAATGAGTGTGGGAAAGCATTCAAGACCAGAAATCAGCTTTCTATGCACCGGATAA

In [220]:
# Rows scoring at least 2 whose sequence has exactly one hit in the human genome.
filtered = [
    entry
    for entry in cdsArr
    if entry['score'] >= 2 and entry['human'] == 1
]

In [221]:
# Display the candidate rows selected above.
filtered

[{'sequence': 'ATGGCAGATGAGGAAGAAGACCCCACGTTTGAGGAAGAAAATGAAGAAATTGGAGGAGGTGCAGAAGGTGGACAGGGTAAAAGAAAGAGACTTTTTTCTAAAGAATTGCGATGTATGATGTATGGCTTTGGGGATGACCAGAATCCTTATACTGAGTCAGTGGATATTCTTGAAGATCTTGTCATAGAGTTTATCACTGAAATGACTCACAAGGCAATGTCAATTGGAAGACAAGGTCGAGTACAAGTTGAAGATATCGTCTTCTTGATTCGAAAGGACCCAAGGAAGTTTGCCAGGGTTAAAGACTTGCTTACTATGAATGAAGAATTGAAACGAGCTAGAAAAGCATTTGATGAAGCAAATTATGGATCTTGA',
  'human': 1,
  'mouse': 1,
  'elephant': 3,
  'bluewhale': 4,
  'nakedmolerat': 5,
  'description': 'lcl|NC_000001.11_cds_NP_005636.1_5476 [gene=TAF13] [db_xref=CCDS:CCDS30788.1,Ensembl:ENSP00000355051.4,GeneID:6884] [protein=transcription initiation factor TFIID subunit 13] [protein_id=NP_005636.1] [location=complement(join(109064523..109064693,109066135..109066232,109074987..109075065,109075921..109075947))] [gbkey=CDS]',
  'score': 2},
 {'sequence': 'ATGTGTGGCATTTGTTGTTCTGTAAACTTTTCTGCTGAGCATTTCAGTCAAGATTTAAAAGAGGACTTACTATATAATCTTAAACAGCGGGGACCCAATAGTAGTAAACAATTGTTAAAGTCTGATGTTAACTACCAGTGTTTATTTT