# Coevolution Streptococcus-Lactobacillus

## Obtain data from servers

In [1]:
import requests, sys


In [2]:
def load_taxa(scientific_prefix):
    """
    Look up taxonomy entries whose scientific name starts with a given prefix.

    Queries the EBI Proteins API taxonomy-by-name endpoint (first page, page
    size 100, STARTSWITH search on the scientific name). Every taxonomy id and
    scientific name found is also printed to stdout, one per line.

    Parameters
    ----------
    scientific_prefix : str
        Beginning of the scientific name, e.g. "Streptococcus thermophilus".
        An encoded trailing space ("%20") is appended to it in the URL.

    Returns
    -------
    tuple of (list, list)
        The taxonomy ids and the matching scientific names, in the order in
        which the server returned them.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error (4xx/5xx) status.
    """
    requestURL = "https://www.ebi.ac.uk/proteins/api/taxonomy/name/" + scientific_prefix +\
                "%20?pageNumber=1&pageSize=100&searchType=STARTSWITH&fieldName=SCIENTIFICNAME"

    r = requests.get(requestURL, headers={"Accept": "application/json"})

    # raise_for_status() raises for every non-OK response, so nothing after it
    # would run in the error case; no explicit exit is needed.
    r.raise_for_status()

    # Decode through the response object: the notebook never imports `json`,
    # so json.loads() would fail with NameError on a fresh kernel.
    jsonBody = r.json()

    taxa = []
    names = []
    for taxonomy in jsonBody["taxonomies"]:
        print(taxonomy['taxonomyId'])
        print(taxonomy['scientificName'])
        taxa.append(taxonomy['taxonomyId'])
        names.append(taxonomy['scientificName'])
    return taxa, names

# Fetch the taxonomy ids and names for the S. thermophilus entries and for the
# entries matching the whole "Streptococcus" prefix; both result pairs are
# reused by the later cells.
termophilus_taxa,  termophilus_names = load_taxa("Streptococcus thermophilus")
streptococcus_taxa,  streptococcus_names = load_taxa("Streptococcus")


264199
Streptococcus thermophilus (strain ATCC BAA-250 / LMG 18311)
299768
Streptococcus thermophilus (strain CNRZ 1066)
322159
Streptococcus thermophilus (strain ATCC BAA-491 / LMD-9)
767463
Streptococcus thermophilus (strain ND03)
1042404
Streptococcus thermophilus CNCM I-1630
1051074
Streptococcus thermophilus JIM 8232
1073569
Streptococcus thermophilus MTCC 5460
1073570
Streptococcus thermophilus MTCC 5461
1091038
Streptococcus thermophilus DSM 20617
1187956
Streptococcus thermophilus MN-ZLW-002
1263110
Streptococcus thermophilus CAG:236
1268061
Streptococcus thermophilus DGCC 7710
1408178
Streptococcus thermophilus ASCC 1275
1415776
Streptococcus thermophilus TH1435
1423145
Streptococcus thermophilus TH1436
1433288
Streptococcus thermophilus MTH17CL396
1433289
Streptococcus thermophilus M17PTZA496
1435972
Streptococcus thermophilus TH985
1435974
Streptococcus thermophilus TH982
1435981
Streptococcus thermophilus 1F8CT
1436725
Streptococcus thermophilus TH1477
1302
Streptococcus go

In [6]:
def load_proteome(taxids, size=10, protein=("LDH",)):
    """
    Download protein entries for a set of taxa as FASTA text.

    Builds a request to the EBI Proteins API `proteins` endpoint (offset 0,
    `reviewed=false`) and returns the body of the response. The comma-joined
    taxonomy ids and the final request URL are printed to stdout.

    Parameters
    ----------
    taxids : iterable
        Taxonomy ids; each one is converted with str() and joined with commas.
    size : int, optional
        Value of the `size` query parameter. Callers in this notebook pass -1,
        presumably meaning "no limit" -- TODO confirm against the API docs.
    protein : iterable of str or None, optional
        Gene names sent as a comma-separated `gene` filter. An empty sequence
        or None sends no gene filter at all.

    Returns
    -------
    str
        The response body, requested with "Accept: text/x-fasta".

    Raises
    ------
    requests.HTTPError
        If the server answers with an error (4xx/5xx) status.
    """
    taxids_str = ",".join(str(x) for x in taxids)
    print(taxids_str)
    requestURL = "https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=" + str(size) + "&taxid=" +\
                    taxids_str + "&reviewed=false"
    # Truthiness covers [], () and None alike; comparing against [] only
    # recognised an empty list.
    if protein:
        requestURL += "&gene=" + ",".join(protein)
    print(requestURL)
    r = requests.get(requestURL, headers={"Accept": "text/x-fasta"})

    # raise_for_status() raises for every non-OK response, so no explicit exit
    # is needed after it.
    r.raise_for_status()

    return r.text

# Only the first N_TAXIDS taxonomy ids of each group are queried, and the same
# gene filter is used for both groups.
N_TAXIDS = 19
GENES = ["LDH", "CAS2", "CAS3"]

termophilus_taxids = termophilus_taxa[:N_TAXIDS]
streptococcus_taxids = streptococcus_taxa[:N_TAXIDS]
print(streptococcus_taxids)
print(termophilus_taxids)

# size=-1 is forwarded unchanged as the `size` query parameter.
streptococcus_proteome = load_proteome(streptococcus_taxids, size=-1, protein=GENES)
termophilus_proteome = load_proteome(termophilus_taxids, size=-1, protein=GENES)


[1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1313, 1314, 1317, 1318, 1319, 1320, 1324, 1325, 1326]
[264199, 299768, 322159, 767463, 1042404, 1051074, 1073569, 1073570, 1091038, 1187956, 1263110, 1268061, 1408178, 1415776, 1423145, 1433288, 1433289, 1435972, 1435974]
1302,1303,1304,1305,1306,1307,1308,1309,1310,1311,1313,1314,1317,1318,1319,1320,1324,1325,1326
https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&taxid=1302,1303,1304,1305,1306,1307,1308,1309,1310,1311,1313,1314,1317,1318,1319,1320,1324,1325,1326&reviewed=false&gene=LDH,CAS2,CAS3
264199,299768,322159,767463,1042404,1051074,1073569,1073570,1091038,1187956,1263110,1268061,1408178,1415776,1423145,1433288,1433289,1435972,1435974
https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&taxid=264199,299768,322159,767463,1042404,1051074,1073569,1073570,1091038,1187956,1263110,1268061,1408178,1415776,1423145,1433288,1433289,1435972,1435974&reviewed=false&gene=LDH,CAS2,CAS3


In [7]:
import re

def proteome2dict(proteome_fasta):
    """
    Group the records of a multi-FASTA string by gene name.

    The gene name is taken from the `GN=<name>` field of each header line and
    upper-cased, so "ldh" and "LDH" end up under the same key. This is the
    basis for the clustalw alignments and tree generation.

    Parameters
    ----------
    proteome_fasta : str
        FASTA text. Blank lines are ignored, as is anything that appears
        before the first header line.

    Returns
    -------
    dict
        Maps each upper-cased gene name to the list of its FASTA records.
        A record is the header line followed by its sequence lines, each
        terminated by a newline. Records whose header has no `GN=` field
        are skipped, because there is no key to file them under.
    """
    proteome = {}
    key = None          # gene name of the record being read; None while skipping
    record = []         # lines (newline-terminated) of the record being read
    for line in proteome_fasta.splitlines():
        if not line:
            continue
        if line[0] == ">":
            # A new header closes the previous record, if one was being kept.
            if key is not None:
                proteome.setdefault(key, []).append("".join(record))
            # Raw string: '\w' in a plain literal is an invalid escape sequence.
            search_gene_name = re.search(r'GN=(\w*)', line)
            key = search_gene_name.group(1).upper() if search_gene_name else None
            record = [line + '\n']
        elif key is not None:
            record.append(line + '\n')
    # Store the last record, which no later header will close.
    if key is not None:
        proteome.setdefault(key, []).append("".join(record))
    return proteome

4840


In [14]:
# Phylo tree with clustalw. We need to measure the substitution rate.
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from Bio import Phylo
from io import StringIO
import os
from Bio.Align.Applications import ClustalwCommandline
# Relative path of the ClustalW executable used for the alignments.
CLUSTALW = r"./clustalw2"
# Stop here with a clear message if the executable is not present.
assert os.path.isfile(CLUSTALW), "Clustal W executable missing"
# Default figure size and font size for the plots in this notebook
# (presumably chosen so that drawn trees stay legible -- TODO confirm).
plt.rcParams["figure.figsize"] = (20,30)
matplotlib.rc('font', size=12)
        
def _sum_nonzero_branch_lengths(newick_text):
    """Return (sum, count) of the non-zero numbers that follow a ':' in the text."""
    total = 0.0
    count = 0
    # '+' instead of '*' so that a ':' not followed by a number yields no
    # empty token (float("") would raise ValueError).
    for token in re.findall(r':([-.0-9]+)', newick_text):
        value = float(token)
        # Compare numerically so any spelling of zero is skipped, not only
        # the exact text "0.00000".
        if value != 0.0:
            total += value
            count += 1
    return total, count


def compute_mean_subst_rate(proteome, verbose=False, show_tree=False):
    """
    Run ClustalW on a FASTA file and average the branch lengths of its tree.

    Runs ClustalW on `<proteome>.fasta`, then reads `<proteome>.dnd` and
    averages every non-zero branch length found in it.

    Parameters
    ----------
    proteome : str
        Path of the input file without its ".fasta" extension.
    verbose : bool, optional
        If true, print the summed length, the number of branches and the mean.
    show_tree : bool, optional
        If true, parse `<proteome>.dnd` as newick and draw it with Bio.Phylo.

    Returns
    -------
    float
        Mean of the non-zero branch lengths, or 0.0 when the tree has no
        non-zero branch (this used to raise ZeroDivisionError).
    """
    clustalw_cline = ClustalwCommandline(CLUSTALW, infile=proteome + ".fasta")
    clustalw_cline()
    # Context manager so the handle is closed even if reading fails.
    with open(proteome + ".dnd", "r") as f:
        s_tree = f.read()

    branch_len, num_branches = _sum_nonzero_branch_lengths(s_tree)
    mean_branch_len = branch_len / num_branches if num_branches else 0.0

    if verbose: print(branch_len, num_branches, mean_branch_len)
    if show_tree:
        tree = Phylo.read(proteome + ".dnd", "newick")
        Phylo.draw(tree)
    return mean_branch_len

def compute_subst_rates(proteome, proteome_name, verbose=False):
    """
    Compute the mean substitution rate of every protein in a proteome.

    For each protein with at least three sequences, the sequences are written
    to `<proteome_name>_<protein>.fasta` in the current directory and that
    file is handed to compute_mean_subst_rate().

    Parameters
    ----------
    proteome : dict
        Maps a protein name to the list of its FASTA records (strings).
    proteome_name : str
        Prefix for the names of the generated files.
    verbose : bool, optional
        If true, print each protein name and the FASTA text written for it.

    Returns
    -------
    dict
        Maps each processed protein name to the value returned by
        compute_mean_subst_rate(). Proteins with fewer than three sequences
        are left out.
    """
    subst_rates = {}
    for protein, sequences in proteome.items():
        if verbose: print(protein)
        # Only for proteins with enough sequences to make a tree
        if len(sequences) < 3:
            continue
        protein_sequence = "".join(sequences)
        fasta_file_name = proteome_name + "_" + protein
        if verbose: print(protein_sequence)
        # Context manager so the file is flushed and closed even if the write
        # fails, before the next step reads it back.
        with open(fasta_file_name + ".fasta", "w") as f:
            f.write(protein_sequence)
        subst_rates[protein] = compute_mean_subst_rate(fasta_file_name)
    return subst_rates


# Turn each FASTA proteome into a dictionary keyed by gene name
proteome_termophilus = proteome2dict(termophilus_proteome)
proteome_streptococcus = proteome2dict(streptococcus_proteome)

# Compute the mean branch length of every protein, for each group
subst_rates_groups = {
    "termophilus": compute_subst_rates(proteome_termophilus, "termophilus", verbose=False),
    "streptococcus": compute_subst_rates(proteome_streptococcus, "streptococcus", verbose=False),
}
print(subst_rates_groups)

# Compute branch ratios
# Compute mean of branch ratios and standard deviation
# Obtain the most extreme values. These are the proteins whose substitution rate may have slowed down or otherwise shifted from its initial state

{'termophilus': {'CAS2': 0.10117692307692308, 'LDH': 0.007116666666666667}, 'streptococcus': {'CAS2': 0.05127011363636366, 'LDHA': 0.09308272727272726, 'LDH_2': 0.1056288888888889, 'LDH_1': 0.12399615384615387, 'LDHA_1': 0.14062904761904763, 'LDHA_3': 0.1900181818181818, 'LDH': 0.038460561797752806, 'LDHA_2': 0.13067238095238096, 'LDHD': 0.04888}}


In [73]:
%%bash
# Convert PY_REST.ipynb to LaTeX with the report.tplx template, then run
# pdflatex on the result. Standard output of both commands is discarded
# (1> /dev/null); standard error is still shown.
#cd /Users/nandoide/Desktop/uni/STRBI.practical
jupyter nbconvert --to=latex --template=~/report.tplx PY_REST.ipynb 1> /dev/null
# -shell-escape lets LaTeX run external commands; presumably the template
# needs it -- TODO confirm.
pdflatex -shell-escape PY_REST 1> /dev/null

[NbConvertApp] Converting notebook PY_REST.ipynb to latex
[NbConvertApp] Support files will be in PY_REST_files/
[NbConvertApp] Making directory PY_REST_files
[NbConvertApp] Making directory PY_REST_files
[NbConvertApp] Writing 50890 bytes to PY_REST.tex
