In [1]:
%load_ext autoreload
%autoreload 2
from fun import *
os.environ['QT_QPA_PLATFORM']='offscreen'

In [2]:
# Input FASTA files: the AKAP5 protein sequences and their multiple sequence alignment.
seq_filename = 'fasta_files/akap5_seq_prot.fa'
msa_filename = 'fasta_files/msa/akap5_seq_align.fa'

AKAP79_model_species = binding_regions()

# The alignment step is disabled here, so the file already present at
# msa_filename is parsed as-is. Uncomment to regenerate it:
#AKAP79_model_species.muscle_msa(seq_filename, msa_filename)

# Parse the original sequences first, then the alignment.
akap5_seqs, akap5_msa = (
    AKAP79_model_species.parse_fasta_file(fasta_path)
    for fasta_path in (seq_filename, msa_filename)
)

In [3]:
# We will use the human architecture as the reference.
# `ref` is the name handed to find_binding_region() together with each motif
# string and the AKAP5 alignment.
ref = 'Homo_sapiens'

### Binding partners:

##### Having the human architecture, we will look at the alignment in the different reference coordinates:

### 2. Calcineurin (PP2B) - mediated by the PIAIIT motif - [ref](http://slim.icr.ac.uk/motifs/calcineurin/index.php?page=overview#pixixt)

In [4]:
# Calcineurin (PP2B) binding region: the PIAIIT motif as written in the reference sequence.
base_string_piaiit = 'MEPIAIIITDTE'
# File path passed to seq_domain_alignment for this region
# (presumably where the extracted region sequences are written — confirm).
filename = 'fasta_files/binding_regions/AKAP5_PIAIIT.fa'

# Locate the motif for the reference species within the AKAP5 alignment.
start, end = AKAP79_model_species.find_binding_region(
    base_string_piaiit, ref, akap5_msa
)

# Build the per-species region dictionary and the region alignment
# for the start/end window found above.
regions_piaiit_dict, aln_piaiit = seq_domain_alignment(
    akap5_msa, akap5_seqs, start, end, filename, binding_partner='PIAIIT'
)

In [5]:
# Report every species that has a parsed sequence but no entry in the PIAIIT regions.
print(f'The PIAIIT motif is missing in the following model species: {[species for species in akap5_seqs if species not in regions_piaiit_dict]}')

# Show the PIAIIT region alignment in the cell output.
alv.view(aln_piaiit)


The PIAIIT motif is missing in the following model species: ['Xenopus_tropicalis']
Ornithorhy [30m[47m[44mM[47m[45mE[47m[43mP[47m[44mI[47m[44mA[47m[44mI[47m[44mI[47m[44mI[47m[42mT[47m[45mD[47m[42mT[47m[45mE[47m[0m
Monodelphi [30m[47m[44mM[47m[45mE[47m[43mP[47m[44mI[47m[44mA[47m[44mI[47m[44mI[47m[44mV[47m[42mT[47m[45mD[47m[42mT[47m[45mE[47m[0m
Mus_muscul [30m[47m[44mM[47m[45mE[47m[43mP[47m[44mI[47m[44mA[47m[44mI[47m[44mI[47m[44mI[47m[42mT[47m[45mD[47m[42mT[47m[45mE[47m[0m
Homo_sapie [30m[47m[44mM[47m[45mE[47m[43mP[47m[44mI[47m[44mA[47m[44mI[47m[44mI[47m[44mI[47m[42mT[47m[45mD[47m[42mT[47m[45mE[47m[0m
Canis_lupu [30m[47m[44mM[47m[45mE[47m[43mP[47m[44mI[47m[44mA[47m[44mI[47m[44mI[47m[44mI[47m[42mT[47m[45mD[47m[42mT[47m[45mE[47m[0m
Bos_taurus [30m[47m[44mM[47m[45mE[47m[43mP[47m[44mI[47m[44mA[47m[44mI[47m[44mI[47m[44mI[47m[42mT[47m[45mD

#### Construct an HMM for identifying *calcineurin* binding regions in our model species

There is no profile HMM available in Pfam, so as a starting point I selected as seed sequences the ones listed in [ref](http://slim.icr.ac.uk/motifs/calcineurin/index.php?)

In [6]:
hmm_for_piaiit_domain = binding_regions()

# NOTE: this rebinds msa_filename (until now the AKAP5 alignment path) to the
# seed alignment for the PIAIIT profile.
msa_filename = 'fasta_files/msa/PIAIIT_seed_msa.fa'
piaiit_seed_fasta = 'profile_hmm/seed_files/PIAIIT_seed.fa'
piaiit_hmm_file = "profile_hmm/hmm/PIAIIT.hmm"
piaiit_sto_file = "profile_hmm/PIAIIT.sto"

# 1. Align the seed sequences.
#    NOTE(review): the previous comment said "downloaded from Pfam", but the
#    notebook text states that no Pfam profile exists and that the seeds come
#    from the linked literature resource — confirm the provenance.
hmm_for_piaiit_domain.muscle_msa(piaiit_seed_fasta, msa_filename)
# 2. Build the profile HMM from the seed alignment.
hmm_for_piaiit_domain.build_profile_hmm(piaiit_hmm_file, msa_filename)
# 3. Search the AKAP5 sequences (seq_filename) for the binding region.
hmm_for_piaiit_domain.search_binding_regions(piaiit_sto_file, piaiit_hmm_file, seq_filename)
# 4. Read the hits back from the search output file.
piaiit_hmm_hits = read_sto_files(piaiit_sto_file)

In [7]:
# Tabulate the PIAIIT HMM hits per species next to the MSA-derived regions
# (output columns: number of instances, aligned to ref seq in msa).
hmm_hits_analysis_df(regions_piaiit_dict, piaiit_hmm_hits)

Unnamed: 0,specie,number of instances,aligned to ref seq in msa
0,Anolis_carolinensis,0,-
1,Gallus_gallus,0,-
2,Taeniopygia_guttata,0,-
3,Ornithorhynchus_anatinus,1,{True: 1}
4,Monodelphis_domestica,1,{True: 1}
5,Mus_musculus,1,{True: 1}
6,Homo_sapiens,1,{True: 1}
7,Canis_lupus,1,{True: 1}
8,Bos_taurus,1,{True: 1}
9,Sus_scrofa,1,{True: 1}


When all the literature sequences are used as seed sequences, the profile HMM does not capture any binding regions, not even the reference sequence that is included among the seeds (and which is also identical in other species). However, if I build a profile HMM using the human reference as the only seed sequence, the model succeeds in identifying some of the binding regions.

### 3. PKA - mediated by the RII binding domain - [Church et al. eLife (2021)](https://elifesciences.org/articles/68164)

In [8]:
# PKA binding region: the RII binding domain as written in the reference sequence.
base_string_RII_binding = 'LLIETASSLVKNAIQLSIEQL'
# File path passed to seq_domain_alignment for this region
# (presumably where the extracted region sequences are written — confirm).
filename = 'fasta_files/binding_regions/AKAP5_RII_binding.fa'

# Locate the domain for the reference species within the AKAP5 alignment.
start, end = AKAP79_model_species.find_binding_region(
    base_string_RII_binding, ref, akap5_msa
)

# Build the per-species region dictionary and the region alignment
# for the start/end window found above.
regions_dict_RII_binding, aln = seq_domain_alignment(
    akap5_msa, akap5_seqs, start, end, filename, binding_partner='RII_binding'
)

In [9]:
# Report every species that has a parsed sequence but no entry in the RII binding regions.
print(f'The RII_binding region is missing in the following model species: {[species for species in akap5_seqs if species not in regions_dict_RII_binding]}')
# Show the RII binding region alignment in the cell output.
alv.view(aln)

The RII_binding region is missing in the following model species: []
Xenopus_tr [30m[47m[44mL[47m[44mL[47m[44mI[47m[42mT[47m[42mT[47m[44mA[47m[44mA[47m[42mT[47m[44mL[47m[44mV[47m[41mK[47m[41mK[47m[44mV[47m[44mL[47m[42mQ[47m[44mA[47m[42mS[47m[44mI[47m[42mQ[47m[42mQ[47m[44mL[47m[0m
Ornithorhy [30m[47m[44mL[47m[44mL[47m[44mI[47m[45mE[47m[42mT[47m[44mA[47m[42mS[47m[42mS[47m[44mL[47m[44mV[47m[41mK[47m[41mK[47m[44mA[47m[44mI[47m[42mQ[47m[44mL[47m[42mS[47m[44mV[47m[45mE[47m[42mQ[47m[44mL[47m[0m
Monodelphi [30m[47m[44mL[47m[44mL[47m[44mM[47m[45mE[47m[42mT[47m[44mA[47m[42mS[47m[42mS[47m[44mL[47m[44mV[47m[41mK[47m[42mN[47m[44mA[47m[44mI[47m[42mQ[47m[44mL[47m[42mS[47m[44mV[47m[45mE[47m[42mQ[47m[44mL[47m[0m
Mus_muscul [30m[47m[44mL[47m[44mL[47m[44mI[47m[45mE[47m[42mT[47m[44mA[47m[42mS[47m[42mS[47m[44mL[47m[44mV[47m[41mK[47m[42mN[47m[44mA

In [10]:
hmm_for_RII_binding = binding_regions()

# NOTE: this rebinds msa_filename to the seed alignment for the RII binding profile.
msa_filename = 'fasta_files/msa/PF10522_seed_msa.fa'
rii_seed_fasta = 'profile_hmm/seed_files/PF10522_seed.fa'
rii_hmm_file = "profile_hmm/hmm/RII_binding.hmm"
rii_sto_file = "profile_hmm/RII_binding.sto"

# 1. Align the seed sequences (downloaded from Pfam, with the human reference
#    added manually).
hmm_for_RII_binding.muscle_msa(rii_seed_fasta, msa_filename)
# 2. Build the profile HMM from the seed alignment.
hmm_for_RII_binding.build_profile_hmm(rii_hmm_file, msa_filename)
# 3. Search the AKAP5 sequences (seq_filename) for the binding region.
hmm_for_RII_binding.search_binding_regions(rii_sto_file, rii_hmm_file, seq_filename)
# 4. Read the hits back from the search output file.
RII_binding_hmm_hits = read_sto_files(rii_sto_file)


In [11]:
# Tabulate the RII binding HMM hits per species next to the MSA-derived regions
# (output columns: number of instances, aligned to ref seq in msa).
hmm_hits_analysis_df(regions_dict_RII_binding, RII_binding_hmm_hits)

Unnamed: 0,specie,number of instances,aligned to ref seq in msa
0,Xenopus_tropicalis,0,-
1,Ornithorhynchus_anatinus,1,{True: 1}
2,Monodelphis_domestica,1,{True: 1}
3,Mus_musculus,1,{True: 1}
4,Homo_sapiens,1,{True: 1}
5,Canis_lupus,1,{True: 1}
6,Bos_taurus,1,{True: 1}
7,Sus_scrofa,1,{True: 1}
8,Anolis_carolinensis,1,{True: 1}
9,Gallus_gallus,1,{True: 1}


### 4. PKC binding - [Faux et al. J. Biol. Chem. (1997)](https://www.sciencedirect.com/science/article/pii/S0021925818393281)

In [12]:
# PKC binding region as written in the reference sequence.
base_string_PKC = 'KASMLCFKRRKKAAKALKPKAG'
# File path passed to seq_domain_alignment for this region
# (presumably where the extracted region sequences are written — confirm).
filename = 'fasta_files/binding_regions/AKAP5_PKC.fa'

# Locate the region for the reference species within the AKAP5 alignment.
start, end = AKAP79_model_species.find_binding_region(
    base_string_PKC, ref, akap5_msa
)

# Build the per-species region dictionary and the region alignment
# for the start/end window found above.
regions_dict_PKC, aln_PKC = seq_domain_alignment(
    akap5_msa, akap5_seqs, start, end, filename, binding_partner='PKC'
)

In [13]:
# Report every species that has a parsed sequence but no entry in the PKC regions.
print(f'The PKC binding region is missing in the following model species: {[species for species in akap5_seqs if species not in regions_dict_PKC]}')
# Show the PKC region alignment in the cell output.
alv.view(aln_PKC)


The PKC binding region is missing in the following model species: []
Xenopus_tr [30m[47m[41mK[47m[43mP[47m[44mL[47m[41mK[47m[44mI[47m[44mC[47m[44mF[47m[41mK[47m[41mK[47m[41mR[47m[42mS[47m[41mK[47m[44mA[47m[44mL[47m[41mR[47m[41mK[47m[42mT[47m[42mS[47m[45mD[47m[42mS[47m[42mN[47m[45mD[47m[45mD[47m[46mY[47m[41mK[47m[42mS[47m[45mE[47m[0m
Ornithorhy [30m[47m[41mK[47m[42mS[47m[42mS[47m[44mF[47m[44mL[47m[44mC[47m[44mF[47m[41mK[47m[41mR[47m[41mK[47m[41mK[47m[41mK[47m[44mV[47m[44mV[47m[41mK[47m[44mA[47m[42mS[47m[41mK[47m[44mF[47m[42mS[47m[41mK[47m[45mD[47m[45mD[47m[42mS[47m[45mE[47m[43mP[47m[45mD[47m[0m
Monodelphi [30m[47m[41mK[47m[44mA[47m[42mS[47m[44mM[47m[44mI[47m[44mC[47m[44mF[47m[41mK[47m[41mR[47m[41mR[47m[41mR[47m[41mK[47m[42mS[47m[44mV[47m[41mK[47m[44mA[47m[44mL[47m[41mK[47m[47m-[47m[42mS[47m[41mK[47m[42mN[47m[44mC[47m[42mS[47m

In [14]:
hmm_for_PKC = binding_regions()

# NOTE: this rebinds msa_filename to the seed alignment for the PKC binding profile.
msa_filename = 'fasta_files/msa/PKC_binding_seed_msa.fa'
pkc_seed_fasta = 'profile_hmm/seed_files/PKC_binding_seed.fa'
pkc_hmm_file = "profile_hmm/hmm/PKC_binding.hmm"
pkc_sto_file = "profile_hmm/PKC_binding.sto"

# 1. Align the seed sequences (noted as downloaded from Pfam, with the human
#    reference added manually).
#    NOTE(review): the seed file name carries no Pfam accession — confirm the source.
hmm_for_PKC.muscle_msa(pkc_seed_fasta, msa_filename)
# 2. Build the profile HMM from the seed alignment.
hmm_for_PKC.build_profile_hmm(pkc_hmm_file, msa_filename)
# 3. Search the AKAP5 sequences (seq_filename) for the binding region.
hmm_for_PKC.search_binding_regions(pkc_sto_file, pkc_hmm_file, seq_filename)
# 4. Read the hits back from the search output file.
PKC_binding_hmm_hits = read_sto_files(pkc_sto_file)


In [15]:
# Tabulate the PKC binding HMM hits per species next to the MSA-derived regions
# (output columns: number of instances, aligned to ref seq in msa).
hmm_hits_analysis_df(regions_dict_PKC, PKC_binding_hmm_hits)

Unnamed: 0,specie,number of instances,aligned to ref seq in msa
0,Xenopus_tropicalis,0,-
1,Anolis_carolinensis,0,-
2,Ornithorhynchus_anatinus,1,{True: 1}
3,Monodelphis_domestica,1,{True: 1}
4,Mus_musculus,1,{True: 1}
5,Homo_sapiens,1,{True: 1}
6,Canis_lupus,1,{True: 1}
7,Bos_taurus,1,{True: 1}
8,Sus_scrofa,1,{True: 1}
9,Gallus_gallus,1,{True: 1}
