In [12]:
import requests
import warnings
from re import search
from dataclasses import dataclass
from typing import Optional, Union
from pathlib import Path

In [2]:
def download_aaindices_list(index_type: str, path="") -> tuple[bool, int]:
    """Fetch one of the AAindex listing pages and store it as a text file.

    Requests ``https://www.genome.jp/aaindex/AAindex/list_of_<index_type>`` and,
    when the response is OK, writes the response text to
    ``list_of_<index_type>.txt`` inside ``path``.

    Args:
        index_type: One of ``'indices'``, ``'matrices'`` or ``'potentials'``.
        path: Directory to write into. The default ``""`` is resolved by
            ``Path`` relative to the current working directory.

    Returns:
        ``(ok, status_code)`` of the HTTP response, whether or not a file
        was written.

    Raises:
        AssertionError: If ``index_type`` is not one of the accepted values.
    """
    allowed_types = ['indices', 'matrices', 'potentials']
    assert index_type in allowed_types, "Please enter a valid AAIndex. ('indices', 'matrices', 'potentials')"

    response = requests.get(f"https://www.genome.jp/aaindex/AAindex/list_of_{index_type}")
    succeeded = response.ok
    if succeeded:
        # Only a successful response is persisted; failures leave the disk untouched.
        destination = Path(path) / f'list_of_{index_type}.txt'
        with open(destination, 'w') as out_file:
            out_file.write(response.text)
    return succeeded, response.status_code

def download_aaindex(index_number: int, path="") -> tuple[bool, int]:
    """Download one of the three AAindex flat files and save it as text.

    Requests ``https://www.genome.jp/ftp/db/community/aaindex/aaindex<N>``
    and, when the response is OK, writes the response text to
    ``aaindex<N>.txt`` inside ``path``.

    Args:
        index_number: Which AAindex file to fetch; must be 1, 2 or 3.
        path: Directory to write into. The default ``""`` is resolved by
            ``Path`` relative to the current working directory.

    Returns:
        ``(ok, status_code)`` of the HTTP response. This used to be returned
        only on failure (with ``None`` on success); it is now returned on
        every path so callers can check the outcome the same way as with
        ``download_aaindices_list``.

    Raises:
        AssertionError: If ``index_number`` is not 1, 2 or 3.
    """
    assert index_number in [1,2,3], "Please enter a correct AAindex number. (1, 2, or 3)."
    r = requests.get(f'https://www.genome.jp/ftp/db/community/aaindex/aaindex{index_number}')
    if r.ok:
        with open(Path(path).joinpath(f"aaindex{index_number}.txt"), 'w') as f:
            f.write(r.text)
    # Report the outcome on success as well as failure.
    return r.ok, r.status_code

In [3]:
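# Probably not necessary: this fetches a single entry over HTTP, whereas the aaindex1 file can be downloaded once and parsed locally.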
def download_aaindex_from_accession(accession: str) -> Union[dict[str, Optional[float]], tuple[bool, int]]:
    """Fetch a single AAindex entry page and extract its per-residue values.

    Requests ``https://www.genome.jp/entry/aaindex:<accession>`` and pulls the
    two lines that follow the ``I/V`` column header out of the response body.

    Args:
        accession: Accession string inserted into the entry URL.

    Returns:
        On an OK response, a dict mapping each residue letter of
        ``"ARNDCQEGHILKMFPSTWYV"`` (in that order) to the corresponding value;
        ``'NA'`` tokens become ``None``. On a failed response, the tuple
        ``(ok, status_code)``.

    Raises:
        ValueError: If the response is OK but the value block cannot be
            located in the page, or a value token is not numeric.
    """
    aaindex_ordered_residues = "ARNDCQEGHILKMFPSTWYV"
    r = requests.get(f"https://www.genome.jp/entry/aaindex:{accession}")
    if not r.ok:
        # status_code is a plain int attribute; calling it raised TypeError.
        return r.ok, r.status_code

    decoded_content = r.content.decode()
    # Capture the two lines directly after the "I/V" header line.
    regex_matching_values = "I/V\n(.*\n.*)\n"
    match = search(regex_matching_values, decoded_content)
    if match is None:
        raise ValueError(f"Could not locate the index values for accession {accession!r} in the response.")

    index_values = match.group(1).split()
    # 'NA' marks a missing value; map it to None as parse_aaindex1_to_dict does.
    return {
        residue: (None if value == 'NA' else float(value))
        for residue, value in zip(aaindex_ordered_residues, index_values)
    }



In [6]:
def parse_aaindex1_to_dict(path=""):
    """Parse ``aaindex1.txt`` found in ``path`` into a dict keyed by accession.

    The file is split into entries on ``'//\\n'``. Within an entry the first
    character of each row selects how it is handled:

    * ``H``, ``D``, ``R``, ``A``, ``T``, ``J``: stored as text under that
      letter. Following rows that start with whitespace are treated as wrapped
      continuation lines and appended, separated by a single space (they were
      previously dropped, which truncated multi-line fields).
    * ``C``: whitespace-separated ``name value`` pairs, possibly continued on
      following rows, stored as ``{name: float}`` under ``'C'``.
    * ``I``: header row; every later row is read as values and appended to the
      list under ``'I'``, with ``'NA'`` stored as ``None``.

    Rows with any other leading character outside a ``C``/``I`` section are
    ignored. Blank rows are skipped.

    Args:
        path: Directory containing ``aaindex1.txt``. The default ``""`` is
            resolved by ``Path`` relative to the current working directory.

    Returns:
        ``{accession: entry}`` where ``entry`` holds the text fields, ``'C'``,
        ``'I'`` and a boolean ``'has_na'`` (True when ``'I'`` contains None).

    Raises:
        KeyError: If an entry has no ``I`` section.
        IndexError: If an entry's first row has no second token, or a ``C``
            row holds an odd number of tokens.
        ValueError: If a correlation or index value is not numeric.
    """
    # Read everything up front so the file is closed before parsing starts.
    with open(Path(path).joinpath("aaindex1.txt"), 'r') as f:
        split_indices = f.read().split('//\n')

    aaindex1_dict = dict()
    for a in split_indices:
        # Blank rows carry no data and would break the row[0] lookup below.
        rows = [row for row in a.splitlines() if row.strip()]
        if not rows:
            continue

        accession = rows[0].split()[1]
        entry = aaindex1_dict[accession] = dict()

        readingI = False
        readingC = False
        # Key of the text field that may still receive continuation rows.
        text_field = None

        for row in rows:
            tag = row[0]
            if tag in "HDRATJ":
                ld = row.split(maxsplit=1)
                entry[ld[0]] = ld[1] if len(ld) == 2 else ''
                text_field = ld[0]

            elif tag == 'C' and not readingC:
                readingC = True
                text_field = None
                entry['C'] = dict()
                c = row.split()[1:]
                for i in range(0, len(c), 2):
                    entry['C'][c[i]] = float(c[i+1])

            elif tag != 'C' and tag != 'I' and readingC:
                # Continuation of the correlation list.
                c = row.split()
                for i in range(0, len(c), 2):
                    entry['C'][c[i]] = float(c[i+1])

            elif tag == 'I' and not readingI:
                readingC = False
                readingI = True
                text_field = None
                entry['I'] = []

            elif readingI:
                entry['I'] += [float(val) if val != 'NA' else None for val in row.split()]

            elif text_field is not None and tag.isspace():
                # Wrapped line of the preceding text field: glue it on.
                pieces = (entry[text_field].rstrip(), row.strip())
                entry[text_field] = ' '.join(piece for piece in pieces if piece)

            else:
                # Unhandled tag (e.g. a comment row): its own wrapped lines
                # must not be appended to the previous text field.
                text_field = None

        entry['has_na'] = None in entry['I']
    return aaindex1_dict

def parse_aaindex2_to_dict():
    """Placeholder with no implementation yet; calling it does nothing and returns ``None``.

    NOTE(review): presumably meant to mirror ``parse_aaindex1_to_dict`` for the
    aaindex2 file; confirm before implementing.
    """
    return None
    
def parse_aaindex3_to_dict():
    """Placeholder with no implementation yet; calling it does nothing and returns ``None``.

    NOTE(review): presumably meant to mirror ``parse_aaindex1_to_dict`` for the
    aaindex3 file; confirm before implementing.
    """
    return None

@dataclass
class AAindex1:
    """Container for one AAindex1 entry, with a helper to encode sequences.

    All fields are plain values supplied by the caller; nothing is validated
    or derived on construction.
    """

    accession: str
    description: str
    pmid: str
    authors: str
    article_title: str
    reference: str
    # Maps another entry's name to a correlation value.
    correlations: dict[str, float]
    # Maps a residue letter to its value; None marks a missing value.
    index: dict[str, Optional[float]]
    # Whether the entry contains missing values (supplied by the caller).
    has_na: bool
    # Residue letters; defaults to the 20-letter order used elsewhere in this notebook.
    amino_acids: str = "ARNDCQEGHILKMFPSTWYV"

    def seq2vector(self, sequence: str) -> list[Optional[float]]:
        """Translate ``sequence`` into the list of its per-residue index values.

        Args:
            sequence: Iterable of residue letters to look up in ``self.index``.

        Returns:
            One value per character, in order; entries may be ``None`` where
            ``self.index`` stores ``None``.

        Raises:
            KeyError: If a character of ``sequence`` is not a key of ``self.index``.
        """
        lookup = self.index
        vector = []
        for residue in sequence:
            vector.append(lookup[residue])
        return vector

def parse_aaindex1_to_AAindex1(path=""):
    """Parse ``aaindex1.txt`` and wrap every entry in an ``AAindex1`` object.

    Args:
        path: Directory containing ``aaindex1.txt``; forwarded unchanged to
            ``parse_aaindex1_to_dict``. Defaults to ``""`` as before.

    Returns:
        A list of ``AAindex1`` objects, one per parsed entry, in the iteration
        order of the parsed dict. ``index`` pairs the residue letters
        ``"ARNDCQEGHILKMFPSTWYV"`` with the entry's ``'I'`` values in order.

    Raises:
        KeyError: If a parsed entry lacks one of the fields read here.
    """
    amino_acids = "ARNDCQEGHILKMFPSTWYV"
    aaindex1_dict = parse_aaindex1_to_dict(path)
    return [
        AAindex1(
            accession=v['H'],
            description=v['D'],
            pmid=v['R'],
            authors=v['A'],
            article_title=v['T'],
            # 'J' holds the journal reference; 'R' (used here before) is the PMID.
            reference=v['J'],
            correlations=v['C'],
            index=dict(zip(amino_acids, v['I'])),
            has_na=v['has_na'],
        )
        for v in aaindex1_dict.values()
    ]
    
def parse_aaindex2_to_AAindex2():
    """Placeholder with no implementation yet; calling it does nothing and returns ``None``.

    NOTE(review): presumably the aaindex2 counterpart of
    ``parse_aaindex1_to_AAindex1``; no ``AAindex2`` class is defined in this cell.
    """
    return None
    
def parse_aaindex3_to_AAindex3():
    """Placeholder with no implementation yet; calling it does nothing and returns ``None``.

    NOTE(review): presumably the aaindex3 counterpart of
    ``parse_aaindex1_to_AAindex1``; no ``AAindex3`` class is defined in this cell.
    """
    return None

def get_aaindex(aaindex_objects, accession: str) -> Union["AAindex1", "AAindex2", "AAindex3"]:
    """Return the first object in ``aaindex_objects`` whose ``accession`` matches.

    The return annotation uses string forward references because ``AAindex2``
    and ``AAindex3`` are not defined yet; naming them directly raised
    ``NameError`` as soon as this ``def`` was executed.

    Args:
        aaindex_objects: Iterable of objects exposing an ``accession`` attribute.
        accession: Accession to look for (compared with ``==``).

    Returns:
        The first matching object.

    Raises:
        IndexError: If no object has the requested accession (same exception
            type as before, now with a descriptive message).
    """
    for aai in aaindex_objects:
        if aai.accession == accession:
            return aai
    raise IndexError(f"No AAindex entry with accession {accession!r}")

In [7]:
# Build the AAindex1 objects with parse_aaindex1_to_AAindex1 (the name
# parse_aaindex1_to_objects is not defined anywhere in this notebook), then
# encode a short sequence with the first entry as a smoke test.
indices = parse_aaindex1_to_AAindex1()
indices[0].seq2vector('ARND')

[4.35, 4.38, 4.75, 4.76]

In [11]:
# Look up one entry by accession with get_aaindex (the name get_aaindex1 is
# not defined anywhere in this notebook) and show its description and values.
ARGP820102 = get_aaindex(indices, 'ARGP820102')
ARGP820102.description, ARGP820102.index

('Signal sequence helical potential (Argos et al., 1982)',
 {'A': 1.18,
  'R': 0.2,
  'N': 0.23,
  'D': 0.05,
  'C': 1.89,
  'Q': 0.72,
  'E': 0.11,
  'G': 0.49,
  'H': 0.31,
  'I': 1.45,
  'L': 3.23,
  'K': 0.06,
  'M': 2.67,
  'F': 1.96,
  'P': 0.76,
  'S': 0.97,
  'T': 0.84,
  'W': 0.77,
  'Y': 0.39,
  'V': 1.08})