# Bioinformatics

**University of Tehran, College of Farabi**

**Professor: Dr. Hasan Pezeshki**

**Student: A. Nazerpanahi**

## Prepare Environment

In [1]:
# Install the third-party packages for this notebook into the kernel's
# environment (pandas and more_itertools are imported further down).
%pip install pandas numpy more_itertools

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Locations shared by the shell cells (as $VAR / ${VAR}) and by the Python
# cells (read back through os.environ).
%env PROTEINS_FILE proteins.txt
%env PDB_DIR pdbs/
%env FASTA_DIR fasta/
%env SS_DIR ss/

env: PROTEINS_FILE=proteins.txt
env: PDB_DIR=pdbs/
env: FASTA_DIR=fasta/
env: SS_DIR=ss/


In [3]:
# Create the three working directories; -p makes this a no-op when they exist.
!mkdir -p $FASTA_DIR $SS_DIR $PDB_DIR

## Download PDB files

Navigate to [RCSB](https://rcsb.org) and, in the `Advanced Search` section, search for proteins with only one chain.

![rcsb advance search](images/rcsb_advance_search.png)

Then copy the `Entry ID`s from the results page into the `proteins.txt` file.

![rcsb search result](images/rcsb_search_results.png)

In [4]:
# Remove every whitespace character inside each line of the ids file (in place).
!sed -i 's/\s//g' ${PROTEINS_FILE}
# !rm -rf ${PDB_DIR}/*
# Fetch the entries listed in the ids file into PDB_DIR.
# NOTE(review): flag meanings (-f ids file, -p PDB format, -o output dir) are
# inferred from the recorded output; confirm against batch_download.sh.
!./batch_download.sh -f ${PROTEINS_FILE} -p -o ${PDB_DIR}
# Decompress next to the archives; -k keeps the .gz files, --force overwrites
# .pdb files left over from a previous run.
!gunzip -k --force ${PDB_DIR}/*.pdb.gz
!ls ${PDB_DIR}

Downloading https://files.rcsb.org/download/3JSN.pdb.gz to pdbs//3JSN.pdb.gz
Downloading https://files.rcsb.org/download/101M.pdb.gz to pdbs//101M.pdb.gz
Downloading https://files.rcsb.org/download/102L.pdb.gz to pdbs//102L.pdb.gz
Downloading https://files.rcsb.org/download/102M.pdb.gz to pdbs//102M.pdb.gz
Downloading https://files.rcsb.org/download/103L.pdb.gz to pdbs//103L.pdb.gz
Downloading https://files.rcsb.org/download/103M.pdb.gz to pdbs//103M.pdb.gz
Downloading https://files.rcsb.org/download/104M.pdb.gz to pdbs//104M.pdb.gz
Downloading https://files.rcsb.org/download/105M.pdb.gz to pdbs//105M.pdb.gz
Downloading https://files.rcsb.org/download/106M.pdb.gz to pdbs//106M.pdb.gz
Downloading https://files.rcsb.org/download/107L.pdb.gz to pdbs//107L.pdb.gz
Downloading https://files.rcsb.org/download/107M.pdb.gz to pdbs//107M.pdb.gz
Downloading https://files.rcsb.org/download/108L.pdb.gz to pdbs//108L.pdb.gz
Downloading https://files.rcsb.org/download/108M.pdb.gz to pdbs//108M.pdb.gz

## Functions to parse PDB files

In [5]:
from typing import *
import os
import itertools
import re
from more_itertools import flatten
from collections import Counter

In [6]:
# Three-letter residue name -> one-letter amino-acid code.
# Used to translate SEQRES residue names into sequence strings and to seed the
# per-amino-acid frequency columns.
# https://www.ddbj.nig.ac.jp/ddbj/code-e.html
AMINO_ACIDS = {
    'ALA': 'A',  # Alanine
    'ARG': 'R',  # Arginine
    'ASN': 'N',  # Asparagine
    'ASP': 'D',  # Aspartic acid
    'CYS': 'C',  # Cysteine
    'GLN': 'Q',  # Glutamine
    'GLU': 'E',  # Glutamic acid
    'GLY': 'G',  # Glycine
    'HIS': 'H',  # Histidine
    'ILE': 'I',  # Isoleucine
    'LEU': 'L',  # Leucine
    'LYS': 'K',  # Lysine
    'MET': 'M',  # Methionine
    'PHE': 'F',  # Phenylalanine
    'PRO': 'P',  # Proline
    'PYL': 'O',  # Pyrrolysine
    'SER': 'S',  # Serine
    'SEC': 'U',  # Selenocysteine
    'THR': 'T',  # Threonine
    'TRP': 'W',  # Tryptophan
    'TYR': 'Y',  # Tyrosine
    'VAL': 'V',  # Valine
    'ASX': 'B',  # Aspartic acid or Asparagine
    'GLX': 'Z',  # Glutamic acid or Glutamine
    'XAA': 'X',  # Any amino acid
    'XLE': 'J',  # Leucine or Isoleucine
}

In [7]:
def read_proteins_file(proteins_file_path: str):
    """Read PDB entry ids from a text file.

    The file may hold one id per line, several comma-separated ids per line,
    or a mix of both.

    Args:
        proteins_file_path: Path of the text file to read.

    Returns:
        A list of the non-empty ids in file order, each stripped of
        surrounding whitespace.
    """
    with open(proteins_file_path, 'r') as f:
        # Iterate the handle directly instead of materialising all lines first.
        fields = itertools.chain.from_iterable(line.split(',') for line in f)
        # Strip every id so that "101M, 102L" or stray line-ending characters
        # do not leak whitespace into the ids (and into paths built from them).
        stripped = (field.strip() for field in fields)
        return [field for field in stripped if field]

In [8]:
# Pull the locations configured with %env at the top of the notebook into
# Python variables; a missing variable raises KeyError here.
proteins_file = os.environ['PROTEINS_FILE']
pdb_dir = os.environ['PDB_DIR']
fasta_dir = os.environ['FASTA_DIR']
secondary_structure_dir = os.environ['SS_DIR']

In [9]:
def _pdb_int_field(line: str, start: int, end: int) -> int:
    """Return the integer held in ``line[start:end]``, or -1 when the field is blank."""
    return int(line[start:end].strip() or '-1')


def _pdb_text_field(line: str, start: int, end: int) -> str:
    """Return ``line[start:end]`` stripped; slicing never raises on a short line."""
    return line[start:end].strip()


def parse_ss_line(line: str):
    """Parse one fixed-width HELIX, SHEET or TURN record of a PDB file.

    Slice bounds are 0-based and half-open, i.e. PDB columns ``a-b`` are read
    as ``line[a-1:b]``.

    Args:
        line: One raw line of a PDB file.

    Returns:
        A dict describing the record (``record_name`` is 'HELIX', 'SHEET' or
        'TURN'), or None when the line is not one of these record types.
        Blank numeric fields are reported as -1 and blank text fields as ''.

    Raises:
        ValueError: If a numeric field holds non-numeric text.
    """
    if line.startswith('HELIX '):
        return {
            'record_name': 'HELIX',
            'serial_number': _pdb_int_field(line, 7, 10),
            'id': _pdb_text_field(line, 11, 14),
            'initial_residue_name': _pdb_text_field(line, 15, 18),
            'initial_chain_id': _pdb_text_field(line, 19, 20),
            'initial_sequence_number': _pdb_int_field(line, 21, 25),
            'initial_residue_insertion_code': _pdb_text_field(line, 25, 26),
            'terminal_residue_name': _pdb_text_field(line, 27, 30),
            'terminal_chain_id': _pdb_text_field(line, 31, 32),
            # Columns 34-37: the slice must start at index 33, otherwise the
            # leading digit of residue numbers >= 1000 is lost.
            'terminal_sequence_number': _pdb_int_field(line, 33, 37),
            'terminal_residue_insertion_code': _pdb_text_field(line, 37, 38),
            'helix_class': _pdb_int_field(line, 38, 40),
            'comment': _pdb_text_field(line, 40, 70),
            'length': _pdb_int_field(line, 71, 76),
        }
    elif line.startswith('SHEET '):
        return {
            'record_name': 'SHEET',
            'serial_number': _pdb_int_field(line, 7, 10),
            'id': _pdb_text_field(line, 11, 14),
            'number_of_strands': _pdb_int_field(line, 14, 16),
            'initial_residue_name': _pdb_text_field(line, 17, 20),
            'initial_chain_id': _pdb_text_field(line, 21, 22),
            # Same blank-field fallback as every other numeric field.
            'initial_sequence_number': _pdb_int_field(line, 22, 26),
            'initial_residue_insertion_code': _pdb_text_field(line, 26, 27),
            'terminal_residue_name': _pdb_text_field(line, 28, 31),
            'terminal_chain_id': _pdb_text_field(line, 32, 33),
            'terminal_sequence_number': _pdb_int_field(line, 33, 37),
            'terminal_residue_insertion_code': _pdb_text_field(line, 37, 38),
            'sense': _pdb_int_field(line, 38, 40),
            # The registration fields below may be blank or missing when the
            # line is not padded to full width.
            'current_atom': _pdb_text_field(line, 41, 45),
            'current_residue_name': _pdb_text_field(line, 45, 48),
            'current_chain_id': _pdb_text_field(line, 49, 50),
            'current_residue_sequence': _pdb_int_field(line, 50, 54),
            'current_insertion_code': _pdb_text_field(line, 54, 55),
            'previous_atom': _pdb_text_field(line, 56, 60),
            'previous_residue_name': _pdb_text_field(line, 60, 63),
            'previous_chain_id': _pdb_text_field(line, 64, 65),
            'previous_residue_sequence': _pdb_int_field(line, 65, 69),
            'previous_insertion_code': _pdb_text_field(line, 69, 70),
        }
    elif line.startswith('TURN '):
        return {
            'record_name': 'TURN',
            'serial_number': _pdb_int_field(line, 7, 10),
            'id': _pdb_text_field(line, 11, 14),
            'initial_residue_name': _pdb_text_field(line, 15, 18),
            'initial_chain_id': _pdb_text_field(line, 19, 20),
            'initial_sequence_number': _pdb_int_field(line, 20, 24),
            'initial_residue_insertion_code': _pdb_text_field(line, 24, 25),
            'terminal_residue_name': _pdb_text_field(line, 26, 29),
            'terminal_chain_id': _pdb_text_field(line, 30, 31),
            'terminal_sequence_number': _pdb_int_field(line, 31, 35),
            'terminal_residue_insertion_code': _pdb_text_field(line, 35, 36),
            'comment': _pdb_text_field(line, 40, 70),
        }
    return None

In [10]:
def parse_seqres_line(line: str):
    """Parse one fixed-width SEQRES record of a PDB file.

    Returns a dict with the record's serial number, chain id, declared residue
    count and the 13 residue-name slots of the line (slots past the end of the
    line come back as empty strings). Lines that do not start with 'SEQRES'
    yield None.
    """
    if not line.startswith('SEQRES'):
        return None

    record = {'record_name': 'SEQRES'}
    record['serial_number'] = int(line[7:10].strip() or '-1')
    record['chain_id'] = line[11].strip()
    record['number_of_residues'] = int(line[13:17].strip() or '-1')

    # Residue names sit in 3-character slots that start every 4 columns.
    residue_names = []
    for slot_start in range(19, 71, 4):
        residue_names.append(line[slot_start:slot_start + 3].strip())
    record['residues'] = residue_names
    return record

In [11]:
def parse_line(line: str):
    """Dispatch one PDB line to the matching record parser.

    Lines starting with SHEET, HELIX or TURN go to ``parse_ss_line``; lines
    starting with SEQRES go to ``parse_seqres_line``. Any other line yields
    None.
    """
    if line.startswith(('SHEET', 'HELIX', 'TURN')):
        return parse_ss_line(line)
    if line.startswith('SEQRES'):
        return parse_seqres_line(line)
    return None

In [12]:
def parse_single_chain_pdb_file(pdb_file_path: str):
    """Parse a PDB file into a list of record dicts.

    Every line is passed through ``parse_line``; lines it does not recognise
    (for which it returns None) are dropped. File order is preserved.
    """
    records = []
    with open(pdb_file_path, 'r') as pdb_file:
        for raw_line in pdb_file.readlines():
            parsed = parse_line(raw_line)
            if parsed is not None:
                records.append(parsed)
    return records

In [13]:
def extract_sequence_from_pdb_records(pdb_records: List[dict]):
    """Build the one-letter sequence of every chain from its SEQRES records.

    Records are accumulated per chain in the order they appear, so a chain's
    records do not have to be contiguous in ``pdb_records``. (Grouping with
    ``itertools.groupby`` on unsorted input and storing the groups in a dict
    keeps only the last run of each key.)

    Args:
        pdb_records: Parsed PDB records; entries whose ``record_name`` is not
            'SEQRES' are ignored.

    Returns:
        A dict mapping chain id to ``(sequence, number_of_residues)``.
        ``number_of_residues`` is the count declared by the chain's first
        SEQRES record, not ``len(sequence)``. Residue names missing from
        ``AMINO_ACIDS`` are written as 'X'.
    """
    letters_by_chain = {}
    declared_length = {}
    for record in pdb_records:
        if record.get('record_name') != 'SEQRES':
            continue
        chain_id = record['chain_id']
        declared_length.setdefault(chain_id, int(record['number_of_residues']))
        letters_by_chain.setdefault(chain_id, []).extend(
            AMINO_ACIDS.get(residue, "X")
            for residue in record['residues']
            if residue.strip()  # skip the empty slots of a short final line
        )
    return {
        chain_id: ("".join(letters), declared_length[chain_id])
        for chain_id, letters in letters_by_chain.items()
    }

In [14]:
def get_secondary_structure_map(pdb_records: List[dict]):
    """Group HELIX / SHEET / TURN records by structure type and chain pair.

    Records are appended to their group in input order, so records sharing a
    key do not have to be adjacent. (``itertools.groupby`` only merges
    consecutive items; collecting its groups in a dict silently dropped every
    run of a key except the last.)

    Args:
        pdb_records: Parsed PDB records; entries of other record types are
            ignored.

    Returns:
        A dict mapping ``(record_name, initial_chain_id, terminal_chain_id)``
        to the list of matching records.
    """
    grouped = {}
    for record in pdb_records:
        if record.get('record_name') not in ('HELIX', 'SHEET', 'TURN'):
            continue
        key = (record['record_name'], record['initial_chain_id'], record['terminal_chain_id'])
        grouped.setdefault(key, []).append(record)
    return grouped

In [15]:
def get_sequence_secondary_structure_segments(
    secondary_structure_map: Dict[Tuple[str, str, str], List[dict]],
):
    """Collect, per chain, the set of secondary-structure segments.

    Each segment is the tuple ``(structure_type, id, initial_sequence_number,
    terminal_sequence_number)``. A group whose initial and terminal chain ids
    differ is rejected with ``Exception('Unsupported')``.
    """
    segments_by_chain = dict()
    for group_key, group_records in secondary_structure_map.items():
        kind, first_chain, last_chain = group_key
        if first_chain != last_chain:
            raise Exception('Unsupported')
        # A chain gets an entry even when its group holds no records.
        chain_segments = segments_by_chain.setdefault(first_chain, set())
        chain_segments.update(
            (
                kind,
                record['id'],
                record['initial_sequence_number'],
                record['terminal_sequence_number'],
            )
            for record in group_records
        )
    return segments_by_chain

In [16]:
def get_secondary_structure_masks(
    secondary_structure_segments: Dict[str, Set[Tuple[str, str, int, int]]],
    chain_sequences: Dict[str, Tuple[str, int]],
):
    """Build a per-residue secondary-structure mask for every chain.

    The mask starts as all 'C'. Each segment then overwrites its residues
    with the first letter of its structure type (e.g. 'H' for HELIX, 'S' for
    SHEET). Residue numbers are treated as 1-based positions in the chain
    sequence.

    The mask always has exactly the length of the sequence: segment bounds
    are clamped to the sequence, so residue numbers below 1 or beyond the
    sequence end can no longer lengthen or shorten it. Segments are applied
    in sorted order so overlapping segments give the same mask on every run
    (on overlap, the segment that starts later wins).

    Args:
        secondary_structure_segments: chain id -> set of
            ``(structure_type, segment_id, seq_start, seq_end)`` tuples.
        chain_sequences: chain id -> ``(sequence, number_of_residues)``.

    Returns:
        A dict mapping chain id to its mask string.

    Raises:
        KeyError: If a chain with segments is missing from ``chain_sequences``.
    """
    masks = dict()
    for chain_id, segments in secondary_structure_segments.items():
        sequence, _ = chain_sequences[chain_id]
        mask = ['C'] * len(sequence)
        ordered = sorted(segments, key=lambda seg: (seg[2], seg[3], seg[0], seg[1]))
        for structure_type, _segment_id, seq_start, seq_end in ordered:
            first = max(seq_start - 1, 0)
            last = min(seq_end, len(mask))
            if last > first:
                mask[first:last] = structure_type[0] * (last - first)
        masks[chain_id] = ''.join(mask)
    return masks

In [17]:
def write_fasta(
    residue_id: str,
    chain_sequences: Dict[str, Tuple[str, int]],
    prefix: str = None,
    line_width: int = 60,
):
    """Write one FASTA file per chain.

    Each file is named ``<prefix><residue_id>-<chain_id>.fasta`` and holds a
    ``>residue_id:chain_id`` header followed by the sequence wrapped at
    ``line_width`` characters per line.
    """
    path_prefix = prefix if prefix else ""
    for chain_id, chain_entry in chain_sequences.items():
        sequence, _ = chain_entry
        # Build the whole content first, so nothing is created on disk if
        # wrapping fails.
        content = [f'>{residue_id}:{chain_id}\n']
        for offset in range(0, len(sequence), line_width):
            content.append(sequence[offset:offset + line_width] + '\n')
        target = path_prefix + residue_id + '-' + chain_id + '.fasta'
        with open(target, 'w') as fasta_file:
            fasta_file.writelines(content)

In [18]:
def write_ss_file(
    residue_id: str,
    secondary_structure_segments: Dict[str, Set[Tuple[str, str, int, int]]],
    chain_sequences: Dict[str, Tuple[str, int]],
    prefix: str = None,
):
    """Write one ``.ss`` file per chain that has secondary-structure segments.

    Each file is named ``<prefix><residue_id>-<chain_id>.ss`` and holds four
    lines per segment: segment id, ``start:end``, the covered part of the
    sequence, and the structure type.

    Segments are written sorted by start, end, type and id. They arrive as a
    set, whose iteration order is not stable between runs, so sorting keeps
    the file content reproducible.

    Args:
        residue_id: Identifier used in the file name.
        secondary_structure_segments: chain id -> set of
            ``(structure_type, segment_id, seq_start, seq_end)`` tuples.
        chain_sequences: chain id -> ``(sequence, number_of_residues)``.
        prefix: Optional path prefix prepended verbatim to the file name.

    Raises:
        KeyError: If a chain with segments is missing from ``chain_sequences``.
    """
    prefix = prefix or ""
    for chain_id, segments in secondary_structure_segments.items():
        sequence, _ = chain_sequences[chain_id]
        lines = []
        ordered = sorted(segments, key=lambda seg: (seg[2], seg[3], seg[0], seg[1]))
        for structure_type, segment_id, seq_start, seq_end in ordered:
            # Clamp at 0 so a non-positive residue number cannot turn into a
            # negative index that slices from the end of the sequence.
            covered = sequence[max(seq_start - 1, 0):max(seq_end, 0)]
            lines.append(f"{segment_id}\n")
            lines.append(f"{seq_start}:{seq_end}\n")
            lines.append(f"{covered}\n")
            lines.append(f"{structure_type}\n")
        with open(prefix + residue_id + '-' + chain_id + ".ss", 'w') as f:
            f.writelines(lines)

In [19]:
def read_ss_file(
    residue_id: str,
    chain_id: str,
    prefix: str = None,
):
    """Read ``<prefix><residue_id>-<chain_id>.ss`` back into a set of segments.

    The file is expected to hold four lines per segment (id, ``start:end``,
    sequence part, structure type). Each segment is returned as
    ``(structure_type, id, start, end)``. A line count that is not a multiple
    of four raises ``Exception('Invalid ss file')``.
    """
    path = (prefix or "") + residue_id + '-' + chain_id + ".ss"
    with open(path, 'r') as ss_file:
        stripped = [raw.strip() for raw in ss_file.readlines()]
    if len(stripped) % 4:
        raise Exception('Invalid ss file')

    parsed = set()
    for base in range(0, len(stripped), 4):
        segment_id = stripped[base]
        bounds = stripped[base + 1].split(':')
        structure_type = stripped[base + 3]
        # The third line of each group (the sequence part) is not needed.
        parsed.add((structure_type, segment_id, int(bounds[0]), int(bounds[1])))
    return parsed

In [20]:
def count_with_overlap(string: str, substring: str):
    """Count occurrences of ``substring`` in ``string``, overlaps included.

    Unlike ``str.count``, matches may overlap: 'aa' occurs twice in 'aaa'.
    """
    # Every start position is tested on its own, which is what lets
    # overlapping matches be counted.
    return sum(
        1
        for position in range(len(string))
        if string.startswith(substring, position)
    )

## Debugging the functions above

In [21]:
# pdb_path = 'pdbs/3JSN.pdb'

In [22]:
# pdb_records = parse_single_chain_pdb_file(pdb_path)
# pdb_records

In [23]:
# chain_sequences = extract_sequence_from_pdb_records(pdb_records)
# chain_sequences

In [24]:
# secondary_structure_map = get_secondary_structure_map(pdb_records)
# secondary_structure_map

In [25]:
# _segments = get_sequence_secondary_structure_segments(secondary_structure_map)
# _segments['A']

In [26]:
# masks = get_secondary_structure_masks(_segments, chain_sequences)
# masks['A']

## Main Application

In [27]:
import pandas as pd

In [28]:
# One-letter labels that appear in the masks built by
# get_secondary_structure_masks: 'H' and 'S' are the first letters of the
# HELIX and SHEET record names, 'C' is the default fill for every other residue.
SECONDARY_STRUCTURES = ['H', 'S', 'C']

In [29]:
# Build one summary row per "<protein>:<chain>" and export the FASTA / .ss files.
proteins = read_proteins_file(proteins_file)
summary = dict()

for protein in proteins:
    if not protein.strip():
        continue
    protein_name = protein.upper()

    # os.path.join gives the right path whether or not PDB_DIR ends with a
    # separator (plain concatenation needed the trailing slash).
    pdb_path = os.path.join(pdb_dir, f'{protein_name}.pdb')

    pdb_records = parse_single_chain_pdb_file(pdb_path)  # list of dicts, each dict represents a parsed line in pdb file
    chain_sequences = extract_sequence_from_pdb_records(pdb_records)  # chain_id: (sequence, length)

    for chain_id, (sequence, length) in chain_sequences.items():
        # Start every known amino acid at 0 so all rows share the same AA_* columns.
        residue_counts = {letter: 0 for letter in AMINO_ACIDS.values()}
        residue_counts.update(Counter(sequence))

        summary[f'{protein}:{chain_id}'] = {
            'protein': protein,
            'chain': chain_id,
            'length': length,
            'sequence': sequence,
            **{f'AA_{letter}_frequency': count for letter, count in residue_counts.items()},
        }

    secondary_structure_map = get_secondary_structure_map(pdb_records)  # {(struct_type(HELIX,SHEET,etc),start_chain,end_chain):[...records]}
    sequence_ss_segments = get_sequence_secondary_structure_segments(secondary_structure_map)  # chain_id: {(struct_type, id, start, end)}
    chain_masks = get_secondary_structure_masks(sequence_ss_segments, chain_sequences)  # chain_id: mask

    for chain_id, mask in chain_masks.items():
        row = summary[f'{protein}:{chain_id}']
        row['ss_mask'] = mask

        # (amino acid, structure letter) co-occurrence counts; zip stops at the
        # shorter of sequence and mask.
        pair_counts = Counter(zip(row['sequence'], mask))
        row.update({
            'AASS_' + ''.join(pair) + '_frequency': count
            for pair, count in pair_counts.items()
        })

        # Counts of adjacent structure-letter pairs in the mask (overlapping).
        row.update({
            "SS_" + "".join(pair) + "_frequency": count_with_overlap(mask, "".join(pair))
            for pair in itertools.product(SECONDARY_STRUCTURES, repeat=2)
        })

    write_fasta(protein_name, chain_sequences, (fasta_dir.rstrip('/') + '/'))
    write_ss_file(protein_name, sequence_ss_segments, chain_sequences, (secondary_structure_dir.rstrip('/') + '/'))

In [30]:
# Show the ordered structure-letter pairs behind the SS_<pair>_frequency columns.
list(itertools.product(SECONDARY_STRUCTURES, repeat=2))

[('H', 'H'),
 ('H', 'S'),
 ('H', 'C'),
 ('S', 'H'),
 ('S', 'S'),
 ('S', 'C'),
 ('C', 'H'),
 ('C', 'S'),
 ('C', 'C')]

In [31]:
# One row per "<protein>:<chain>" key of `summary`. Columns missing from a row
# (for example an AASS_* pair that never occurs in that chain) become NaN and
# are filled with 0.
summary_df = pd.DataFrame.from_dict(summary, orient='index').fillna(0)
summary_df

Unnamed: 0,protein,chain,length,sequence,AA_A_frequency,AA_R_frequency,AA_N_frequency,AA_D_frequency,AA_C_frequency,AA_Q_frequency,...,SS_SC_frequency,SS_CH_frequency,SS_CS_frequency,SS_CC_frequency,AASS_HS_frequency,AASS_CC_frequency,AASS_WS_frequency,AASS_XC_frequency,AASS_CH_frequency,AASS_XH_frequency
3JSN:A,3JSN,A,318,MADLSSRVNELHDLLNQYSYEYYVEDNPSVPDSEYDKLLHELIKIE...,18,18,21,26,1,12,...,13,11,12,86,0.0,0.0,0.0,0.0,0.0,0.0
101M:A,101M,A,154,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,17,4,2,6,0,5,...,0,6,0,27,0.0,0.0,0.0,0.0,0.0,0.0
102L:A,102L,A,165,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...,17,13,12,10,0,5,...,3,6,3,17,1.0,0.0,0.0,0.0,0.0,0.0
102M:A,102M,A,154,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,18,4,2,6,0,5,...,0,6,0,27,0.0,0.0,0.0,0.0,0.0,0.0
103L:A,103L,A,167,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...,16,13,12,11,0,5,...,3,6,3,22,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199L:A,199L,A,164,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,16,13,12,10,0,5,...,2,10,2,37,1.0,0.0,0.0,0.0,0.0,0.0
1A06:A,1A06,A,332,MPGAVEGPRWKQAEDIRDIYDFRDVLGTGAFSEVILAEDKRTQKLV...,26,12,7,28,5,13,...,7,14,5,165,1.0,1.0,0.0,0.0,3.0,0.0
1A0B:A,1A0B,A,125,TTEENSKSEALLDIPMLEQYLELVGPKLITDGLAVFEKMMPGYVSV...,9,2,3,6,0,7,...,0,1,0,124,0.0,0.0,0.0,0.0,0.0,0.0
1A0I:A,1A0I,A,348,VNIKTNPFKAVSFVESAIKKALDNAGYLIAEIKYDGVRGNICVDNT...,18,13,17,21,6,10,...,13,7,15,155,1.0,4.0,4.0,0.0,0.0,0.0


In [32]:
# Write overall statistics to summary.txt.
# len(summary_df) is the number of rows, i.e. one per "<protein>:<chain>" key.
with open('summary.txt', 'w') as f:
    f.writelines([
        f'Total proccessed PDB files: {len(summary_df)}\n',
        f'Proccessed PDB codes: [{", ".join(proteins)}]\n',
        f'mean length of residues: {round(summary_df.length.mean(), 4)}\n',
        f'stddev length of residues: {round(summary_df.length.std(), 4)}\n',
    ])

In [33]:
# Show the contents of summary.txt.
!cat summary.txt

Total proccessed PDB files: 101
Proccessed PDB codes: [3JSN, 101M, 102L, 102M, 103L, 103M, 104M, 105M, 106M, 107L, 107M, 108L, 108M, 109L, 109M, 110L, 110M, 111L, 111M, 112L, 112M, 113L, 114L, 115L, 118L, 119L, 120L, 121P, 122L, 123L, 125L, 126L, 127L, 128L, 129L, 12CA, 130L, 131L, 132L, 133L, 134L, 135L, 138L, 139L, 140L, 141L, 142L, 143L, 144L, 145L, 146L, 147L, 149L, 151L, 152L, 153L, 155C, 155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L, 163L, 164L, 165L, 166L, 16PK, 16VP, 170L, 171L, 172L, 173L, 177L, 178L, 181L, 182L, 183L, 184L, 185L, 186L, 187L, 188L, 189L, 190L, 1914, 191L, 192L, 193L, 194L, 195L, 196L, 197L, 198L, 199L, 1A06, 1A0B, 1A0I, 1A0K]
mean length of residues: 171.1188
stddev length of residues: 45.9002


In [34]:
# Write, for every chain, its key, its secondary-structure mask and its sequence.
with open('AA-SS.txt', 'w') as f:
    for item_id, item_summary in summary.items():
        _sequence = item_summary["sequence"]
        # 'ss_mask' is only set for chains that have HELIX/SHEET/TURN records.
        # Fall back to an all-coil mask instead of raising KeyError.
        _mask = item_summary.get("ss_mask", "C" * len(_sequence))
        f.write(f'{item_id}\n{_mask}\n{_sequence}\n\n')

In [35]:
# Show the contents of AA-SS.txt.
!cat AA-SS.txt

3JSN:A
CCHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHCCCHHHHHHHCCCCCCCCSSSCCCCCCCCCSSCHHHHHHHHHHHHHHHHHCCCSSSSSSSCCSSSSSSSSCCSSSSSSSCCCCCSSSCHHHHHHCCCCCCCCCCCCCCSSSSSSSSCHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHCCHHHHHHHCCCSSSSSSSCCCCCCCCCHHHHHHHHHHHHHCCCCCCCSSSCHHHHHHHHHHHHHHHCCCCCCCSSSSSSSSCHHHHHHHHHSSCCSSCCSSSSSCCCCCCCCCCC
MADLSSRVNELHDLLNQYSYEYYVEDNPSVPDSEYDKLLHELIKIEEEHPEYKTVDSPTVRVGGEAQASFNKVNHDTPMLSLGNAFNEDDLRKFDQRIREQIGNVEYMCELKIDGLAVSLKYVDGYFVQGLTRGDGTTGEDITENLKTIHAIPLKMKEPLNVEVRGEAYMPRRSFLRLNEEKEKNDEQLFANPRNAAAGSLRQLDSKLTAKRKLSVFIYSVNDFTDFNARSQSEALDELDKLGFTTNKNRARVNNIDGVLEYIEKWTSQRESLPYDIDGIVIKVNDLDQQDEMGFTQKSPRWAIAYKFPAEEHHHHHH

101M:A
CCHHHHHHHHHHHHHHHHCHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCC
MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVKHLKTEAEMKASEDLKKHGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRHPGNFGADAQGAMNKALELFRKDIAAKYKELGYQG

102L:A
CHHHHHHHHHHHCSSSSSSC

In [36]:
# For each structure letter, total over all chains how often every amino acid
# carries that letter: stats[structure][amino_acid] -> count.
stats = {structure: dict() for structure in SECONDARY_STRUCTURES}
for amino_acid in AMINO_ACIDS.values():
    for secondary_structure in SECONDARY_STRUCTURES:
        column_name = f'AASS_{amino_acid}{secondary_structure}_frequency'
        # Look the column up by label: attribute access (hasattr/getattr) can
        # be shadowed by DataFrame attributes and methods.
        if column_name in summary_df.columns:
            total_count = summary_df[column_name].sum()
        else:
            # The pair never occurred in any chain, so no column exists for it.
            total_count = 0
        stats[secondary_structure][amino_acid] = total_count
stats

{'H': {'A': 1409.0,
  'R': 827.0,
  'N': 553.0,
  'D': 630.0,
  'C': 56.0,
  'Q': 450.0,
  'E': 741.0,
  'G': 374.0,
  'H': 110.0,
  'I': 603.0,
  'L': 1032.0,
  'K': 745.0,
  'M': 342.0,
  'F': 362.0,
  'P': 207.0,
  'O': 0,
  'S': 354.0,
  'U': 0,
  'T': 456.0,
  'W': 199.0,
  'Y': 208.0,
  'V': 730.0,
  'B': 0,
  'Z': 0,
  'X': 3.0,
  'J': 0},
 'S': {'A': 37.0,
  'R': 67.0,
  'N': 37.0,
  'D': 29.0,
  'C': 14.0,
  'Q': 9.0,
  'E': 20.0,
  'G': 86.0,
  'H': 82.0,
  'I': 164.0,
  'L': 236.0,
  'K': 133.0,
  'M': 13.0,
  'F': 26.0,
  'P': 8.0,
  'O': 0,
  'S': 21.0,
  'U': 0,
  'T': 164.0,
  'W': 13.0,
  'Y': 161.0,
  'V': 49.0,
  'B': 0,
  'Z': 0,
  'X': 0,
  'J': 0},
 'C': {'A': 223.0,
  'R': 285.0,
  'N': 458.0,
  'D': 366,
  'C': 47.0,
  'Q': 86.0,
  'E': 207.0,
  'G': 760,
  'H': 105.0,
  'I': 238.0,
  'L': 400.0,
  'K': 535.0,
  'M': 135.0,
  'F': 156.0,
  'P': 194.0,
  'O': 0,
  'S': 343.0,
  'U': 0,
  'T': 465,
  'W': 101.0,
  'Y': 225.0,
  'V': 178.0,
  'B': 0,
  'Z': 0,
  'X'

In [37]:
# Outer keys of `stats` (structure letters) become the columns and the inner
# keys (amino-acid letters) become the row index.
aa_ss_stats_df = pd.DataFrame(stats).fillna(0)
aa_ss_stats_df

Unnamed: 0,H,S,C
A,1409.0,37.0,223.0
R,827.0,67.0,285.0
N,553.0,37.0,458.0
D,630.0,29.0,366.0
C,56.0,14.0,47.0
Q,450.0,9.0,86.0
E,741.0,20.0,207.0
G,374.0,86.0,760.0
H,110.0,82.0,105.0
I,603.0,164.0,238.0


In [38]:
# Convert each amino acid's counts into percentages of that amino acid's total.
# Transposing turns every amino acid into a column, so apply() sees one amino
# acid at a time; an all-zero total yields 0 instead of dividing by zero.
# The result is transposed back. Note that this overwrites aa_ss_stats_df.
aa_ss_stats_df = aa_ss_stats_df.transpose().apply(lambda row: round((row / sum(row)) * 100, 2) if sum(row) != 0 else 0).transpose()

In [39]:
# Start the 'matrices' file with the amino-acid / structure percentage table
# ('w' truncates any existing file).
with open('matrices', 'w') as f:
    f.write(aa_ss_stats_df.to_string())

In [40]:
# Show the contents of the matrices file.
!cat matrices

       H      S      C
A  84.42   2.22  13.36
R  70.14   5.68  24.17
N  52.77   3.53  43.70
D  61.46   2.83  35.71
C  47.86  11.97  40.17
Q  82.57   1.65  15.78
E  76.55   2.07  21.38
G  30.66   7.05  62.30
H  37.04  27.61  35.35
I  60.00  16.32  23.68
L  61.87  14.15  23.98
K  52.72   9.41  37.86
M  69.80   2.65  27.55
F  66.54   4.78  28.68
P  50.61   1.96  47.43
O   0.00   0.00   0.00
S  49.30   2.92  47.77
U   0.00   0.00   0.00
T  42.03  15.12  42.86
W  63.58   4.15  32.27
Y  35.02  27.10  37.88
V  76.28   5.12  18.60
B   0.00   0.00   0.00
Z   0.00   0.00   0.00
X  15.79   0.00  84.21
J   0.00   0.00   0.00

In [41]:
# Sum every SS_<pair>_frequency column over all rows, keyed by the
# (first letter, second letter) pair.
ss_stats = {
    item: getattr(summary_df, "SS_" + "".join(item) + "_frequency").sum()
    for item in itertools.product(SECONDARY_STRUCTURES, repeat=2)
}
ss_stats

{('H', 'H'): 9587,
 ('H', 'S'): 3,
 ('H', 'C'): 898,
 ('S', 'H'): 8,
 ('S', 'S'): 1231,
 ('S', 'C'): 304,
 ('C', 'H'): 894,
 ('C', 'S'): 309,
 ('C', 'C'): 7005}

In [42]:
# Reshape {(first, second): count} into {first: {second: count}}.
# groupby only merges consecutive items; that is sufficient here because the
# keys were generated by itertools.product, which keeps equal first letters
# adjacent. Note that this rebinds ss_stats, so the cell cannot be re-run
# without re-running the previous one.
ss_stats = {
    group_key: {
        _inner_item[0][1]: _inner_item[1]
        for _inner_item in list(group_value)
    }
    for group_key, group_value in itertools.groupby(
        ss_stats.items(),
        lambda item: item[0][0]
    )
}
ss_stats

{'H': {'H': 9587, 'S': 3, 'C': 898},
 'S': {'H': 8, 'S': 1231, 'C': 304},
 'C': {'H': 894, 'S': 309, 'C': 7005}}

In [43]:
# Outer keys (first letter of a pair) become the columns and inner keys
# (second letter) become the row index, so the cell at row r, column c holds
# the count for the pair (c, r).
ss_stats_df = pd.DataFrame(ss_stats)
ss_stats_df

Unnamed: 0,H,S,C
H,9587,8,894
S,3,1231,309
C,898,304,7005


In [44]:
# Append the structure-pair count table to the 'matrices' file, separated from
# the existing content by a blank line. Re-running this cell appends again.
with open('matrices', 'a') as f:
    f.write('\n\n' + ss_stats_df.to_string())

In [45]:
# Show the matrices file again, now with both tables.
!cat matrices

       H      S      C
A  84.42   2.22  13.36
R  70.14   5.68  24.17
N  52.77   3.53  43.70
D  61.46   2.83  35.71
C  47.86  11.97  40.17
Q  82.57   1.65  15.78
E  76.55   2.07  21.38
G  30.66   7.05  62.30
H  37.04  27.61  35.35
I  60.00  16.32  23.68
L  61.87  14.15  23.98
K  52.72   9.41  37.86
M  69.80   2.65  27.55
F  66.54   4.78  28.68
P  50.61   1.96  47.43
O   0.00   0.00   0.00
S  49.30   2.92  47.77
U   0.00   0.00   0.00
T  42.03  15.12  42.86
W  63.58   4.15  32.27
Y  35.02  27.10  37.88
V  76.28   5.12  18.60
B   0.00   0.00   0.00
Z   0.00   0.00   0.00
X  15.79   0.00  84.21
J   0.00   0.00   0.00

      H     S     C
H  9587     8   894
S     3  1231   309
C   898   304  7005