# Visualizing CASP Data

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
def make_training_example(lines):
    """
    Takes lines read from file and created a training example.
    """
    example = {}
    
    example['id'] = lines[1].rstrip()
    example['primary'] = lines[3].rstrip()
    
    pssm = []
    for i in range(5, 26):
        pssm.append([float(x) for x in lines[i].split()])
    example['pssm'] = pssm
    
    tertiary = []
    for i in range(27, 30):
        tertiary.append([float(x) for x in lines[i].split()])
    example['tertiary'] = tertiary
    
    example['mask'] = lines[31].rstrip()
    
    return example

In [3]:
def get_tertiary_matrix(tertiary, i):
    x = tertiary[0][3 * i], tertiary[0][3 * i + 1], tertiary[0][3 * i + 2]
    y = tertiary[1][3 * i], tertiary[1][3 * i + 1], tertiary[1][3 * i + 2]
    z = tertiary[2][3 * i], tertiary[2][3 * i + 1], tertiary[2][3 * i + 2]
    return x, y, z

In [11]:
with open('data/casp11/training_100', 'r') as raw_data:
    lines = []
    flag = False
    for line in raw_data:
        lines.append(line)
        if len(lines) == 33:
            example = make_training_example(lines)
            lines = []
            break

In [24]:
print(len(example['primary']))

119


In [23]:
print(example['id'])

2B6G_2_A


In [32]:
dssp_file = example['id'][:4].lower() + '.dssp'
with open('data/dssp/' + dssp_file, 'r') as raw_dssp:
    secondary = []
    flag = False
    for line in raw_dssp:
        if flag or ('#' in line):
            if not flag:
                flag = True
                continue
        if flag:
            if line[7:10] == '   ':
                continue
            p, s = line[13], line[16]
            if s == ' ':
                s = 'C'
            secondary.append([p, s])

In [33]:
print(secondary)

[['N', 'C'], ['P', 'H'], ['K', 'H'], ['S', 'H'], ['L', 'H'], ['T', 'S'], ['D', 'C'], ['P', 'H'], ['K', 'H'], ['L', 'H'], ['L', 'H'], ['K', 'H'], ['N', 'C'], ['I', 'H'], ['P', 'H'], ['M', 'H'], ['W', 'H'], ['L', 'H'], ['K', 'H'], ['S', 'H'], ['L', 'H'], ['R', 'T'], ['L', 'C'], ['H', 'H'], ['K', 'H'], ['Y', 'H'], ['S', 'H'], ['D', 'H'], ['A', 'H'], ['L', 'H'], ['S', 'T'], ['G', 'T'], ['T', 'S'], ['P', 'C'], ['W', 'H'], ['I', 'H'], ['E', 'H'], ['L', 'H'], ['I', 'T'], ['Y', 'T'], ['L', 'C'], ['D', 'C'], ['D', 'H'], ['E', 'H'], ['T', 'H'], ['L', 'H'], ['E', 'H'], ['K', 'H'], ['K', 'H'], ['G', 'S'], ['V', 'C'], ['L', 'C'], ['A', 'C'], ['L', 'H'], ['G', 'H'], ['A', 'H'], ['R', 'H'], ['R', 'H'], ['K', 'H'], ['L', 'H'], ['L', 'H'], ['K', 'H'], ['A', 'H'], ['F', 'H'], ['G', 'H'], ['I', 'H'], ['V', 'H'], ['I', 'H'], ['D', 'H'], ['Y', 'H'], ['K', 'H'], ['E', 'H'], ['R', 'H'], ['D', 'T'], ['L', 'C'], ['I', 'S'], ['D', 'C'], ['R', 'G'], ['S', 'G'], ['A', 'G'], ['Y', 'C']]


In [34]:
print(example['primary'])

GSNVFNNTITHPNAGPTSATSTSTSSNGNTPLSSNSSMNPKSLTDPKLLKNIPMWLKSLRLHKYSDALSGTPWIELIYLDDETLEKKGVLALGARRKLLKAFGIVIDYKERDLIDRSAY


In [31]:
print(len(example['primary']), len(example['secondary']))

119 81


In [35]:
print(example['mask'])

--------------------------------------+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


In [30]:
with open('data/processed/test.txt', 'w') as out_file:
    out_file.write('id')
    out_file.write(example['id'] + '\n')
    out_file.write('primary')
    out_file.write(example['primary'] + '\n')
    out_file.write('pssm')
    out_file.write(example['pssm'] + '\n') # Needs to go from list of lists of floats to multiline string
    out_file.write('secondary')
    out_file.write(example['secondary'] + '\n')
    out_file.write('tertiary')
    out_file.write(example['tertiary'] + '\n') # Needs to go from list of lists of floats to multiline string
    out_file.write('mask')
    out_file.write(example['mask'] + '\n')

TypeError: can only concatenate list (not "str") to list