# Extract secondary structure from PDB files easily and generate spreadsheet

<a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons Licence" style="border-width:0" src="https://licensebuttons.net/l/by-sa/4.0/88x31.png" title='This work is licensed under a Creative Commons Attribution 4.0 International License.' align="right"/></a>

Author: Dr Antonia Mey   
Email: antonia.mey@ed.ac.uk

In [1]:
# Imports
import pandas as pd
import numpy as np
import glob 
import os
import urllib
from collections import Counter
import urllib
import json

In [5]:
def get_alpha_helix_length_and_location(secondary_struc, min_length=8):
    helix_regions = []
    counter = 0
    curr_helix = []
    indexes = [i for i, x in enumerate(list(secondary_struc)) if x == 'H']
    for i in range(len(indexes)-1):
        difference = indexes[i+1]-indexes[i]
        if difference == 1:
            curr_helix.append(indexes[i])
            if i == len(indexes)-2:
                if len(curr_helix)>=min_length-1:
                    curr_helix.append(indexes[i+1])
                    helix_regions.append(curr_helix)
        else:
            curr_helix.append(indexes[i])
            if len(curr_helix)>=min_length-1:
                helix_regions.append(curr_helix)
            curr_helix = []
    return helix_regions

In [6]:
def count_serines(sequence):
    n_serine = sequence.count('S')
    if n_serine >=2:
        return 1
    else:
        return 0

In [7]:
def filter_helices(helix_index_list, known_things, sequence_length, discard_list= ['bHLH', 'Leucine-zipper'],overlap_threshold =4):
    '''
    Parameters:
    -----------
    helix_index_list : list
        2D list containing arrays of indexes where alpha helixes are
    known_things : dictionary
        extracted information from uniprot IDs around regions, domains and motives
    discard_list : list of Strings
        list that contains the identifiers to discard
    
    Returns:
    --------
    helix_index_list : list
        filtered list with correct helix indexes without the overlap
    Algorithm description:
    
    - generate a boolean array of all False of length of the sequence
    - for each annotation we want to check:
    - Add True between start and end of domains/regions we want to check in the boolean array
    - Then loop over helix list creating a boolean array of length sequence for each Helix section
    - Use logic and to compare regions boolean array with helix boolean array. 
    - If the number of over lap of Trues is larger than theshold x, remove this helix chunch from list and don't write to spreadsheet.
    '''
    
    bool_array =  np.zeros(sequence_length, dtype=bool)
    # alpha_helix_index_list = get_alpha_helix_length_and_location(sec_struc_output, min_length=7)
    helix_list_remove_index = []
    for key in known_things.keys():
        for entry in known_things[key]:
            bool_array =  np.zeros(sequence_length, dtype=bool)
            if entry[-1] in discard_list:
                bool_array[entry[0]-1:entry[1]] = True
            index = 0
            for a in helix_index_list:
                curr_helix =  np.zeros(sequence_length, dtype=bool)
                curr_helix[a] = True
                overalp = (np.sum(np.logical_and(bool_array, curr_helix)))
                if overalp>overlap_threshold:
                    helix_list_remove_index.append(index)
                index = index+1
    unique_idx = np.unique(helix_list_remove_index)
    for index in sorted(unique_idx, reverse=True):
        del helix_index_list[index]
    return helix_index_list

In [70]:
def annotating_helices(helix_index_list, known_things, sequence_length,overlap_threshold =4):
    '''
    Parameters:
    -----------
    helix_index_list : list
        2D list containing arrays of indexes where alpha helixes are
    known_things : dictionary
        extracted information from uniprot IDs around regions, domains and motives
    discard_list : list of Strings
        list that contains the identifiers to discard
    
    Returns:
    --------
    helix_index_list : list
        filtered list with correct helix indexes without the overlap
    Algorithm description:
    
    - generate a boolean array of all False of length of the sequence
    - for each annotation we want to check:
    - Add True between start and end of domains/regions we want to check in the boolean array
    - Then loop over helix list creating a boolean array of length sequence for each Helix section
    - Use logic and to compare regions boolean array with helix boolean array. 
    - If the number of over lap of Trues is larger than theshold x, remove this helix chunch from list and don't write to spreadsheet.
    '''
    annotations = [""]*len(helix_index_list)
    for key in known_things.keys():
        for entry in known_things[key]:
            # print(f'checking entry {entry}')
            bool_array =  np.zeros(sequence_length, dtype=bool)
            if entry[0] is None:
                continue
            elif entry[1] is None:
                continue
            bool_array[entry[0]-1:entry[1]] = True
            index = 0
            
            for a in helix_index_list:
                curr_helix =  np.zeros(sequence_length, dtype=bool)
                curr_helix[a] = True
                overalp = (np.sum(np.logical_and(bool_array, curr_helix)))
                if overalp>overlap_threshold:
                    # print(f'annotation found {entry[-1]}')
                    annotations[index] = annotations[index] +key+':'+entry[-1]+";"
                index = index+1
    return annotations

In [9]:
def get_domainn_region_info(data):
    known_things = {}
    known_things['Domain'] = []
    known_things['Region'] = []
    known_things['Motif'] = []
    #known_things['Helix'] = []
    if 'features' in data.keys():
        for d in data['features']:
            #print(d['type'])
            if d['type'] == 'Domain':
                known_things['Domain'].append([d['location']['start']['value'], d['location']['end']['value'], d['description']])
            elif d['type'] == 'Region':
                known_things['Region'].append([d['location']['start']['value'], d['location']['end']['value'], d['description']])
            elif d['type'] == 'Motif':
                known_things['Motif'].append([d['location']['start']['value'], d['location']['end']['value'], d['description']])
            #elif d['type'] == 'Helix':
            #    known_things['Helix'].append([d['location']['start']['value'], d['location']['end']['value'], d['description']])

            
    return known_things       

In [59]:
def generate_table_rows_for_uniprot_id(uni_prot_id, df, work_dir = 'temp', verbose = False):
    # print('Working on uniprot ID', uni_prot_id)
    sec_struc_output = np.load('dssp_info/'+uni_prot_id+'.npy')
    # print('got secondary structure')
    #print(sec_struc_output)
    
    # Process uniprot file
    f = open('unique_ids/'+uni_prot_id+'.json')
    data = json.load(f)
    f.close()
    
    # print('read json file successfully')
    
    # Some basics:
    # checking keys
    keys = data.keys()
    
    # Primary Accession
    uni_id = 'NA'
    if 'primaryAccession' in keys:
        uni_id = data['primaryAccession'] 
        if verbose:
             print('uni_id',uni_id)
    else: 
        print('Issue: No primary Accession found')
    
    # Genes
    gene_name = 'NA'
    if 'genes' in keys:
        gene_name = data['genes'][0]['geneName']['value']
        if verbose:
            print('gene_name',gene_name)
    else:
        print('Issue: No gene found found')
    
    # getting the nucleotide sequenceID
    ref_seq_id = 'NA'
    if 'uniProtKBCrossReferences' in keys:
        for db in data['uniProtKBCrossReferences']:
            if db['database'] == "RefSeq":
                if '.' in db['properties'][0]['value']:
                    ref_seq_id = db['properties'][0]['value'].split('.')[0]
                else:
                    ref_seq_id = db['properties'][0]['value']
    if verbose:
        print('ref_seq_id',ref_seq_id)
    
    # getting sequence info
    sequence = 'NA'
    seq_length = 0
    if 'sequence' in keys:
        sequence = data['sequence']['value']
        if verbose:
            print('sequence',sequence)
    
        seq_length = data['sequence']['length']

    if seq_length == len(sec_struc_output):

        

        # Now lets get the helices:
        alpha_helix_index_list = get_alpha_helix_length_and_location(sec_struc_output, min_length=8)

        # Get the information of what domains and regions exist in the uniport id file:
        known_things = get_domainn_region_info(data)

        discard_list = ['PUM-HD', 'HEAT', 'ARM', 'bHLH', 'Leucine-zipper']


        # Missing: Filter the alpha_helix_index_list
        filtered_helix_list=filter_helices(alpha_helix_index_list, known_things, seq_length, discard_list=discard_list,overlap_threshold =4)

        # annotate helices:
        annotations = annotating_helices(filtered_helix_list, known_things, seq_length,overlap_threshold =4)

        for i in range(len(filtered_helix_list)):
            firstAA_position_in_HELIDR = filtered_helix_list[i][0]+1
            lastAA_position_in_HELIDR = filtered_helix_list[i][-1]+1
            HELIDR_seq = sequence[filtered_helix_list[i][0]:filtered_helix_list[i][-1]+1]
            down_stream_seq = ''
            up_stream_seq = ''
            if filtered_helix_list[i][0]-10 >= 0 and filtered_helix_list[i][-1]+11 < len(sequence):
                # Note the +2 here does not include the last helix AA. 
                # We have to make sure here that we check the arrays are not out of bounds!
                down_stream_seq = sequence[filtered_helix_list[i][-1]+1:filtered_helix_list[i][-1]+11]
                up_stream_seq = sequence[filtered_helix_list[i][0]-10:filtered_helix_list[i][0]]
            elif alpha_helix_index_list[i][0]-10 <= 0:
                # Do we want a shorter version? 
                down_stream_seq = sequence[filtered_helix_list[i][-1]+1:filtered_helix_list[i][-1]+11]
                up_stream_seq = sequence[0:filtered_helix_list[i][0]]
            elif alpha_helix_index_list[i][-1]+11 > len(sequence):
                down_stream_seq = sequence[filtered_helix_list[i][-1]+2:len(sequence)+1]
                up_stream_seq = sequence[filtered_helix_list[i][0]-10:filtered_helix_list[i][0]]

            # This may need fixing if we have shorter upstream and downstream strings
            if len(down_stream_seq)==10 and len(up_stream_seq)==10: 
                Two_S5P_down = count_serines(down_stream_seq[:5])
                Two_S5P_up = count_serines(up_stream_seq[5:])
                # NOW we assemble the row:
                new_row = [uni_id,gene_name,ref_seq_id,firstAA_position_in_HELIDR,lastAA_position_in_HELIDR,up_stream_seq,HELIDR_seq,
                          down_stream_seq,Two_S5P_up,Two_S5P_down,'','','','','','','','','',annotations[i]]
                df.loc[len(df)] = new_row
            else:
                # We may want to revisit this continue here
                continue
        return df
            
    else:
        print(f'sequence length from structure is {seq_length}, from alpha fold {len(sec_struc_output)}')
        print("there is an incompatibility between the sequence and alpha fold structure")
        print(f'the current uniprotid with the issue is: {uni_prot_id}')
        return df

In [76]:
        
df = pd.DataFrame(columns=['uniprot_id','gene_name','refseq_id','firstAA_position_in_HELIDR','lastAA_position_in_HELIDR','HELIDR_upstream_seq'
                           ,'HELIDR_seq', 'HELIDR_downstream_seq', '2S5P_up', '2S5P_down', '2S5P1_up','2S5P1_down','2S5P1_helix','HEK293T_expressed','NonTMD[3]_TMD[2]_SEC[1]',
                           'Non_TMD_classification','4 compartments','TG_CY','TG_SR_nonS', 'annotation'])


In [69]:

#uniprot_ids = ['B7HIJ2', 'F5WVJ2', 'C4LF00', 'O68891']
uniprot_ids= ['P35573']
for ids in uniprot_ids:
    df = generate_table_rows_for_uniprot_id(ids,df)

checking entry [1, None, '4-alpha-glucanotransferase']
checking entry [None, 1532, 'Amylo-1,6-glucosidase']


In [39]:
df

Unnamed: 0,uniprot_id,gene_name,refseq_id,firstAA_position_in_HELIDR,lastAA_position_in_HELIDR,HELIDR_upstream_seq,HELIDR_seq,HELIDR_downstream_seq,2S5P_up,2S5P_down,2S5P1_up,2S5P1_down,2S5P1_helix,HEK293T_expressed,NonTMD[3]_TMD[2]_SEC[1],Non_TMD_classification,4 compartments,TG_CY,TG_SR_nonS,annotation
0,Q96PB7,OLFM3,XM_017000240,81,133,APEQNLCSRD,AKSRQLRQLLEKVQNMSQSIEVLNLRTQRDFQYVLKMETQMKGLKA...,DRKTLMTKHF,0,0,,,,,,,,,,
1,Q96PB7,OLFM3,XM_017000240,140,151,QIEDDRKTLM,TKHFQELKEKMD,ELLPLIPVLE,0,0,,,,,,,,,,
2,Q96PB7,OLFM3,XM_017000240,156,192,LKEKMDELLP,LIPVLEQYKTDAKLITQFKEEIRNLSAVLTGIQEEIG,AYDYEELHQR,0,0,,,,,,,,,,
3,Q96PB7,OLFM3,XM_017000240,196,216,GIQEEIGAYD,YEELHQRVLSLETRLRDCMKK,LTCGKLMKIT,0,0,,,,,,,,,,
4,O14543,SOCS3,NM_003955,28,44,TSLRLKTFSS,KSEYQLVVNAVRKLQES,GFYWSAVTGG,1,0,,,,,,,,,,Region:Kinase inhibitory region (KIR);Region:E...
5,O14543,SOCS3,NM_003955,53,61,ESGFYWSAVT,GGEANLLLS,AEPAGTFLIR,0,0,,,,,,,,,,Domain:SH2;
6,O14543,SOCS3,NM_003955,120,127,STQPVPRFDC,VLKLVHHY,MPPPGAPSFP,0,0,,,,,,,,,,Domain:SH2;
7,O14543,SOCS3,NM_003955,189,200,SRPLSSNVAT,LQHLCRKTVNGH,LDSYEKVTQL,0,0,,,,,,,,,,Domain:SOCS box;


In [15]:
df.to_csv('test.csv')

## Testing things out

In [72]:
f = open('unique_ids_from_spreadsheet.txt', 'r')
f_content = f.readlines()
f.close()

In [73]:
ids = []
for f in f_content:
    ids.append(f.strip())

In [77]:
counter = 0
for i in ids[8708:]:
    if counter%500 == 0:
        print(f'Working on ID: {i} this is entry {counter}/{len(ids)}')
    df = generate_table_rows_for_uniprot_id(i,df)
    counter = counter+1

Working on ID: Q9BQT9 this is entry 0/16236
sequence length from structure is 748, from alpha fold 808
there is an incompatibility between the sequence and alpha fold structure
the current uniprotid with the issue is: Q03518
sequence length from structure is 237, from alpha fold 421
there is an incompatibility between the sequence and alpha fold structure
the current uniprotid with the issue is: Q66PJ3
Working on ID: Q8N6T3 this is entry 500/16236
sequence length from structure is 416, from alpha fold 411
there is an incompatibility between the sequence and alpha fold structure
the current uniprotid with the issue is: Q8TAX9
sequence length from structure is 597, from alpha fold 598
there is an incompatibility between the sequence and alpha fold structure
the current uniprotid with the issue is: Q8N423
sequence length from structure is 785, from alpha fold 812
there is an incompatibility between the sequence and alpha fold structure
the current uniprotid with the issue is: O94901
Worki

In [78]:
df.to_csv('second_part.csv')

In [35]:
df

Unnamed: 0,uniprot_id,gene_name,refseq_id,firstAA_position_in_HELIDR,lastAA_position_in_HELIDR,HELIDR_upstream_seq,HELIDR_seq,HELIDR_downstream_seq,2S5P_up,2S5P_down,2S5P1_up,2S5P1_down,2S5P1_helix,HEK293T_expressed,NonTMD[3]_TMD[2]_SEC[1],Non_TMD_classification,4 compartments,TG_CY,TG_SR_nonS,annotation
0,O00305,CACNB4,NM_001320722,61,89,SADSYTSRPS,DSDVSLEEDREAIRQEREQQAAIQLERAK,SKPVAFAVKT,1,0,,,,,,,,,,Region:Disordered;
1,O00305,CACNB4,NM_001320722,154,169,EGCEIGFIPS,PLRLENIRIQQEQKRG,RFHGGKSSGN,0,0,,,,,,,,,,Domain:SH3;
2,O00305,CACNB4,NM_001320722,233,249,VLVGPSLKGY,EVTDMMQKALFDFLKHR,FDGRISITRV,0,0,,,,,,,,,,
3,O00305,CACNB4,NM_001320722,274,280,SLAKRSVLNN,PSKRAII,ERSNTRSSLA,0,0,,,,,,,,,,
4,O00305,CACNB4,NM_001320722,289,302,IIERSNTRSS,LAEVQSEIERIFEL,ARSLQLVVLD,1,0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8229,Q96PB7,OLFM3,XM_017000240,196,216,GIQEEIGAYD,YEELHQRVLSLETRLRDCMKK,LTCGKLMKIT,0,0,,,,,,,,,,
8230,O14543,SOCS3,NM_003955,28,44,TSLRLKTFSS,KSEYQLVVNAVRKLQES,GFYWSAVTGG,1,0,,,,,,,,,,Region:Kinase inhibitory region (KIR);Region:E...
8231,O14543,SOCS3,NM_003955,53,61,ESGFYWSAVT,GGEANLLLS,AEPAGTFLIR,0,0,,,,,,,,,,Domain:SH2;
8232,O14543,SOCS3,NM_003955,120,127,STQPVPRFDC,VLKLVHHY,MPPPGAPSFP,0,0,,,,,,,,,,Domain:SH2;


## Playground testing things out

In [84]:
uni_prot_id = 'Q9H4S2'
work_dir = 'temp'
print('Working on uniprot ID', uni_prot_id)
# Downloading uniprot file and alphafold file
download_uniprot_json_file(uni_prot_id, work_dir)
download_alpha_fold_pdbs([uni_prot_id], workdir =work_dir)
p = 'temp/AF-'+uni_prot_id+'-F1-model_v2.pdb'
sec_struc_output = get_sec_struct(p, '/Users/toni_brain/miniconda3/envs/dssp//bin/mkdssp')
print('got secondary structure')
print(sec_struc_output)

# Process uniprot file
f = open('temp/'+uni_prot_id+'.json')
data = json.load(f)
f.close()

print('read json file successfully')

Working on uniprot ID Q9H4S2
At entry 0/1
ID: Q9H4S2
got secondary structure
['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'T' 'T' '-' '-' 'T' 'T' 'S'
 '-' '-' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' 'T' 'T' 'S' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' '-' 'S' 'S' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'T' 'T' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' 'T' 'T' 'T' 'T' '-' '-' 'S' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-

In [85]:
sec_struc_output

array(['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', 'T', 'T', '-', '-', 'T',
       'T', 'S', '-', '-', '-', '-', 'H', 'H', 'H', 'H', 'H', 'H', 'T',
       'T', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 'T', 'T',
       'S', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', '-',
       'S', 'S', '-', '-', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H

In [86]:
# This is collecting known information

known_things = {}
known_things['Domain'] = []
known_things['Region'] = []
known_things['Motif'] = []
#known_things['Helix'] = []

for d in data['features']:
    #print(d['type'])
    if d['type'] == 'Domain':
        known_things['Domain'].append([d['location']['start']['value'], d['location']['end']['value'], d['description']])
    elif d['type'] == 'Region':
        known_things['Region'].append([d['location']['start']['value'], d['location']['end']['value'], d['description']])
    elif d['type'] == 'Motif':
        known_things['Motif'].append([d['location']['start']['value'], d['location']['end']['value'], d['description']])
    #elif d['type'] == 'Helix':
    #    known_things['Helix'].append([d['location']['start']['value'], d['location']['end']['value'], d['description']])

In [87]:
known_things

{'Domain': [],
 'Region': [[1, 20, 'SNAG domain'], [201, 264, 'Disordered']],
 'Motif': []}

In [88]:
for d in data['features']:
    if d['type']=='Domain':
        print (d)
    if d['type']=='Region':
        print (d)

{'type': 'Region', 'location': {'start': {'value': 1, 'modifier': 'EXACT'}, 'end': {'value': 20, 'modifier': 'EXACT'}}, 'description': 'SNAG domain', 'evidences': [{'evidenceCode': 'ECO:0000250'}]}
{'type': 'Region', 'location': {'start': {'value': 201, 'modifier': 'EXACT'}, 'end': {'value': 264, 'modifier': 'EXACT'}}, 'description': 'Disordered', 'evidences': [{'evidenceCode': 'ECO:0000256', 'source': 'SAM', 'id': 'MobiDB-lite'}]}


In [18]:
sequence_length = data['sequence']['length']
if sequence_length == len(sec_struc_output):
    print('yes')
    bool_array =  np.zeros(sequence_length, dtype=bool)

yes


In [45]:
bool_array =  np.zeros(sequence_length, dtype=bool)
discard_list = ['bHLH', 'Leucine-zipper']
alpha_helix_index_list = get_alpha_helix_length_and_location(sec_struc_output, min_length=7)
helix_list_remove_index = []
for key in known_things.keys():
    for entry in known_things[key]:
        if entry[-1] in discard_list:
            bool_array[entry[0]-1:entry[1]] = True
            index = 0
            for a in alpha_helix_index_list:
                curr_helix =  np.zeros(sequence_length, dtype=bool)
                curr_helix[a] = True
                overalp = (np.sum(np.logical_and(bool_array, curr_helix)))
                if overalp>4:
                    helix_list_remove_index.append(index)
                index = index+1
unique_idx = np.unique(helix_list_remove_index)
for index in sorted(unique_idx, reverse=True):
    del alpha_helix_index_list[index]
print (alpha_helix_index_list)

[[94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106], [121, 122, 123, 124, 125, 126, 127, 128, 129, 130], [139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153], [175, 176, 177, 178, 179, 180, 181, 182, 183]]


array([4, 5])

In [None]:
for index in sorted(helix_list_remove_index, reverse=True):
    del my_list[index]

In [41]:
known_things

{'Domain': [[354, 406, 'bHLH']],
 'Region': [[204, 295, 'Disordered'], [413, 434, 'Leucine-zipper']],
 'Motif': [[100, 108, '9aaTAD']]}

In [40]:
alpha_helix_index_list

[[94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106],
 [121, 122, 123, 124, 125, 126, 127, 128, 129, 130],
 [139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153],
 [175, 176, 177, 178, 179, 180, 181, 182, 183],
 [347,
  348,
  349,
  350,
  351,
  352,
  353,
  354,
  355,
  356,
  357,
  358,
  359,
  360,
  361,
  362,
  363,
  364,
  365,
  366,
  367,
  368,
  369,
  370,
  371,
  372,
  373,
  374,
  375,
  376,
  377],
 [391,
  392,
  393,
  394,
  395,
  396,
  397,
  398,
  399,
  400,
  401,
  402,
  403,
  404,
  405,
  406,
  407,
  408,
  409,
  410,
  411,
  412,
  413,
  414,
  415,
  416,
  417,
  418,
  419,
  420,
  421,
  422,
  423,
  424,
  425,
  426,
  427,
  428,
  429,
  430,
  431,
  432,
  433,
  434,
  435,
  436]]

In [13]:
df

Unnamed: 0,uniprot_id,gene_name,refseq_id,firstAA_position_in_HELIDR,lastAA_position_in_HELIDR,HELIDR_upstream_seq,HELIDR_seq,HELIDR_downstream_seq,2S5P_up,2S5P_down,2S5P1_up,2S5P1_down,2S5P1_helix,HEK293T_expressed,NonTMD[3]_TMD[2]_SEC[1],Non_TMD_classification,4 compartments,TG_CY,TG_SR_nonS
0,Q8TD16,BICD2,NM_015250,332,414,KEGLAPPSPS,LVSDLLSELNISEIQKLKQQLMQMEREKAGLLATLQDTQKQLEHTR...,DRDSHEDGDY,1,0,,,,,,,,,
1,Q8TD16,BICD2,NM_015250,432,544,GDYYEVDING,PEILACKYHVAVAEAGELREQLKALRSTHEAREAQHAEEKGRYEAE...,NNETPNRVML,0,0,,,,,,,,,
2,Q8TD16,BICD2,NM_015250,552,559,CMCNNETPNR,VMLDYYRE,GQGGAGRTSP,0,0,,,,,,,,,
3,Q8TD16,BICD2,NM_015250,627,656,LSDPRREPMN,IYNLIAIIRDQIKHLQAAVDRTTELSRQRI,ASQELGPAVD,0,0,,,,,,,,,
4,Q8TD16,BICD2,NM_015250,668,811,SQELGPAVDK,DKEALMEEILKLKSLLSTKREQITTLRTVLKANKQTAEVALANLKS...,KAAPKTKPAT,0,0,,,,,,,,,
5,Q9NQ89,C12orf4,NM_020374,40,50,VPLKFPVQEN,ASHLHGRLMLL,HSLPCFIEKD,0,0,,,,,,,,,
6,Q9NQ89,C12orf4,NM_020374,58,89,MLLHSLPCFI,EKDLKEALTQFIEEESLSDYDRDAEASLAAVK,SGEVDLHQLA,0,0,,,,,,,,,
7,Q9NQ89,C12orf4,NM_020374,95,106,LAAVKSGEVD,LHQLASTWAKAY,AETTLEHARP,0,0,,,,,,,,,
8,Q9NQ89,C12orf4,NM_020374,121,133,LEHARPEEPS,WDEDFADVYHDLI,HSPASETLLN,0,1,,,,,,,,,
9,Q9NQ89,C12orf4,NM_020374,138,180,VYHDLIHSPA,SETLLNLEHNYFVSISELIGERDVELKKLRERQGIEMEKVMQE,LGKSLTDQDV,0,0,,,,,,,,,


In [52]:
df.to_csv('test.csv')