In [None]:
from Bio.PDB import *    # create basic analytic functions

def seq_getter(record):  # get sequence data
    """Collect residue names from a PDB file, both flattened and per chain.

    Args:
        record: Handed to ``PDBParser.get_structure`` as the file to parse
            (callers in this notebook pass a file path).

    Returns:
        tuple: ``(seq_combined, seq_in_chains)`` where ``seq_combined`` is a
        flat list of ``residue.resname`` values in iteration order and
        ``seq_in_chains`` holds one such list per chain visited. Residues
        named ``"HOH"`` are left out of both.
    """
    # run parser
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('struct', record)

    seq_combined = []
    seq_in_chains = []

    # NOTE(review): every model in the structure is walked, so a file holding
    # several models contributes its chains (and residues) once per model.
    # Confirm that this is intended for the length / chain counts built on top.
    for model in structure:
        for chain in model:
            # Only "HOH" is filtered; any other residue name is kept as-is.
            chain_seq = [
                residue.resname for residue in chain if residue.resname != "HOH"
            ]
            seq_in_chains.append(chain_seq)   # per-chain view
            seq_combined.extend(chain_seq)    # flattened view

    return seq_combined, seq_in_chains


def check(filename):  # get amount of helix and sheets for pdb file
    """Count the HELIX and SHEET record lines in a PDB-format text file.

    A line is counted only when its record name (the first six characters)
    is ``"HELIX "`` or ``"SHEET "``. Matching at the start of the line keeps
    free-text lines that merely mention the word (for example padded
    ``REMARK 650 HELIX`` / ``REMARK 700 SHEET`` headers) out of the totals,
    and does not depend on how many blanks precede the serial / strand number.

    Args:
        filename: Path of the file to scan.

    Returns:
        tuple: ``(helix_count, sheet_count)`` - the number of lines starting
        with ``"HELIX "`` and with ``"SHEET "`` respectively.
    """
    helix_count = 0
    sheet_count = 0

    # The context manager closes the handle even if reading fails part-way.
    with open(filename) as pdb_file:
        for line in pdb_file:
            if line.startswith("HELIX "):
                helix_count += 1
            elif line.startswith("SHEET "):
                sheet_count += 1

    return helix_count, sheet_count


In [None]:
# FUNCTION TO GET PROTEIN STRUCTURES

import os, glob, time

def protein_info(directory, total_files):  # inputs: directory, number of files desired
    """Summarise the files matched by a glob pattern, up to a fixed quota.

    For each matched regular file, ``check`` supplies the helix / sheet
    counts and ``seq_getter`` supplies the residue names. A progress line is
    printed after every processed file.

    Args:
        directory: Glob pattern handed to ``glob.glob`` (despite the name it
            is a pattern, not a plain directory).
        total_files: Number of files to process before stopping; also the
            denominator of the printed progress percentage.

    Returns:
        tuple: ``(name, length, chains, helices, sheets, sequences)`` - six
        parallel lists with one entry per processed file: the matched path,
        the length of the combined residue list, the number of per-chain
        lists, the helix count, the sheet count, and the combined residue
        list itself.
    """
    helices = []
    sheets = []
    sequences = []
    name = []
    length = []
    chains = []

    t0 = time.time()
    counter = 0

    for filename in glob.glob(directory):
        # Test the quota before doing any work so total_files == 0 returns
        # empty lists instead of dividing by zero in the progress print.
        if counter == total_files:
            break

        # glob.glob already yields usable paths; joining them onto the
        # pattern again only worked when the pattern was absolute.
        if not os.path.isfile(filename):
            continue

        counts = check(filename)  # getting the number of sheets and helices
        sequence_data = seq_getter(filename)  # getting the sequence data

        helices.append(counts[0])
        sheets.append(counts[1])
        sequences.append(sequence_data[0])
        name.append(filename)
        length.append(len(sequence_data[0]))
        chains.append(len(sequence_data[1]))

        counter = counter + 1
        print(round(counter/total_files*100, 2), "% done", "after", round((time.time()-t0)/60, 1), "minutes")

    return name, length, chains, helices, sheets, sequences

# Glob pattern: every .ent file inside any sub-folder of the pdb folder.
directory = "/Volumes/UCG Hard D./PDB Files/pdb//*//*.ent"

# Process up to 50000 files; `data` keeps the full result tuple.
data = protein_info(directory, 50000)

# Unpack the six parallel lists in the order protein_info returns them.
names, length, chains, helices, sheets, sequences = data


In [None]:
import pandas as pd

# Build one row per processed file from the parallel lists and write it out.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as well - confirm whether readers of this file rely on that.
pd.DataFrame(
    list(zip(names, sequences, length, chains, helices, sheets)),
    columns=["names", "sequences", "length", "chains", "helices", "sheets"],
).to_csv("proteins_many.csv")   # create csv

# Reload from disk so `df` reflects exactly what was saved, then preview it.
df = pd.read_csv("proteins_many.csv")  # read csv
df.head()


In [None]:
################################################################################################################

In [None]:
# FUNCTION TO GET DATA 2 (keep only proteins whose sequence is at most max_length residues long)

import os, glob, time

def protein_info(directory, total_files, max_length):  # inputs: directory, total_files, max length of sequence
    """Summarise matched files whose combined sequence fits a length limit.

    Each matched regular file is parsed once with ``seq_getter``. Files whose
    combined residue list is longer than ``max_length`` are skipped and do
    not count towards the quota. For the remaining files ``check`` supplies
    the helix / sheet counts, and a progress line is printed after each one.

    Args:
        directory: Glob pattern handed to ``glob.glob`` (despite the name it
            is a pattern, not a plain directory).
        total_files: Number of accepted files to collect before stopping;
            also the denominator of the printed progress percentage.
        max_length: Largest combined residue-list length (inclusive) a file
            may have to be accepted.

    Returns:
        tuple: ``(name, length, chains, helices, sheets, sequences)`` - six
        parallel lists with one entry per accepted file: the matched path,
        the length of the combined residue list, the number of per-chain
        lists, the helix count, the sheet count, and the combined residue
        list itself.
    """
    helices = []
    sheets = []
    sequences = []
    name = []
    length = []
    chains = []

    t0 = time.time()
    counter = 0

    for filename in glob.glob(directory):
        # Test the quota before doing any work so total_files == 0 returns
        # empty lists instead of dividing by zero in the progress print.
        if counter == total_files:
            break

        # glob.glob already yields usable paths; joining them onto the
        # pattern again only worked when the pattern was absolute.
        if not os.path.isfile(filename):
            continue

        # Parse once and reuse the result for both the length filter and the
        # recorded data (previously each accepted file was parsed twice).
        sequence_data = seq_getter(filename)  # getting the sequence data
        if len(sequence_data[0]) > max_length:
            continue

        counts = check(filename)  # getting the number of sheets and helices

        helices.append(counts[0])
        sheets.append(counts[1])
        sequences.append(sequence_data[0])
        name.append(filename)
        length.append(len(sequence_data[0]))
        chains.append(len(sequence_data[1]))

        counter = counter + 1
        print(round(counter/total_files*100, 2), "% done", "after", round((time.time()-t0)/60, 1), "minutes")

    return name, length, chains, helices, sheets, sequences


# Glob pattern: every .ent file inside any sub-folder of the pdb folder.
directory = "/Volumes/UCG Hard D./PDB Files/pdb//*//*.ent"

# Collect up to 5000 files whose combined sequence has at most 1000 entries.
data = protein_info(directory, 5000, 1000)

# Unpack the six parallel lists in the order protein_info returns them.
names, length, chains, helices, sheets, sequences = data


In [None]:
import pandas as pd

# Build one row per accepted file from the parallel lists and write it out.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as well - confirm whether readers of this file rely on that.
pd.DataFrame(
    list(zip(names, sequences, length, chains, helices, sheets)),
    columns=["names", "sequences", "length", "chains", "helices", "sheets"],
).to_csv("proteins_5000.csv")   # create csv

# Reload from disk so `df` reflects exactly what was saved, then preview it.
df = pd.read_csv("proteins_5000.csv")  # read csv
df.head()