In [3]:
import pandas as pd
import os
from pathlib import Path
# import sys, traceback
import subprocess
import json
from decimal import Decimal as D
import requests


# import re
# import Bio.PDB

<span style="font-family:'Times New Roman'; font-size:2em;">**Loading data**</span>



In [0]:
# Fetch the residue annotations from the M-CSA REST API as JSON.
response = requests.get('https://www.ebi.ac.uk/thornton-srv/m-csa/api/residues/?format=json')
# Stop here on an HTTP error instead of building a frame from an error page.
response.raise_for_status()
# Decode the JSON payload before handing it to pandas: a Response object is
# iterable over raw byte chunks, so pd.DataFrame(response) yields a single
# column of bytes rather than the records.
# NOTE(review): assumes the endpoint returns a JSON array of records - confirm.
pd.DataFrame(response.json())

In [2]:
# Read the curated table by hand: each line is split on commas and only the
# first 12 fields are kept. The header line ends up at index 0.
with open('curated_data.csv', 'r') as csv_file:
    curated_csv_data = [
        raw_line.replace('\n','').split(',')[:12]
        for raw_line in csv_file
    ]

# Build the frame from every row after the header, with explicit column labels.
curated_df = pd.DataFrame(
    curated_csv_data[1:],
    columns=[
        'M-CSA ID',
        'Uniprot IDs',
        'PDB ID',
        'EC',
        'residue/reactant/product/cofactor',
        'RESIDUE TYPE',
        'CHAIN ID',
        'RESIDUE NUMBER',
        'function location/name',
        'ROLE',
        'ROLE_TYPE',
        'PARENT ROLE',
    ],
)

In [0]:
# Tally how often each value of 'RESIDUE TYPE' occurs and save the table,
# keeping the residue types as the index column of the CSV.
residue_type_counts = curated_df['RESIDUE TYPE'].value_counts()
residue_type_counts.to_frame().to_csv('RESIDUE TYPE count.csv', index=True)

In [4]:
# Load 'literature_pdb_residues_roles.csv' into a DataFrame; later cells read
# its 'PDB ID' column.
residues_roles_df = pd.read_csv('literature_pdb_residues_roles.csv')
# residues_roles_df

In [8]:
# Optional inspection helpers for the literature table (left disabled):
# print(residues_roles_df.info())
# print(residues_roles_df['RESIDUE TYPE'].unique())
# print(residues_roles_df['ROLE_TYPE'].unique())
# print(residues_roles_df['ROLE'].unique())
# print(residues_roles_df['PARENT ROLE'].unique())

# Number of distinct values in the 'PDB ID' column.
# Note: `ls` is an integer count, not the collection of IDs themselves.
ls = len(residues_roles_df['PDB ID'].unique())
ls

951

In [3]:
# Folder holding the downloaded structure files. The single quotes are literal
# characters inside the string: they shell-quote the folder name (which contains
# spaces) so the path can be pasted into a shell command line. Code that uses
# the path without a shell has to strip them first.
pdb_directory = 'pdb/\'batch download structures\'/'
# Folder the .pka result files are moved into and later read from.
result_directory = 'pdb/batch_result/'
# Folder the per-structure CSV files are written to.
csv_directory = 'pdb/batch_csv/'

In [6]:
# Unzipping .gz files
# os.system('gunzip -d *.ent.gz')

# Renaming .ent files as pdb
# os.system('rename.ul pdb ''  *.ent')
# os.system('rename.ul .ent .pdb *.ent')

# List the structure files with os.listdir instead of parsing `ls` output
# through a shell: no dependency on a POSIX shell, and a missing folder raises
# FileNotFoundError instead of silently producing an empty list.
# pdb_directory carries literal shell quotes around the folder name; they are
# not part of the real path, so strip them before touching the filesystem.
structure_directory = pdb_directory.replace('\'', '')
list_of_files = sorted(
    entry[:-len('.pdb')] if entry.endswith('.pdb') else entry
    for entry in os.listdir(structure_directory)
    if not entry.startswith('.')  # `ls` hides dot-files; keep that behaviour
)
print(len(list_of_files))

949


In [0]:
# checking the difference in the list of enzymes
# `ls` only holds the *count* of unique PDB IDs (an int), so set(ls) raises
# TypeError. Compare against the actual IDs from the literature table instead.
literature_pdb_ids = set(residues_roles_df['PDB ID'].unique())
downloaded_pdb_ids = set(list_of_files)
# IDs listed in the literature table that have no downloaded structure file
print(literature_pdb_ids - downloaded_pdb_ids)
# Downloaded structure files that the literature table does not list
print(downloaded_pdb_ids - literature_pdb_ids)

In [0]:
# A dictionary to record the problems encountered (keyed by PDB ID).
# Start empty when no log exists yet, so the notebook also runs before the
# first batch has produced error_log.txt.
error_log = dict()
error_log_path = Path("error_log.txt")
if error_log_path.is_file():
    with open(error_log_path, "r") as f:
        saved_errors = f.read()
    # An empty file is not valid JSON; treat it as "nothing recorded yet".
    if saved_errors.strip():
        error_log = json.loads(saved_errors)

# An array to record the tasks completed (one PDB ID per line in the file)
completed_tasks = []
completed_tasks_path = Path("completed_tasks.txt")
if completed_tasks_path.is_file():
    with open(completed_tasks_path, "r") as f:
        completed_tasks = [line.replace('\n','') for line in f]

# An array to record the tasks remaining: every listed structure that has not
# been logged as completed, in sorted order
remaining_tasks = sorted(set(list_of_files) - set(completed_tasks))

<span style="font-family:'Times New Roman'; font-size:2em;">**pKa Calculation**</span>

In [0]:
def pka_calculate(pdb_id):
    """Run propka3 on one structure file and file away its .pka output.

    The structure is read from ``pdb_directory`` (with its literal shell quotes
    removed) as ``<pdb_id>.pka``'s source ``<pdb_id>.pdb``. propka3's stdout is
    printed and appended to ``log_sheet.txt``. propka3 is expected to leave
    ``<pdb_id>.pka`` in the current working directory, since that is where this
    function looks for it.

    On failure (any stderr output, a non-zero exit status, or a missing .pka
    file) the message is stored in the module-level ``error_log`` under
    ``pdb_id`` and ``error_log.txt`` is rewritten. On success the .pka file is
    moved into ``result_directory``.

    Parameters
    ----------
    pdb_id : str
        File stem of the structure, without the ``.pdb`` extension.

    Returns
    -------
    None
    """
    import shutil  # local import: only needed to move the result file

    # subprocess receives the path as a single argv entry (no shell involved),
    # so the shell quotes embedded in pdb_directory must be removed.
    pdb_path = pdb_directory.replace('\'','') + pdb_id + '.pdb'

    # Calculation with propka3
    output = subprocess.run(["propka3", pdb_path], capture_output=True)
    # Decode once; undecodable bytes are replaced rather than aborting the batch
    stdout_text = output.stdout.decode("utf-8", errors="replace")
    stderr_text = output.stderr.decode("utf-8", errors="replace")

    print(stdout_text)
    with open("log_sheet.txt", "a") as f:
        f.write(stdout_text + '\n')

    # Anything on stderr counts as a failure, as before. A non-zero exit status
    # or a missing output file is now also reported instead of being ignored.
    pka_file = pdb_id + '.pka'
    error_message = stderr_text
    if error_message == '':
        if output.returncode != 0:
            error_message = 'propka3 exited with status ' + str(output.returncode)
        elif not os.path.isfile(pka_file):
            error_message = 'propka3 produced no output file: ' + pka_file

    # Leaving a record in case of error
    if error_message != '':
        error_log[pdb_id] = error_message
        print(json.dumps(error_log, indent=2))
        with open("error_log.txt", "w") as f:
            f.write(json.dumps(error_log, indent=2))
    else:
        # Move the file if everything worked. Creating the folder first keeps
        # the move from renaming the file to the folder's own path when the
        # folder is missing (which is what `mv file missing_dir` does).
        os.makedirs(result_directory, exist_ok=True)
        shutil.move(pka_file, os.path.join(result_directory, pka_file))

In [0]:
# def pka_to_pandas(pdb_id):

#     # Check whether the directory exists
#     file_name = result_directory + pdb_id + '.pka'

#     path_exists = Path(file_name).is_file()

#     if(not path_exists) :
#         print("File path not found:" , file_name)
#         f = open("log_sheet.txt", "a")
#         f.write('File path not found: ' + file_name + '\n')
#         f.close()
#         error_log[pdb_id] = "File path not found:" + file_name

#     else:
#         # Reading the file if it exists
#         # Combine the file content as one string
#         file_text = ''
#         for line in open(file_name, 'r'):
#             file_text += line

#         # Splitting the result file by section
#         section = file_text.split('--------------------------------------------------------------------------------------------------------')
#         result_splitlines = section[1].splitlines()

#         # An array to store the result table
#         results = []
#         for string in result_splitlines[3:]:
# #             splitted_line = string.split()
# #             if len(splitted_line) < 6:
# #                 splitted_line.append('')
# #             results.append(splitted_line)
#         #     The following line is reserved for splitting the table by fixed length
#             results.append([string[0:6].lstrip(),string[6:10].lstrip(),string[10:12].lstrip(),string[12:21].lstrip(),string[21:32].lstrip(),string[32:55].lstrip()])

#         # Loading the array to pd dataframe
#         single_result_df = pd.DataFrame(results, columns = ['RESIDUE TYPE', 'RESIDUE NUMBER', 'CHAIN ID', 'pKa', 'model-pKa', 'ligand atom-type'])

#         # Adding a column indicating the PDB ID
#         single_result_df.insert(0, 'PDB ID', pdb_id)

#         # Write csv file
#         single_result_df.to_csv( csv_directory + pdb_id + '.csv', index=False)

#         # return the results
#         return single_result_df


In [0]:
# Dataframe for storing the results
# results_df = pd.DataFrame()

# Running calculation for all files /Takes a LOOONG TIME (1 hour+)
for pdb_id in remaining_tasks:
    # Logging events
    print('Running:', pdb_id)
    with open("log_sheet.txt", "a") as f:
        f.write('Running: ' + pdb_id + '\n')

    # Calculate and formatting the output
    pka_calculate(pdb_id)
#     results_df = pd.concat([results_df, pka_to_pandas(pdb_id)])

    # Record the ID straight after each task (one append per structure). The ID
    # is written whether or not pka_calculate logged an error for it.
    with open("completed_tasks.txt", "a") as f:
        f.write(pdb_id + '\n')

# pka_calculate('1a79')
# results_df = pd.concat([results_df, pka_to_pandas('1a79')])

<span style="font-family:'Times New Roman'; font-size:2em;">**Extracting results**</span>

In [4]:
def pka_to_csv(pdb_id):
    """Convert the result table of one .pka file into a CSV file.

    Reads ``result_directory + pdb_id + '.pka'``, takes the section that follows
    the first dashed separator line, skips its first three lines and slices the
    remaining fixed-width rows into columns. A ``delta-pKa`` column (pKa minus
    model-pKa, computed with Decimal) is added, and the table is written to
    ``csv_directory + pdb_id + '.csv'`` with a leading ``PDB ID`` column.

    Parameters
    ----------
    pdb_id : str
        File stem shared by the .pka input and the .csv output.

    Returns
    -------
    None
        A message is printed and no CSV is written when the .pka file is
        missing or contains no separator line.
    """
    # Check whether the result file exists
    file_name = result_directory + pdb_id + '.pka'
    if not Path(file_name).is_file():
        print("File path not found:" , file_name)
        return

    # Read the whole file as one string; the context manager closes the handle
    with open(file_name, 'r') as pka_file:
        file_text = pka_file.read()

    # Splitting the result file by section
    section = file_text.split('--------------------------------------------------------------------------------------------------------')
    if len(section) < 2:
        # No separator line, so there is no table section to index into
        print("No result table found in:", file_name)
        return
    result_splitlines = section[1].splitlines()

    # An array to store the result table
    results = []
    for string in result_splitlines[3:]:
        # Blank lines carry no values and would make Decimal('') raise
        if not string.strip():
            continue
        pka = string[12:21].lstrip()
        model_pka = string[21:32].lstrip()
        results.append([
            string[0:6].lstrip(),
            string[6:10].lstrip(),
            string[10:12].lstrip(),
            pka,
            model_pka,
            # Decimal keeps the subtraction exact for the two-decimal inputs
            str(D(pka) - D(model_pka)),
            string[32:55].lstrip(),
        ])

    # Loading the array to pd dataframe
    single_result_df = pd.DataFrame(results, columns = ['RESIDUE TYPE', 'RESIDUE NUMBER', 'CHAIN ID', 'pKa', 'model-pKa', 'delta-pKa', 'ligand atom-type'])

    # Adding a column indicating the PDB ID
    single_result_df.insert(0, 'PDB ID', pdb_id)

    # Write csv file
    single_result_df.to_csv( csv_directory + pdb_id + '.csv', index=False)



In [0]:
# Convert the .pka result of every listed structure into its own CSV file,
# announcing each ID before it is processed.
for structure_id in list_of_files:
    print('Running:', structure_id)
    pka_to_csv(structure_id)
print('done')

<span style="font-family:'Times New Roman'; font-size:2em;">**Concatenating Results**</span>

In [4]:
# checking the generated files in the csv_directory
# os.listdir replaces parsing `ls` output through a shell: it does not depend
# on a POSIX shell, and a missing folder raises FileNotFoundError instead of
# silently giving an empty list.
list_of_csv = sorted(
    entry
    for entry in os.listdir(csv_directory)
    if not entry.startswith('.')  # `ls` hides dot-files; keep that behaviour
)
print(len(list_of_csv))
print(list_of_csv[0:9])

921
['12as.csv', '13pk.csv', '1a05.csv', '1a0i.csv', '1a16.csv', '1a26.csv', '1a30.csv', '1a41.csv', '1a4i.csv']


In [5]:
# Same list object as list_of_csv (no copy is made): entries are removed from
# it in place by the function below.
enzymes_without_cofactors = list_of_csv

def remove_enzymes_without_cofactors(row):
    """Remove a structure's CSV name from the list when the row is a cofactor.

    Despite its name, this drops the entries of structures that HAVE a row
    labelled 'cofactor', so only structures without one remain in
    ``enzymes_without_cofactors``.

    Parameters
    ----------
    row : mapping-like
        One row of ``curated_df``; the 'residue/reactant/product/cofactor' and
        'PDB ID' fields are read.
    """
    if row['residue/reactant/product/cofactor'] != 'cofactor':
        return
    csv_name = row['PDB ID'] + '.csv'
    if csv_name not in enzymes_without_cofactors:
        return
    enzymes_without_cofactors.remove(csv_name)

# Called once per curated row purely for its side effect on the list
curated_df.apply(remove_enzymes_without_cofactors, axis=1)

list_of_csv = enzymes_without_cofactors

len(list_of_csv)

404

In [0]:
# Write the name of every file left in list_of_csv, minus its '.csv' extension,
# one per line. Append mode is kept: existing content of without_cofactors.txt
# is preserved. The context manager closes the file even if a write fails.
with open("without_cofactors.txt", "a") as f:
    for csv_file in list_of_csv:
        print('Running:', csv_file)
        f.write(csv_file.replace('.csv','\n'))

print('done')

In [0]:
# Concatenate all generated csvs into results.csv
# f = open("enzymes_without_cofactors.csv", "a")
results_path = "results.csv"

if list_of_csv:
    # Obtain the header from the 1st file
    with open(csv_directory + list_of_csv[0] , "r") as first_csv:
        header_line = first_csv.readline()

    # Append mode is kept, but the header is only written when the output is
    # new or empty, so a re-run no longer puts a header row mid-file.
    needs_header = (not os.path.isfile(results_path)) or os.path.getsize(results_path) == 0

    with open(results_path, "a") as f:
        if needs_header:
            f.write(header_line)

        for csv_file in list_of_csv:
            print('Running:', csv_file)
            with open(csv_directory + csv_file , "r") as part:
                part_lines = part.readlines()
            # Skip each file's own header line; an empty file contributes nothing
            f.writelines(part_lines[1:])
else:
    # Nothing to merge; avoid indexing into an empty list
    print('No csv files to concatenate')

print('done')

<span style="font-family:'Times New Roman'; font-size:2em;">**Checking Results**</span>

In [0]:
# Load results.csv into a DataFrame and display it as the cell output
results_df = pd.read_csv('results.csv')
results_df

In [0]:
# Load enzymes_without_cofactors.csv into a DataFrame and display it.
# Note: this rebinds the name results_df, which the previous cell also assigns.
results_df = pd.read_csv('enzymes_without_cofactors.csv')
results_df