<a href="https://colab.research.google.com/github/philpham8/FindRepresentativeRNAChains/blob/main/FindRepresentativeRNAChains_Phillip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Developed by Phillip Pham on 05/2021

Search non-redundant RNAs on Protein Data Base using these criteria:

*   Variable length of nucleotides (min and max cutoffs)
*   No branched polymers (carbohydrates or oligosaccharide)
*   Resolution cutoff in Angstroms (as determined by URL on rna.BGSU.edu

Also contains the function to lookup ligands and generate a XLSX spreadsheet with information on chain length, title, and ligand nonpolymers (intentionally excluding ions).

In [None]:
import requests
import pandas as pd

explicit_ions = ['K', 'MG', 'NA', 'SR', 'BA', 'MN', 'CD', 'TL', 'SO4', 'IRI', 'ACT']

def json_from_url(url):
  response = requests.get(url, timeout=5)

  if response.status_code == requests.codes.ok: return response.json()
  else: return

# Fetch PubMed data with requested PDB ID on PDB.
def get_pdb_pubmed(pdb_id):
  request_url = "https://data.rcsb.org/rest/v1/core/pubmed"
  return json_from_url(request_url + '/' + pdb_id)

# Fetch Nonpolymer data with requested Nonpolymer ID on PDB.
def get_pdb_nonpolymer(pdb_id, nonpolymer_id):
  request_url = 'https://data.rcsb.org/rest/v1/core/nonpolymer_entity'
  return json_from_url(request_url + '/' + pdb_id + '/' + nonpolymer_id)

# Fetch Chemical Composition data with requested Comp ID on PDB.
def get_pdb_chemcomp(comp_id):
  request_url = 'https://data.rcsb.org/rest/v1/core/chemcomp'
  return json_from_url(request_url + '/' + comp_id)

# Fetch information on requested PDB entry
def get_pdb_entry(pdb_id):
  request_url = "https://data.rcsb.org/rest/v1/core/entry"
  return json_from_url(request_url + '/' + pdb_id)

# Fetch information on requested PDB entity/chain (e.g. individual RNA)
def get_pdb_entity(pdb_id, chain_id):
  request_url = "https://data.rcsb.org/rest/v1/core/polymer_entity"
  return json_from_url(request_url + '/' + pdb_id + '/' + chain_id)

# Fetch abstract title from pubmed
def pdb_id_contains_excluded_words(pdb_id, excluded_words):
    pubmed_json = get_pdb_pubmed(pdb_id)
    if pubmed_json is not None and 'rcsb_pubmed_abstract_text' in pubmed_json:
      abstract = pubmed_json['rcsb_pubmed_abstract_text'].lower()
      return any(excluded_word.lower() in abstract for excluded_word in excluded_words)
    else: return False

# Checks that the polymer is of type RNA and DOES NOT contain DNA, oligosaccharide, or protein.
def contains_rna_only(entry):
    return entry['rcsb_entry_info']['polymer_composition'] == 'RNA' and entry['rcsb_entry_info']['na_polymer_entity_types'] == 'RNA (only)'

# Checks if it contains any nonpolymer (either ligands or ions)
# TODO: Ions should be permitted while ligands should not be. Temp action is excluding both.
def contains_nonpolymer(entry):
    return entry['rcsb_entry_info']['nonpolymer_entity_count'] > 0

# Retrieve chemical name from CompID (e.g water)
def get_name_from_comp_id(comp_id):
    return get_pdb_chemcomp(comp_id)['chem_comp']['name']

# Retrieve list of Comp ID from chain (e.g OHO)
def comp_id_from_pdb_and_nonpolymer_id(pdb_id, nonpolymer_id):
    return get_pdb_nonpolymer(pdb_id, nonpolymer_id)['pdbx_entity_nonpoly']['comp_id']

# Retrieve chemical name from CompID (e.g water)
def comp_name_from_pdb_and_nonpolymer_id(pdb_id, nonpolymer_id):
    return get_pdb_nonpolymer(pdb_id, nonpolymer_id)['pdbx_entity_nonpoly']['name']

# Retrieve list of chain ids from entry.
def get_list_of_chain_id(entry):
    return entry['rcsb_entry_container_identifiers']['polymer_entity_ids']

# Retrieve list of chain ids from entry.
def get_entry_title(entry):
    return entry['struct']['title']

# Retrieve list of nonpolymer ids from entry.
def get_list_of_nonpolymer_id(entry):
    entry_identifiers = entry['rcsb_entry_container_identifiers']
    if 'non_polymer_entity_ids' in entry_identifiers: return entry_identifiers['non_polymer_entity_ids']
    else: return

# Exclude ions and return list of ligands:
def nonpolymers_with_no_ions(list_of_nonpolymer_id):
    list_of_nonpolymers = []
    for nonpolymer_id in list_of_nonpolymer_id:
          comp_id = comp_id_from_pdb_and_nonpolymer_id(pdb_id, nonpolymer_id)

          if comp_id not in list_of_nonpolymers and comp_id not in explicit_ions:
            list_of_nonpolymers.append(comp_id)
    return list_of_nonpolymers

# Determine polymer length given entity.
def get_polymer_length(entity):
    return entity['entity_poly']['rcsb_sample_sequence_length']

# Input direct URL (from BGSU), define nucleotides cutoffs, and specify exclusion words

In [None]:
# Ask URL to CSV and then create dataframe from .csv
# E.g. http://rna.bgsu.edu/rna3dhub/nrlist/download/3.180/3.0A/csv
csv_url = input('Please provide direct URL to desired CSV file found on rna.bgsu.edu: ')
data = pd.read_csv(csv_url, usecols=[1])

Please provide direct URL to desired CSV file found on rna.bgsu.edu: http://rna.bgsu.edu/rna3dhub/nrlist/download/3.180/3.0A/csv
What should the maximium number (non-inclusive) of ribonucleotides be? 400
What should the minimum number (non-inclusive) of ribonucleotides be? 95
List any words that you want to blacklist in the PubMed abstract. Separate words by space: tRNA


In [None]:
# Ask user for maximum nucleotides
nt_max_cutoff = int(input("What should the maximium number (non-inclusive) of ribonucleotides be? "))

# Ask user for minimum nucleotides
nt_min_cutoff = int(input("What should the minimum number (non-inclusive) of ribonucleotides be? "))

# Ask user for if they would like to exclude any words from the PubMed abstract
excluded_words = input("List any words that you want to blacklist in the PubMed abstract. Separate words by space: ").split()

What should the maximium number (non-inclusive) of ribonucleotides be? 400
What should the minimum number (non-inclusive) of ribonucleotides be? 50
List any words that you want to blacklist in the PubMed abstract. Separate words by space: tRNA


# Extract PDB IDs from BGSU Spreadsheet

In [None]:
# Extract all PDB IDs of non-redundant RNAs from BGSU spreadsheet
bgsu_pdb_ids = []

for i, row in data.iterrows():
  pdb_id = row[0].split('|')[0] 
  bgsu_pdb_ids.append(pdb_id)

In [None]:
def pdb_info_from_entry(entry):
  # Check if of type RNA (no carbohydrate or protein) and if it is not excluded.
  # Immediately skip entry if it contains the word tRNA or other excluded words
  if contains_rna_only(entry) and not pdb_id_contains_excluded_words(pdb_id, excluded_words):
    # Get list of Chain IDs for given PDB entry
    list_of_chain_id = get_list_of_chain_id(entry)

    # Loop through each chain ID to get length
    for chain_id in list_of_chain_id:
      entity = get_pdb_entity(pdb_id, chain_id)
      length = get_polymer_length(entity)

      # Checks if chain meets min/max threshold
      if nt_min_cutoff < length < nt_max_cutoff:

        # Fetch list of all nonpolymer IDs
        list_of_nonpolymer_id = get_list_of_nonpolymer_id(entry)

        if list_of_nonpolymer_id is not None: 
          # Remove instances of ions. We want only the ligands
          ligands = nonpolymers_with_no_ions(list_of_nonpolymer_id)
          if not ligands: ligands = ['Apo'] # If no ligands found, we will call these 'Apo'
        else: 
          ligands = ['None'] # If no ions or ligands, we will call these 'None'

        # Make list of requested info (pdb_id, chain_id, length, and list_of_nonpolymers)
        list_of_nonpolymers = [pdb_id, chain_id, get_entry_title(entry), length, ligands]
        return list_of_nonpolymers

  else:
      return

# Filter all RNAs based on max/min nt cutoffs. 
Includes nonpolymers (ions/ligands). Excludes carbohydrates and proteins.

In [None]:
# Whitelist any PDB IDs that does not meet criteria or isn't found in BGSU spreadsheet.
# E.g. 4KQY 4QK8 5V3I 4P8Z 1GID
whitelist_pdb_ids = input("Please list any additional PDB IDs for which you want to include. Separate words by space: ").split()
matched_pdb_ids = bgsu_pdb_ids + whitelist_pdb_ids

benchmark_pdb_entries = []

# Loop through desired PDB IDs and if it meets criteria, add it.
for pdb_id in matched_pdb_ids:
  entry = get_pdb_entry(pdb_id) 

  # Retrieve all info needed from PDB (including name, description length, ligand)
  pdb_info = pdb_info_from_entry(entry)

  if pdb_info is not None: benchmark_pdb_entries.append(pdb_info)

# Print number of RNAs found
print()
print('Found ', len(benchmark_pdb_entries), ' molecules:')

# Take list and create DataFrame
df = pd.DataFrame(benchmark_pdb_entries, columns = ["PDB ID", "Chain_ID", "Description", "Length (bp)", "Ligand (non-ion)"])
df

Please list any additional PDB IDs for which you want to include. Separate words by space: 4KQY 4QK8 5V3I 4P8Z 1GID

Found  87  molecules:


Unnamed: 0,PDB ID,Chain_ID,Description,Length (bp),Ligand (non-ion)
0,7JNH,2,Crystal structure of a double-ENE RNA stabilit...,86,"[NCO, SPD]"
1,6PRV,1,58nt RNA L11-binding domain from E. coli 23S rRNA,58,[Apo]
2,4FEN,1,Crystal structure of the A24U/U25A/A46G mutant...,67,"[HPA, NCO]"
3,6TFF,1,Crystal structure of the ADP-binding domain of...,52,"[BR, NAD]"
4,3DIL,1,Crystal structure of the Thermotoga maritima l...,174,"[LYS, 1PE, IPA]"
...,...,...,...,...,...
82,4KQY,1,Bacillus subtilis yitJ S box/SAM-I riboswitch,119,[SAM]
83,4QK8,1,Thermoanaerobacter pseudethanolicus c-di-AMP r...,122,[2BA]
84,5V3I,1,Crystal structure of the VS ribozyme - wild-ty...,186,[None]
85,4P8Z,1,Speciation of a group I intron into a lariat c...,188,[None]


# Export dataframe to CSV

In [1]:
df.to_excel('Benchmark_RNAs' + '_' + str(nt_min_cutoff) + '_to_' + str(nt_max_cutoff) + '_nts' + '.xlsx', index=False)

NameError: ignored

**Helper function to determine list of ligands present in BGSU dataset:**

In [None]:
list_of_nonpolymer_id = []
list_of_nonpolymers = {}

for pdb_id in matching_pdb_ids:
  entry = get_pdb_entry(pdb_id) 
  list_of_nonpolymer_id = get_list_of_nonpolymer_id(entry)

  for nonpolymer_id in list_of_nonpolymer_id:
    comp_id = comp_id_from_pdb_and_nonpolymer_id(pdb_id, nonpolymer_id)
    comp_name = comp_name_from_pdb_and_nonpolymer_id(pdb_id, nonpolymer_id)

    if comp_id not in list_of_nonpolymers:
        list_of_nonpolymers[comp_id] = comp_name

# Print number of RNAs found
print()
print('Found ', len(list_of_nonpolymers), ' nonpolymers')
print(list_of_nonpolymers)


Found  24  nonpolymers
{'LYS': 'LYSINE', 'K': 'POTASSIUM ION', 'NA': 'SODIUM ION', 'MG': 'MAGNESIUM ION', '1PE': 'PENTAETHYLENE GLYCOL', 'IPA': 'ISOPROPYL ALCOHOL', 'NCO': 'COBALT HEXAMMINE(III)', 'THF': '5-HYDROXYMETHYLENE-6-HYDROFOLIC ACID', 'SAM': 'S-ADENOSYLMETHIONINE', 'SR': 'STRONTIUM ION', 'GTP': "GUANOSINE-5'-TRIPHOSPHATE", 'BA': 'BARIUM ION', 'SO4': 'SULFATE ION', 'EPE': '4-(2-HYDROXYETHYL)-1-PIPERAZINE ETHANESULFONIC ACID', 'SPM': 'SPERMINE', 'MN': 'MANGANESE (II) ION', '2BA': "(2R,3R,3aS,5R,7aR,9R,10R,10aS,12R,14aR)-2,9-bis(6-amino-9H-purin-9-yl)octahydro-2H,7H-difuro[3,2-d:3',2'-j][1,3,7,9,2,8]tetraoxadiphosphacyclododecine-3,5,10,12-tetrol 5,12-dioxide", 'PRP': '1-O-pyrophosphono-5-O-phosphono-alpha-D-ribofuranose', 'IRI': 'IRIDIUM HEXAMMINE ION', 'CD': 'CADMIUM ION', 'ACT': 'ACETATE ION', 'G4P': "GUANOSINE-5',3'-TETRAPHOSPHATE", 'TL': 'THALLIUM (I) ION', 'MES': '2-(N-MORPHOLINO)-ETHANESULFONIC ACID'}


**Manual Search to see if PDB ID matches criteria**

In [None]:
# Determine if a certain PDB ID passes all criteria.
pdb_id = input('Please type the PDB ID you want to search: ')
print()

entry = get_pdb_entry(pdb_id)
print('Contains RNA only: ', contains_rna_only(entry))
print('Contains excluded words (tRNA): ', pdb_id_contains_excluded_words(pdb_id, excluded_words))

list_of_chain_id = get_list_of_chain_id(entry)
for chain_id in list_of_chain_id:
  entity = get_pdb_entity(pdb_id, chain_id)
  print('Chain', chain_id, 'contains desired nt range:', nt_min_cutoff < get_polymer_length(entity) < nt_max_cutoff, "because it is", get_polymer_length(entity), "nt long")
  print(entry)