<a href="https://colab.research.google.com/github/philpham8/FindRepresentativeRNAChains/blob/main/FindRepresentativeRNAChains_Phillip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd

def json_from_url(url):
  try:
    response = requests.get(url, timeout=5)
    response.raise_for_status()
    return response.json()

  except requests.exceptions.RequestException as e: 
    raise SystemExit(e)

# Fetch information on requested PDB entry
def get_pdb_entry(pdb_id):
  request_url = "https://data.rcsb.org/rest/v1/core/entry"
  return json_from_url(request_url + '/' + pdb_id)

# Fetch information on requested PDB entity/chain (e.g. individual RNA)
def get_pdb_entity(pdb_id, chain_id):
  request_url = "https://data.rcsb.org/rest/v1/core/polymer_entity"
  return json_from_url(request_url + '/' + pdb_id + '/' + chain_id)

# Checks that the polymer is of type RNA and DOES NOT contain DNA, oligosaccharide, or protein.
def contains_rna_only(entry):
    return entry['rcsb_entry_info']['polymer_composition'] == 'RNA' and entry['rcsb_entry_info']['na_polymer_entity_types'] == 'RNA (only)'

# Checks if it contains any nonpolymer (either ligands or ions)
# TODO: Ions should be permitted while ligands should not be. Temp action is excluding both.
def contains_nonpolymer(entry):
    return entry['rcsb_entry_info']['nonpolymer_entity_count'] > 0

# Retrieve list of chain ids from entry.
def get_list_of_chain_id(entry):
    return entry['rcsb_entry_container_identifiers']['polymer_entity_ids']

# Determine polymer length given entity.
def get_polymer_length(entity):
    return entity['entity_poly']['rcsb_sample_sequence_length']


**Run following cell and input direct URL to CSV found on BGSU RNA**

In [3]:
# Ask URL to CSV and then create dataframe from .csv
# E.g. http://rna.bgsu.edu/rna3dhub/nrlist/download/3.179/3.0A/csv
url = input('Please provide direct URL to desired CSV file found on rna.bgsu.edu: ')
data = pd.read_csv(url, usecols=[1])

# Ask user for maximum nucleotides
nt_cutoff = int(input("What should the maximium number (non-inclusive) of ribonucleotides be? "))

# Running list of chains that meet criteria (no proteins, no carbohydrates, no ligand/ion/water, and less than nt_cutoff)
matching_chains = []

# Grab second column (the 'representative' column) from .csv and grab the PDB id.
# TODO: Use regex instead
for i, row in data.iterrows():
  pdb_id = row[0].split('|')[0] 
  entry = get_pdb_entry(pdb_id) 

  # Checks if RNA and exclude any nonpolymers. If true, fetch all chains and check if it is less than nt_cutoff.
  if contains_rna_only(entry) and not contains_nonpolymer(entry):

    list_of_chain_id = get_list_of_chain_id(entry)

    for chain_id in list_of_chain_id:
      entity = get_pdb_entity(pdb_id, chain_id)
      if get_polymer_length(entity) < nt_cutoff:
        matching_chains.append(pdb_id + '|' + chain_id)

# Print number of RNAs found
print()
print('Found ' + str(len(matching_chains)) + ' molecules:')
print(matching_chains)

Please provide direct URL to desired CSV file found on rna.bgsu.edu: http://rna.bgsu.edu/rna3dhub/nrlist/download/3.179/3.0A/csv
What should the maximium number (non-inclusive) of ribonucleotides be? 400

Found 88 molecules:
['205D|1', '1I9X|1', '4NFQ|1', '5UZ6|1', '5UZ6|2', '5NXT|1', '3SZX|1', '6L0Y|1', '4U35|1', '4U35|2', '2V6W|1', '2V6W|2', '5C5W|1', '438D|1', '353D|1', '353D|2', '4E59|1', '1CSL|1', '1CSL|2', '1YFG|1', '3P22|1', '3P22|2', '5G4T|1', '5G4T|2', '2G92|1', '4RC0|1', '1ZX7|1', '1ZX7|2', '472D|1', '472D|2', '4IQS|1', '1KH6|1', '2V7R|1', '2V7R|2', '1SDR|1', '1SDR|2', '1DQH|1', '1DQH|2', '1QBP|1', '6BGB|1', '1KFO|1', '422D|1', '1RNA|1', '5LQO|1', '2JLT|1', '2JLT|2', '1KD5|1', '2A0P|1', '6E7L|1', '1SA9|1', '280D|1', '3GM7|1', '2VUQ|1', '2VUQ|2', '5V1K|1', '2XSL|1', '2XSL|2', '4E6B|1', '4E6B|2', '4JRT|1', '4JRT|2', '402D|1', '3CJZ|1', '4E6B|1', '4E6B|2', '406D|1', '4P3U|1', '3HGA|1', '4U34|1', '4U34|2', '377D|1', '361D|1', '361D|1', '3P4D|1', '255D|1', '3BNN|1', '6UGG|1', '4RB

**Some helper Functions that is not needed at this time:**

In [None]:
def contains_dna(entry):
    return entry['rcsb_entry_info']['polymer_entity_count_dna'] > 0
def contains_protein(entry):
    return entry['rcsb_entry_info']['polymer_entity_count_protein'] > 0
def contains_carbohydrate(entry):
    return entry['rcsb_entry_info']['branched_entity_count'] > 0

# If entry contains nonpolymer AND a list of bound components is given, display it.
# Notice: Some entries have nonpolymer entities but fail to include a list.
def show_nonpolymer(entry):
    if contains_nonpolymer(entry) and 'nonpolymer_bound_components' in entry['rcsb_entry_info']:
        print(entry['rcsb_entry_info']['nonpolymer_bound_components'])