In [2]:
pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


# Assignment 1


---


**Part 1:**

The first part of the assignment is to download the sequences of the 16 chromosomes of '**Saccharomyces cerevisiae**'. Each sequence is written in FASTA format to a file named '**chromosomeN.fasta**' (where N is the chromosome number). The sequences are also stored in a dictionary called **chromosome_sequences**.


**Part 2:**

The second part of the assignment is to find the origins of replication (ORIs) in the organism. ORIs are associated with a feature called the **Autonomously Replicating Sequence** (ARS). Using several forms of the ARS consensus sequence (ACS) referenced from multiple research papers, I was able to find the indices where an ARS is most likely to occur. From my research, an ORI is likely to occur at the start of the ARS.

In [3]:
from Bio import Entrez
# Tell NCBI who is making the requests; Biopython attaches this address to
# every E-utilities call issued through the Entrez module.
Entrez.email = "noel22338@iiitd.ac.in"

# Issue an EInfo request (no arguments) and parse the reply into `record`.
# `record` is not used by the other cells visible in this notebook.
handle = Entrez.einfo()
try:
    record = Entrez.read(handle)
finally:
    # Always release the network handle, even if parsing fails.
    handle.close()


In [4]:
def download_fasta(email, accession_number, output_file):
    """Fetch one record from the NCBI nucleotide database and save it as FASTA text.

    Args:
        email: Contact address assigned to ``Entrez.email`` before the request.
        accession_number: Identifier passed as ``id`` to ``Entrez.efetch``.
        output_file: Path of the file to write; an existing file is overwritten.

    Returns:
        None. The FASTA text returned by the server is written to ``output_file``.
    """
    Entrez.email = email
    handle = Entrez.efetch(db='nucleotide', id=accession_number, rettype='fasta', retmode='text')
    try:
        sequence = handle.read()
    finally:
        # Always release the network handle, even if the read fails.
        handle.close()
    with open(output_file, 'w') as f:
        f.write(sequence)

# The above function fetches the record for the given accession number from NCBI and writes its FASTA text to the output file.

In [5]:
# Q1:

# The dictionary below maps each of the 16 chromosomes of 'Saccharomyces cerevisiae' to its accession number.
# The dictionary is iterated and the sequence of each chromosome is written to a file named (key).fasta.
# Example: for the key "chromosome1" the output file is "chromosome1.fasta".

# Contact address handed to download_fasta for every request (assigned once).
email = "noel22338@iiitd.ac.in"

accession_numbers = {
    'chromosome1': 'NC_001133.9',
    'chromosome2': 'NC_001134.8',
    'chromosome3': 'NC_001135.5',
    'chromosome4': 'NC_001136.10',
    'chromosome5': 'NC_001137.3',
    'chromosome6': 'NC_001138.5',
    'chromosome7': 'NC_001139.9',
    'chromosome8': 'NC_001140.6',
    'chromosome9': 'NC_001141.2',
    'chromosome10': 'NC_001142.9',
    'chromosome11': 'NC_001143.9',
    'chromosome12': 'NC_001144.5',
    'chromosome13': 'NC_001145.3',
    'chromosome14': 'NC_001146.8',
    'chromosome15': 'NC_001147.6',
    'chromosome16': 'NC_001148.4',
}

# One network download per chromosome; each call overwrites "<key>.fasta".
for chromosome, accession_number in accession_numbers.items():
    output_file = f"{chromosome}.fasta"
    download_fasta(email, accession_number, output_file)


In [6]:
# For each downloaded FASTA file, print the record id, the sequence length and the GC percentage,
# and build the dictionary that maps each chromosome name to its sequence.

from Bio import SeqIO
chromosome_sequences = {}
for i in range(1, 17):
    # SeqIO.parse already yields records one at a time; wrapping it in list() is unnecessary.
    for seq_record in SeqIO.parse(f"chromosome{i}.fasta", "fasta"):
        sequence = seq_record.seq
        length = len(sequence)
        print(seq_record.id)
        print(length)
        # If a file held several records, the last one would win for this key.
        chromosome_sequences[f"chromosome{i}"] = sequence

        # Count on the sequence itself and accept both cases, so soft-masked
        # (lower-case) bases are not silently left out of the GC content.
        g = sequence.count("G") + sequence.count("g")
        c = sequence.count("C") + sequence.count("c")

        # GC percentage (used here as an indicator of the stability of the molecule).
        # An empty record reports 0.0 instead of raising ZeroDivisionError.
        per = (g + c) / length * 100 if length else 0.0
        print(per)
        print()



NC_001133.9
230218
39.27017001277051

NC_001134.8
813184
38.34101507161971

NC_001135.5
316620
38.53231002463521

NC_001136.10
1531933
37.90642280047496

NC_001137.3
576874
38.50736902685855

NC_001138.5
270161
38.728758036874304

NC_001139.9
1090940
38.06139659376318

NC_001140.6
562643
38.49510257836675

NC_001141.2
439888
38.902175099116135

NC_001142.9
745751
38.37326399830506

NC_001143.9
666816
38.069422449371345

NC_001144.5
1078177
38.47633551819414

NC_001145.3
924431
38.20360848997924

NC_001146.8
784333
38.6379254729815

NC_001147.6
1091291
38.16021574447145

NC_001148.4
948066
38.0644385517464



The research paper describes two functional domains within ARS1, namely Domain A and Domain B.

Domain A constitutes the 11 bp sequence whose consensus **(A/T)TTTAT(A/G)TTT(A/T)** is referred to as ACS.

When additional ACSs were identified, a 17 bp extended ACS (EACS) was defined **(A/T)(A/T)(A/T)-{ACS}-(G/T)(A/T)(A/T)** (Heichinger et al., 2006).

The above references were made from this
[research paper](https://www.sciencedirect.com/science/article/pii/S0923250812000435#bib72)


In [7]:
# Each motif is a list with one entry per position; the entry is the list of
# bases accepted at that position. The compact strings below spell out the
# allowed bases per position, separated by spaces.

# 11 bp ACS consensus: (A/T)TTTAT(A/G)TTT(A/T)
ACS_sequence = [list(allowed) for allowed in "AT T T T A T AG T T T AT".split()]

# 17 bp extended ACS: (A/T)(A/T)(A/T)-{ACS}-(G/T)(A/T)(A/T)
EACS_sequence = [list(allowed) for allowed in "AT AT AT AT T T T A T AG T T T AT GT AT AT".split()]

# Sanity check: report the number of positions in each motif.
print(f"""
ACS_Sequence length : {len(ACS_sequence)}
EACS_Sequence length : {len(EACS_sequence)}
""")



ACS_Sequence length : 11
EACS_Sequence length : 17



In [8]:
# Quick inspection of the parsed data: prints the dictionary mapping each
# chromosome name to its Seq object (the Seq repr abbreviates long sequences).
print(chromosome_sequences)

{'chromosome1': Seq('CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACA...GGG'), 'chromosome2': Seq('AAATAGCCCTCATGTACGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTT...TGT'), 'chromosome3': Seq('CCCACACACCACACCCACACCACACCCACACACCACACACACCACACCCACACA...GTG'), 'chromosome4': Seq('ACACCACACCCACACCACACCCACACACACCACACCCACACACCACACCCACAC...TGG'), 'chromosome5': Seq('CGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTTCAACCAAAAGCTACTT...TTT'), 'chromosome6': Seq('GATCTCGCAAGTGCATTCCTAGACTTAATTCATATCTGCTCCTCAACTGTCGAT...TGG'), 'chromosome7': Seq('CCACACCCACACACACCACACCCACACCCACACACTACCCTAACACTACCCTAA...TTT'), 'chromosome8': Seq('CCCACACACACCACACCCACACACCACACCCACACTTTTCACATCTACCTCTAC...TGG'), 'chromosome9': Seq('CACACACACCACACCCACACCACACCACACCACACCCACACCCACACACACCAC...TGT'), 'chromosome10': Seq('CCCACACACACACCACACCCACACCCACACACACCACACCCACACACCACACCC...TGT'), 'chromosome11': Seq('CACCACACCCACACACCACACCCACACACACACCACACCCACACACCACACCCA...TGT'), 'chromosome12': Seq('CACACACACACACCACCCACACACCACACCACACACCACACCACACACCACA

In [9]:
# Scan every chromosome for windows that match the 11 bp ACS motif.
# ACS_indices maps chromosome name -> 0-based start positions of every match
# (overlapping matches included); ACS_count maps chromosome name -> number of matches.
# Only the sequence as stored is scanned, not its reverse complement.
ACS_count = {}
ACS_indices = {}
motif_length = len(ACS_sequence)  # derived from the motif instead of hard-coding 11
for chromosome, sequence in chromosome_sequences.items():
    # Index a plain str rather than the Seq object for every base comparison.
    bases = str(sequence)
    indices = [
        start
        for start in range(len(bases) - motif_length + 1)
        # all() stops at the first mismatching position instead of checking the whole window.
        if all(bases[start + offset] in allowed for offset, allowed in enumerate(ACS_sequence))
    ]
    ACS_count[chromosome] = len(indices)
    ACS_indices[chromosome] = indices




In [10]:
# Report the ACS hits: per-chromosome counts, the genome-wide total, and the
# match start positions, one item per output line.
print(ACS_count, sum(ACS_count.values()), ACS_indices, sep="\n")

{'chromosome1': 6, 'chromosome2': 17, 'chromosome3': 8, 'chromosome4': 38, 'chromosome5': 19, 'chromosome6': 3, 'chromosome7': 22, 'chromosome8': 11, 'chromosome9': 16, 'chromosome10': 18, 'chromosome11': 18, 'chromosome12': 23, 'chromosome13': 23, 'chromosome14': 17, 'chromosome15': 20, 'chromosome16': 21}
280
{'chromosome1': [17149, 159953, 176236, 176522, 208605, 229450], 'chromosome2': [80, 122598, 195767, 238293, 326080, 368745, 381151, 420235, 543395, 568821, 603190, 622760, 632052, 665038, 755032, 792466, 812416], 'chromosome3': [11256, 14700, 74520, 78863, 201845, 224863, 231261, 315820], 'chromosome4': [50459, 67634, 77223, 104908, 111128, 210566, 232057, 233925, 263124, 340870, 347217, 405175, 420761, 427871, 443872, 477645, 480280, 521602, 521761, 561437, 609151, 677939, 709270, 807779, 913867, 1057898, 1070495, 1182775, 1240933, 1272225, 1398457, 1404336, 1422530, 1445629, 1447485, 1462061, 1462567, 1524662], 'chromosome5': [64, 7976, 16057, 49778, 59536, 99492, 105316, 109

In [11]:
# Scan every chromosome for windows that match the 17 bp extended ACS (EACS) motif.
# EACS_indices maps chromosome name -> 0-based start positions of every match
# (overlapping matches included); EACS_count maps chromosome name -> number of matches.
# Only the sequence as stored is scanned, not its reverse complement.
EACS_count = {}
EACS_indices = {}
motif_length = len(EACS_sequence)  # derived from the motif instead of hard-coding 17
for chromosome, sequence in chromosome_sequences.items():
    # Index a plain str rather than the Seq object for every base comparison.
    bases = str(sequence)
    indices = [
        start
        for start in range(len(bases) - motif_length + 1)
        # all() stops at the first mismatching position instead of checking the whole window.
        if all(bases[start + offset] in allowed for offset, allowed in enumerate(EACS_sequence))
    ]
    EACS_count[chromosome] = len(indices)
    EACS_indices[chromosome] = indices

In [12]:
# Report the EACS hits: per-chromosome counts, the genome-wide total, and the
# match start positions, one item per output line.
print(EACS_count, sum(EACS_count.values()), EACS_indices, sep="\n")

{'chromosome1': 1, 'chromosome2': 2, 'chromosome3': 1, 'chromosome4': 5, 'chromosome5': 1, 'chromosome6': 0, 'chromosome7': 3, 'chromosome8': 2, 'chromosome9': 3, 'chromosome10': 1, 'chromosome11': 2, 'chromosome12': 0, 'chromosome13': 4, 'chromosome14': 3, 'chromosome15': 1, 'chromosome16': 2}
31
{'chromosome1': [176519], 'chromosome2': [122595, 632049], 'chromosome3': [11253], 'chromosome4': [50456, 347214, 677936, 913864, 1447482], 'chromosome5': [287566], 'chromosome6': [], 'chromosome7': [43204, 203970, 728053], 'chromosome8': [147805, 479732], 'chromosome9': [214734, 355638, 357224], 'chromosome10': [445358], 'chromosome11': [98349, 458669], 'chromosome12': [], 'chromosome13': [137320, 226912, 284806, 878719], 'chromosome14': [479443, 609518, 609535], 'chromosome15': [1004264], 'chromosome16': [116592, 458606]}


WWWWTTTAYRTTTWGTT, where W = A or T, Y = C or T, and R = A or G

The 17 bp version of the ACS shown above is taken from the research paper linked below.
ARSs contain the 17 bp ARS consensus sequence (ACS), which binds the origin recognition complex.

The above sequence was referenced from this
[research paper](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2004-5-4-r22#:~:text=Autonomously%20replicating%20sequences%20(ARSs)%20function,binds%20the%20origin%20recognition%20complex.)

In [13]:
# 17 bp ACS: WWWWTTTAYRTTTWGTT, where W = A or T, Y = C or T, and R = A or G.
#
# This 17 bp form of the ACS is taken from the research paper linked below, which
# states that ARSs contain the 17 bp ARS consensus sequence (ACS), which binds the
# origin recognition complex.
#
# https://genomebiology.biomedcentral.com/articles/10.1186/gb-2004-5-4-r22#:~:text=Autonomously%20replicating%20sequences%20(ARSs)%20function,binds%20the%20origin%20recognition%20complex.

ACS_17_initial = 'WWWWTTTAYRTTTWGTT'

# Ambiguity codes used by the motif; any other symbol stands for itself.
ambiguity_codes = {'W': ['A', 'T'], 'Y': ['C', 'T'], 'R': ['A', 'G']}

# Expand the motif into one list of allowed bases per position. list(...) gives
# every position its own list object, for coded and literal symbols alike.
ACS_17_sequence = [list(ambiguity_codes.get(symbol, symbol)) for symbol in ACS_17_initial]

# Scan every chromosome for windows that match the 17 bp ACS motif (ACS_17_sequence).
# ACS_17_indices maps chromosome name -> 0-based start positions of every match
# (overlapping matches included); ACS_17_count maps chromosome name -> number of matches.
# Only the sequence as stored is scanned, not its reverse complement.
ACS_17_count = {}
ACS_17_indices = {}
motif_length = len(ACS_17_sequence)  # derived from the motif instead of hard-coding 17
for chromosome, sequence in chromosome_sequences.items():
    # Index a plain str rather than the Seq object for every base comparison.
    bases = str(sequence)
    indices = [
        start
        for start in range(len(bases) - motif_length + 1)
        # all() stops at the first mismatching position instead of checking the whole window.
        if all(bases[start + offset] in allowed for offset, allowed in enumerate(ACS_17_sequence))
    ]
    ACS_17_count[chromosome] = len(indices)
    ACS_17_indices[chromosome] = indices


In [15]:
# Report the 17 bp ACS hits: per-chromosome counts, the genome-wide total, and
# the match start positions, one item per output line.
print(ACS_17_count, sum(ACS_17_count.values()), ACS_17_indices, sep="\n")

{'chromosome1': 0, 'chromosome2': 0, 'chromosome3': 0, 'chromosome4': 2, 'chromosome5': 1, 'chromosome6': 0, 'chromosome7': 0, 'chromosome8': 0, 'chromosome9': 1, 'chromosome10': 0, 'chromosome11': 1, 'chromosome12': 0, 'chromosome13': 1, 'chromosome14': 0, 'chromosome15': 0, 'chromosome16': 1}
7
{'chromosome1': [], 'chromosome2': [], 'chromosome3': [], 'chromosome4': [913864, 1447482], 'chromosome5': [287566], 'chromosome6': [], 'chromosome7': [], 'chromosome8': [], 'chromosome9': [357224], 'chromosome10': [], 'chromosome11': [581895], 'chromosome12': [], 'chromosome13': [137320], 'chromosome14': [], 'chromosome15': [], 'chromosome16': [116592]}
