In [1]:
# Gene sequence: the input string that the following cells analyse.
# Per the recorded outputs below, it consists solely of the four uppercase
# letters A, C, G and T and is 490 characters long.
gene_sequence = "AAACCACATTGGGTTTCTGGTCCTGTACGTGAGGCTCAAGGCCAGATCTTTTTTTTTTTTTTTTTTGAGACAGAGTCTCACTCTGTCGCCAGGCTGGAGTGGAGTGGTGCGATCTCTGCTCACTGCAACCTCCGCCTCCCAGGTTCAAGCGATTCTCCTGCCTCAGCCTCCTGAGTAGCTGGGATTACAGGCACACGCCACCACACCCAGCTAATTTCTGTATTTTCAGTAGAGACGGGATTTCACCATGTTGGCCAGGATGGTCTCAATCTCTTGACCTAGTGATCCGCCCACCTCGGCCTCTGAAAGTGCTGGGATTACAGGCATGAGCCACCGTGCCCAGCCTAGCCTTATTTTTATCACCTCAACTGTTTCACAGCCTGTCATCATATTTTCCCTCAGCTTGTTTCTTTCCAGCCTCAAACATTCTATTCCTTTTGTTTGGCCTTCTGTTCCTTCATCTTTGGAGCTGACTCCATAAAGTAGGACA"


In [2]:
# Step 1: Determine how many distinct characters the sequence contains.
# Collecting the characters into a set keeps exactly one copy of each symbol,
# so the size of that set is the number of unique characters.
unique_characters = {symbol for symbol in gene_sequence}
num_unique_characters = len(unique_characters)
print(f"Number of unique characters: {num_unique_characters}")

Number of unique characters: 4


In [3]:
# Tally each of the four nucleotides (A, T, C, G) with str.count().
# The generator visits the letters in the order A, T, C, G, so the unpacked
# names line up with the letters they count.
count_A, count_T, count_C, count_G = (gene_sequence.count(base) for base in "ATCG")

# Gather the totals into 'sequence_count', keyed by nucleotide letter
# (keys are inserted in the order A, T, C, G).
sequence_count = dict(A=count_A, T=count_T, C=count_C, G=count_G)

# Show the dictionary of counts.
print("Character counts:", sequence_count)


Character counts: {'A': 95, 'T': 154, 'C': 140, 'G': 101}


In [4]:
# Tally how often every character occurs in the sequence.
char_count = {}
for char in gene_sequence:
    # dict.get() supplies 0 the first time a character is encountered.
    char_count[char] = char_count.get(char, 0) + 1

# Express each tally as a fraction of the full sequence length.
# Keys keep the order in which characters were first seen in the sequence.
total_length = len(gene_sequence)
char_proportion = {symbol: tally / total_length for symbol, tally in char_count.items()}

# Show the dictionary of proportions.
print("Character proportions:", char_proportion)


Character proportions: {'A': 0.19387755102040816, 'C': 0.2857142857142857, 'T': 0.3142857142857143, 'G': 0.20612244897959184}


In [5]:
# Step 4: Report each character alongside its proportion.
# The .2f format specifier rounds the proportion to two decimal places.
for char in char_proportion:
    proportion = char_proportion[char]
    print(f"Character: {char}, Proportion: {proportion:.2f}")


Character: A, Proportion: 0.19
Character: C, Proportion: 0.29
Character: T, Proportion: 0.31
Character: G, Proportion: 0.21


In [6]:
# Step 5: Find the character(s) whose proportion is the smallest.
# The minimum is computed once, up front, rather than being re-evaluated for
# every entry inside the comprehension. default=None keeps an empty
# 'char_proportion' from raising ValueError; the result is then an empty list.
min_proportion = min(char_proportion.values(), default=None)

# Every character tied for the minimum is kept, so a tie yields several entries.
# Exact equality is safe here because the minimum is one of the stored values.
min_proportion_chars = [char for char, proportion in char_proportion.items() if proportion == min_proportion]
print(f"Characters with minimum proportions: {min_proportion_chars}")



Characters with minimum proportions: ['A']


In [7]:
# Step 6: Mark every "CCTGT" pattern by rewriting it in lowercase.
# The sequence is cut at each occurrence of "CCTGT" and the pieces are glued
# back together with "cctgt" in between, which swaps every occurrence for its
# lowercase form. 'gene_sequence' itself is left unchanged; the marked copy is
# stored in 'modified_sequence' and then printed.
modified_sequence = "cctgt".join(gene_sequence.split("CCTGT"))
print("Modified sequence:", modified_sequence)


Modified sequence: AAACCACATTGGGTTTCTGGTcctgtACGTGAGGCTCAAGGCCAGATCTTTTTTTTTTTTTTTTTTGAGACAGAGTCTCACTCTGTCGCCAGGCTGGAGTGGAGTGGTGCGATCTCTGCTCACTGCAACCTCCGCCTCCCAGGTTCAAGCGATTCTCCTGCCTCAGCCTCCTGAGTAGCTGGGATTACAGGCACACGCCACCACACCCAGCTAATTTCTGTATTTTCAGTAGAGACGGGATTTCACCATGTTGGCCAGGATGGTCTCAATCTCTTGACCTAGTGATCCGCCCACCTCGGCCTCTGAAAGTGCTGGGATTACAGGCATGAGCCACCGTGCCCAGCCTAGCCTTATTTTTATCACCTCAACTGTTTCACAGcctgtCATCATATTTTCCCTCAGCTTGTTTCTTTCCAGCCTCAAACATTCTATTCCTTTTGTTTGGCCTTCTGTTCCTTCATCTTTGGAGCTGACTCCATAAAGTAGGACA


In [8]:
# Step 7: Report where each "CCTGT" pattern starts in 'gene_sequence'.
import re

# re.finditer() yields one match object per non-overlapping occurrence,
# scanning left to right; match.start() is the 0-based index of the first
# character of that occurrence.
pattern_positions = []
for match in re.finditer("CCTGT", gene_sequence):
    pattern_positions.append(match.start())

# Show the collected start positions.
print("Positions of 'CCTGT':", pattern_positions)


Positions of 'CCTGT': [21, 379]


In [9]:
# Step 8: Drop 10 characters from each end of 'modified_sequence'.
# The trim is done in two slices; the result is the same as slicing [10:-10].
head_trimmed = modified_sequence[10:]   # discard the first 10 characters
trimmed_sequence = head_trimmed[:-10]   # discard the last 10 characters
print("Trimmed sequence:", trimmed_sequence)  # Show the trimmed sequence.


Trimmed sequence: GGGTTTCTGGTcctgtACGTGAGGCTCAAGGCCAGATCTTTTTTTTTTTTTTTTTTGAGACAGAGTCTCACTCTGTCGCCAGGCTGGAGTGGAGTGGTGCGATCTCTGCTCACTGCAACCTCCGCCTCCCAGGTTCAAGCGATTCTCCTGCCTCAGCCTCCTGAGTAGCTGGGATTACAGGCACACGCCACCACACCCAGCTAATTTCTGTATTTTCAGTAGAGACGGGATTTCACCATGTTGGCCAGGATGGTCTCAATCTCTTGACCTAGTGATCCGCCCACCTCGGCCTCTGAAAGTGCTGGGATTACAGGCATGAGCCACCGTGCCCAGCCTAGCCTTATTTTTATCACCTCAACTGTTTCACAGcctgtCATCATATTTTCCCTCAGCTTGTTTCTTTCCAGCCTCAAACATTCTATTCCTTTTGTTTGGCCTTCTGTTCCTTCATCTTTGGAGCTGACTCCATA
