In [22]:
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp as mt

# Read the primer workbook into `df`.
# `file_path` is joined to `file` by plain string concatenation, so it must end
# with a path separator. Update it to the folder that holds your Excel file.
file = 'cleanned_nodupes_v1.xlsx'
file_path = 'C:/Users/Nelso/OneDrive/Documents/Thesis/data/'
df = pd.read_excel(f"{file_path}{file}", header=0)  # header=0: column names are taken from the first row

# Function to replace ambiguous bases for Tm min calculation (choose weaker bonds)
def replace_ambiguous_min(sequence):
    """Upper-case ``sequence`` and resolve each IUPAC ambiguity code to one base.

    Every code is swapped for a single fixed base (A or T where the code
    allows it, otherwise C) so the result can feed a lower-bound Tm estimate.
    Characters without an entry in the table -- including plain A/C/G/T and
    inosine ``I`` -- are returned unchanged.

    Parameters
    ----------
    sequence : str
        Primer sequence, any letter case.

    Returns
    -------
    str
        Upper-case sequence of the same length with ambiguity codes resolved.
    """
    weakest_base = {
        'N': 'A', 'R': 'A', 'Y': 'T', 'S': 'C', 'W': 'A', 'K': 'T',
        'M': 'A', 'B': 'T', 'D': 'A', 'H': 'A', 'V': 'A',
    }
    return ''.join(weakest_base.get(base, base) for base in sequence.upper())

# Function to replace ambiguous bases for Tm max calculation (choose stronger bonds)
def replace_ambiguous_max(sequence):
    """Upper-case ``sequence`` and resolve each IUPAC ambiguity code to one base.

    Every code is swapped for G or C whenever the code permits one of them, so
    the result can feed an upper-bound Tm estimate. ``W`` stands for A or T
    only, so it is resolved to ``A``; mapping it to G (as an earlier version
    did) would describe a base the primer can never contain and would inflate
    the estimate. Characters without an entry in the table -- including plain
    A/C/G/T and inosine ``I`` -- are returned unchanged.

    Parameters
    ----------
    sequence : str
        Primer sequence, any letter case.

    Returns
    -------
    str
        Upper-case sequence of the same length with ambiguity codes resolved.
    """
    strongest_base = {
        'N': 'G', 'R': 'G', 'Y': 'C', 'S': 'G',
        'W': 'A',  # W = A/T: no G/C option exists for this code
        'K': 'G', 'M': 'C', 'B': 'G', 'D': 'G', 'H': 'C', 'V': 'G',
    }
    # A single translate pass replaces the chain of str.replace calls.
    return sequence.upper().translate(str.maketrans(strongest_base))

# Adjusted function to calculate minimum melting temperature
def calculate_tm_min(sequence):
    """Return the Wallace-rule Tm of ``sequence`` after weakest-base resolution.

    Ambiguity codes are first resolved with ``replace_ambiguous_min``; the
    resulting sequence is passed to ``mt.Tm_Wallace`` and the value is rounded
    to two decimals.

    Parameters
    ----------
    sequence : str or null
        Primer sequence; a missing cell (``None``/NaN) is accepted.

    Returns
    -------
    float or None
        Rounded melting temperature, or ``None`` when ``sequence`` is null.
    """
    if not pd.isnull(sequence):
        resolved = replace_ambiguous_min(sequence)
        wallace_tm = mt.Tm_Wallace(Seq(resolved))
        return round(wallace_tm, 2)
    return None

# Adjusted function to calculate maximum melting temperature
def calculate_tm_max(sequence):
    """Return the nearest-neighbor Tm of ``sequence`` after strongest-base resolution.

    Ambiguity codes are first resolved with ``replace_ambiguous_max``; the
    resulting sequence is passed to ``mt.Tm_NN`` and the value is rounded to
    two decimals. If ``Tm_NN`` raises ``ValueError`` (for example on adjacent
    inosines, for which it reports no thermodynamic data), the error is
    printed together with the original sequence and ``None`` is returned.

    Parameters
    ----------
    sequence : str or null
        Primer sequence; a missing cell (``None``/NaN) is accepted.

    Returns
    -------
    float or None
        Rounded melting temperature, or ``None`` when ``sequence`` is null or
        the calculation failed.
    """
    if pd.isnull(sequence):
        return None
    resolved = Seq(replace_ambiguous_max(sequence))
    try:
        nn_tm = mt.Tm_NN(resolved)
    except ValueError as e:
        print(f"Error processing sequence {sequence}: {e}")
        return None
    return round(nn_tm, 2)

# Add one Tm column per estimator; each value is computed from the row's 'Sequence' cell.
for tm_column, tm_function in (
    ('Tm min (°C)', calculate_tm_min),
    ('Tm max (°C)', calculate_tm_max),
):
    df[tm_column] = df['Sequence'].apply(tm_function)

# Write the augmented table to the input folder, prefixing the file name with 'up_'.
output_file_path = file_path
output_file = 'up_' + file
df.to_excel(output_file_path + output_file, index=False)

print(f"Updated Excel file saved as '{output_file_path + output_file}'")


Error processing sequence CCCATCTCITCAIIATCCCTGCTGTTGG: no thermodynamic data for neighbors 'II/II' available
Error processing sequence GAYYTIGGITGYGGIIGIGGIRGITGG: no thermodynamic data for neighbors 'II/II' available
Updated Excel file saved as 'C:/Users/Nelso/OneDrive/Documents/Thesis/data/up_cleanned_nodupes_v1.xlsx'


In [5]:
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp as mt
from itertools import product

# Read the primer workbook into `df`, printing a status line before and after.
# `file_path` is concatenated directly with `file`, so it must end with a separator.
file = 'cleanned_nodupes_v1.xlsx'
file_path = 'C:/Users/Nelso/OneDrive/Documents/Thesis/data/'
print("Loading Excel file...")
df = pd.read_excel(f"{file_path}{file}", header=0)
print("Excel file loaded.")

# Ambiguous base mapping to possible replacements.
# Each key is an ambiguity code; its value lists every concrete base that the
# code is expanded to when sequence variants are enumerated.
ambiguous_base_map = {
    code: list(bases)
    for code, bases in (
        ('N', 'ATGC'),
        ('R', 'AG'),
        ('Y', 'CT'),
        ('S', 'GC'),
        ('W', 'AT'),
        ('K', 'GT'),
        ('M', 'AC'),
        ('B', 'CGT'),
        ('D', 'AGT'),
        ('H', 'ACT'),
        ('V', 'ACG'),
        ('I', 'A'),  # Handling 'I' by replacing it with 'A'; adjust as needed
    )
}

def generate_sequences(sequence):
    """List every concrete sequence encoded by a degenerate primer.

    The input is upper-cased and each ``I`` is replaced with ``A`` (adjust as
    needed). Every remaining character is then looked up in
    ``ambiguous_base_map``; characters without an entry stand for themselves.
    The result is the Cartesian product of the per-position choices, so its
    length is the product of the number of choices at each position and the
    whole list is held in memory at once.

    Parameters
    ----------
    sequence : str
        Primer sequence, any letter case.

    Returns
    -------
    list of str
        All expanded variants, in ``itertools.product`` order.
    """
    normalized = sequence.upper().replace('I', 'A')
    choices_per_position = []
    for base in normalized:
        choices_per_position.append(ambiguous_base_map.get(base, [base]))
    return list(map(''.join, product(*choices_per_position)))

def calculate_tm(sequence, method):
    """Compute one melting-temperature estimate for a fully resolved sequence.

    Parameters
    ----------
    sequence : str
        Sequence to evaluate.
    method : str
        ``'min'`` uses ``mt.Tm_Wallace``; ``'max'`` uses ``mt.Tm_NN``.

    Returns
    -------
    float or None
        The estimate rounded to two decimals. For ``'max'``, ``None`` is
        returned (after printing the error) when ``Tm_NN`` raises
        ``ValueError``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'min'`` nor ``'max'``. Previously such a
        call fell through and returned ``None``, which was indistinguishable
        from a failed ``Tm_NN`` calculation.
    """
    # Validate first so a typo in `method` fails loudly before any work is done.
    if method not in ('min', 'max'):
        raise ValueError(f"Unknown method {method!r}; expected 'min' or 'max'")

    seq = Seq(sequence)
    if method == 'min':
        return round(mt.Tm_Wallace(seq), 2)

    try:
        return round(mt.Tm_NN(seq), 2)
    except ValueError as e:
        print(f"Error processing sequence {sequence}: {e}")
        return None

# Function to save DataFrame to CSV, splitting into multiple files if necessary
def save_to_csv(df, base_file_path, max_rows=1048576):
    """Write ``df`` to one or more CSV files of at most ``max_rows`` data rows.

    Files are named ``{base_file_path}_part_{k}.csv`` with ``k`` starting at 1,
    and are written without the index column. A progress line is printed
    before and after each part. An empty frame still produces a single
    header-only ``_part_1.csv`` so the caller always ends up with an output
    file.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write.
    base_file_path : str
        Path prefix (directory plus file stem, no extension).
    max_rows : int, default 1048576
        Maximum number of data rows per file; the header line is extra.
        NOTE(review): the default equals the Excel worksheet row limit. If
        the parts are meant to be opened in Excel, a full part plus its header
        is one line over that limit -- confirm the intended consumer.

    Raises
    ------
    ValueError
        If ``max_rows`` is smaller than 1.
    """
    if max_rows < 1:
        raise ValueError(f"max_rows must be a positive integer, got {max_rows!r}")

    num_rows = len(df)
    # Ceiling division; max(1, ...) guarantees one part even for an empty frame.
    chunks = max(1, -(-num_rows // max_rows))

    for i in range(chunks):
        print(f"Saving chunk {i + 1} of {chunks}...")
        chunk_start = i * max_rows
        # iloc slicing clips at the end of the frame, so no explicit min() is needed.
        chunk = df.iloc[chunk_start:chunk_start + max_rows]
        chunk_file = f"{base_file_path}_part_{i + 1}.csv"
        chunk.to_csv(chunk_file, index=False)
        print(f"Chunk {i + 1} saved as '{chunk_file}'")

# Records for the expanded table: one dict per concrete sequence variant,
# carrying the original degenerate sequence and both Tm estimates.
expanded_df_rows = []

print("Processing sequences...")
for index, original in df['Sequence'].items():
    if not pd.isnull(original):
        for variant in generate_sequences(original):
            expanded_df_rows.append({
                'Original Sequence': original,
                'Sequence Variant': variant,
                'Tm min (°C)': calculate_tm(variant, 'min'),
                'Tm max (°C)': calculate_tm(variant, 'max'),
            })
        # Progress tracker: keyed on the row's index label, and only reached
        # for rows that actually had a sequence.
        if (index + 1) % 100 == 0:
            print(f"Processed {index + 1} sequences...")

# Build the DataFrame once from the collected records.
expanded_df = pd.DataFrame(expanded_df_rows)
print("Sequence processing complete.")

# Write the result as CSV parts; save_to_csv splits large tables across files.
print("Saving to CSV files...")
save_to_csv(expanded_df, file_path + 'up_' + file[:-len('.xlsx')])  # strip '.xlsx' to form the base path
print("All data saved to CSV.")


Loading Excel file...
Excel file loaded.
Processing sequences...
Processed 100 sequences...
Processed 200 sequences...
Processed 300 sequences...
Processed 400 sequences...
Processed 500 sequences...
Processed 600 sequences...
Processed 700 sequences...
Processed 800 sequences...
Processed 900 sequences...
Processed 1000 sequences...
Processed 1100 sequences...
Processed 1200 sequences...
Sequence processing complete.
Saving to CSV files...
Saving chunk 1 of 9...
Chunk 1 saved as 'C:/Users/Nelso/OneDrive/Documents/Thesis/data/up_cleanned_nodupes_v1_part_1.csv'
Saving chunk 2 of 9...
Chunk 2 saved as 'C:/Users/Nelso/OneDrive/Documents/Thesis/data/up_cleanned_nodupes_v1_part_2.csv'
Saving chunk 3 of 9...
Chunk 3 saved as 'C:/Users/Nelso/OneDrive/Documents/Thesis/data/up_cleanned_nodupes_v1_part_3.csv'
Saving chunk 4 of 9...
Chunk 4 saved as 'C:/Users/Nelso/OneDrive/Documents/Thesis/data/up_cleanned_nodupes_v1_part_4.csv'
Saving chunk 5 of 9...
Chunk 5 saved as 'C:/Users/Nelso/OneDrive/Do

In [9]:
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp as mt
from itertools import product

# Read the primer workbook into `df`.
# `file_path` is concatenated directly with `file`, so it must end with a separator.
file = 'cleanned_nodupes_v1.xlsx'
file_path = 'C:/Users/Nelso/OneDrive/Documents/Thesis/data/'
df = pd.read_excel(f"{file_path}{file}", header=0)

# Ambiguous base mappings.
# Each key is an ambiguity code; its value lists every concrete base that the
# code is expanded to. Inosine ('I') has no entry here.
ambiguous_bases = dict(
    N=list('ATGC'),
    R=list('AG'),
    Y=list('CT'),
    S=list('GC'),
    W=list('AT'),
    K=list('GT'),
    M=list('AC'),
    B=list('CGT'),
    D=list('AGT'),
    H=list('ACT'),
    V=list('ACG'),
)

def generate_sequences(sequence):
    """List every concrete sequence encoded by a degenerate primer.

    The input is upper-cased and each character is looked up in
    ``ambiguous_bases``; characters without an entry (plain bases, and also
    inosine ``I``) stand for themselves. The result is the Cartesian product
    of the per-position choices, materialized as a list.

    Parameters
    ----------
    sequence : str
        Primer sequence, any letter case.

    Returns
    -------
    list of str
        All expanded variants, in ``itertools.product`` order.
    """
    per_position = []
    for nuc in sequence.upper():
        per_position.append(ambiguous_bases.get(nuc, [nuc]))
    return list(map(''.join, product(*per_position)))

def calculate_tm_avg(sequence, tm_type):
    """Average a Tm estimate over every expansion of a degenerate sequence.

    The sequence is expanded with ``generate_sequences``; each variant is
    evaluated with ``mt.Tm_Wallace`` (``tm_type='min'``) or ``mt.Tm_NN``
    (``tm_type='max'``). Variants whose calculation raises ``ValueError`` are
    reported on stdout and left out of the average.

    Parameters
    ----------
    sequence : str or null
        Primer sequence; a missing cell (``None``/NaN) is accepted.
    tm_type : str
        ``'min'`` or ``'max'``.

    Returns
    -------
    float or None
        Mean of the successful estimates rounded to two decimals, or ``None``
        when ``sequence`` is null or no variant could be evaluated.

    Raises
    ------
    ValueError
        If ``tm_type`` is neither ``'min'`` nor ``'max'``. Previously such a
        call enumerated every variant and then returned ``None``.
    """
    if tm_type not in ('min', 'max'):
        raise ValueError(f"Unknown tm_type {tm_type!r}; expected 'min' or 'max'")
    # Empty spreadsheet cells arrive as NaN; without this guard they would
    # reach `.upper()` inside generate_sequences and raise AttributeError.
    if pd.isnull(sequence):
        return None

    # Choose the estimator once instead of re-testing tm_type for every variant.
    tm_func = mt.Tm_Wallace if tm_type == 'min' else mt.Tm_NN

    tm_values = []
    for seq in generate_sequences(sequence):
        try:
            tm_values.append(tm_func(Seq(seq)))
        except ValueError as e:
            # Skip sequences that cause errors
            print(f"Skipping sequence {seq} due to error: {e}")

    if tm_values:  # Avoid dividing by zero when every variant failed
        return round(sum(tm_values) / len(tm_values), 2)
    return None


# Add the per-sequence average of each estimate as columns 'Tm_min' and 'Tm_max'.
for tm_type in ('min', 'max'):
    # Bind tm_type as a default so each lambda keeps its own value.
    df[f'Tm_{tm_type}'] = df['Sequence'].apply(lambda x, tm_type=tm_type: calculate_tm_avg(x, tm_type))

# Write the augmented table as CSV into the input folder.
output_file_path = file_path
output_file = 'primer_metadata.csv'
df.to_csv(output_file_path + output_file, index=False)

print(f"Updated CSV file saved as '{output_file_path + output_file}'")

Skipping sequence CCCATCTCITCAIIATCCCTGCTGTTGG due to error: no thermodynamic data for neighbors 'II/II' available
Skipping sequence GACCTIGGITGCGGIIGIGGIAGITGG due to error: no thermodynamic data for neighbors 'II/II' available
Skipping sequence GACCTIGGITGCGGIIGIGGIGGITGG due to error: no thermodynamic data for neighbors 'II/II' available
Skipping sequence GACCTIGGITGTGGIIGIGGIAGITGG due to error: no thermodynamic data for neighbors 'II/II' available
Skipping sequence GACCTIGGITGTGGIIGIGGIGGITGG due to error: no thermodynamic data for neighbors 'II/II' available
Skipping sequence GACTTIGGITGCGGIIGIGGIAGITGG due to error: no thermodynamic data for neighbors 'II/II' available
Skipping sequence GACTTIGGITGCGGIIGIGGIGGITGG due to error: no thermodynamic data for neighbors 'II/II' available
Skipping sequence GACTTIGGITGTGGIIGIGGIAGITGG due to error: no thermodynamic data for neighbors 'II/II' available
Skipping sequence GACTTIGGITGTGGIIGIGGIGGITGG due to error: no thermodynamic data for n