In [43]:
import pandas as pd
from collections import Counter

# Load the synthetic DNA dataset (path is relative to the working directory)
# into `df`; later cells read its 'Sequence' column to derive new features.
df = pd.read_csv('synthetic_dna_dataset.csv')

def calculate_kmer_freq(sequence, k=3, top_n=16):
    """Return the share of k-mer occurrences covered by the most common k-mers.

    Every overlapping substring of length ``k`` is counted, the counts of the
    ``top_n`` most frequent distinct k-mers are summed, and that sum is divided
    by the total number of k-mers in the sequence.

    Args:
        sequence: String to scan.
        k: Length of each k-mer; must be at least 1.
        top_n: How many of the most frequent distinct k-mers contribute
            their counts to the numerator.

    Returns:
        The ratio rounded to 3 decimal places, or 0.0 when the sequence is
        shorter than ``k`` and therefore contains no complete k-mer.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    total_kmers = len(sequence) - k + 1
    if total_kmers <= 0:
        # No complete k-mer fits in the sequence; returning 0.0 avoids the
        # division by zero the ratio below would otherwise hit.
        return 0.0

    # Count straight from a generator instead of materialising a list first.
    kmer_counts = Counter(sequence[i:i + k] for i in range(total_kmers))
    top_sum = sum(count for _, count in kmer_counts.most_common(top_n))

    return round(top_sum / total_kmers, 3)

# Score every sequence with the 3-mer ratio; Series.apply forwards the
# keyword argument k=3 to calculate_kmer_freq for each element.
df['calculated_kmer3_freq'] = df['Sequence'].apply(calculate_kmer_freq, k=3)

# Optional sanity check against the existing 'kmer_3_freq' column
# (left disabled; uncomment to print a per-sample comparison).
# print("Sample_ID | Original | Calculated | Difference")
# print("-" * 50)
# for _, row in df.iterrows():
#     diff = abs(row['kmer_3_freq'] - row['calculated_kmer3_freq'])
#     print(f"{row['Sample_ID']:9} | {row['kmer_3_freq']:.3f}     | {row['calculated_kmer3_freq']:.3f}      | {diff:.3f}")

In [44]:
# 4-mer variant of the feature. This cell reuses `df` and the
# calculate_kmer_freq helper defined in an earlier cell, called with k=4.
df['calculated_kmer4_freq'] = df['Sequence'].apply(calculate_kmer_freq, k=4)

# Optional sanity check (left disabled; uncomment to print a comparison).
# NOTE(review): this compares the 4-mer result against 'kmer_3_freq' --
# possibly a copy-paste leftover; confirm which reference column is intended.
# print("Sample_ID | Original | Calculated | Difference")
# print("-" * 50)
# for _, row in df.iterrows():
#     diff = abs(row['kmer_3_freq'] - row['calculated_kmer4_freq'])
#     print(f"{row['Sample_ID']:9} | {row['kmer_3_freq']:.3f}     | {row['calculated_kmer4_freq']:.3f}      | {diff:.3f}")


In [45]:
def longest_repeat_with_base(sequence):
    """Find the longest run of one repeated character in a sequence.

    Args:
        sequence: String to scan.

    Returns:
        A ``(length, base)`` tuple: the length of the longest contiguous run
        of identical characters and the character that forms it. When several
        runs share the maximum length, the earliest one is reported. An empty
        (or ``None``) sequence yields ``(0, None)``.
    """
    if not sequence:
        # Nothing to scan; indexing sequence[0] below would raise IndexError.
        return 0, None

    max_run = 1
    current_run = 1
    max_base = sequence[0]

    # Walk adjacent character pairs; a match extends the current run,
    # a mismatch starts a new run of length 1.
    for previous, current in zip(sequence, sequence[1:]):
        if current == previous:
            current_run += 1
            # Strict '>' keeps the first run when later runs only tie it.
            if current_run > max_run:
                max_run, max_base = current_run, current
        else:
            current_run = 1

    return max_run, max_base

# The algorithm is described in the report draft.

In [46]:
# Expand the (length, base) tuples returned by longest_repeat_with_base into
# two columns. Building a single DataFrame from the list of tuples avoids
# creating one pd.Series per row, which is what .apply(pd.Series) does and
# which is slow on frames with many rows.
repeat_columns = ["Longest_Repeat_Length", "Longest_Repeat_Base"]
df[repeat_columns] = pd.DataFrame(
    df["Sequence"].apply(longest_repeat_with_base).tolist(),
    index=df.index,  # keep rows aligned with df when assigning
    columns=repeat_columns,
)

# Show the new features next to their source sequence.
df[["Sequence"] + repeat_columns]


Unnamed: 0,Sequence,Longest_Repeat_Length,Longest_Repeat_Base
0,CTTTCGGGATACTTTTGGGATGGTCTTGGTCAAGGGTTTTAGCCCG...,4,T
1,TTGACCAAATTTGATTGGAAGTGGTAAGCGCGTATTCCTAGCATCA...,5,T
2,GCGTGAGTTCTAATTTAAAAAGTCGTAACACGTACCCCGGCGTGTA...,5,A
3,ACTACGCGGACAAGAACCAACAGAACCTGGTTTTCGCAAGGGAGTG...,4,T
4,TTCAATGCAGATTGAAAGTTACTTTCATCTGCCCTATGGGTCCCTT...,3,A
...,...,...,...
2995,GATCAGCCCATACACCAAATCAATTGCATACATGTCCGATGTAACA...,3,C
2996,TGTTGTGTGTCTGATGATAGGTCATACCGCCTCGAAACATCACCAT...,4,A
2997,GACCCACTAAAAGTCTTCGTCTCCTTCCGATGGGAATTTTCGCCGA...,4,A
2998,CCAAAGGATATCTGTAATTGTTGCAGCGCCCCTACAATTTGAGCAC...,4,C


In [47]:
# Final column layout: the untouched original columns first, then the
# engineered features, then the target column last.
target_col = "Disease_Risk"

new_features = [
    "calculated_kmer3_freq",
    "calculated_kmer4_freq",
    "Longest_Repeat_Length",
    "Longest_Repeat_Base",
]

# Columns that are neither engineered features nor the target keep their
# original relative order.
other_cols = df.columns[~df.columns.isin(new_features + [target_col])].tolist()

df = df.loc[:, other_cols + new_features + [target_col]]

print("Columns reordered successfully!")

# Write the reordered frame to CSV without the index column.
output_file = "synthetic_dna_dataset_engineered.csv"
df.to_csv(output_file, index=False)

print("Dataset saved successfully with correct column order!")
print("Final shape:", df.shape)

Columns reordered successfully!
Dataset saved successfully with correct column order!
Final shape: (3000, 17)
