In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path

## SSM Dataset

In [10]:
def generate_substitutions(original_str, nucleotides, max_length, overlap):
    """Enumerate sliding windows of a sequence plus all single-base substitutions.

    Windows of ``max_length`` characters are taken from ``original_str`` at
    offsets 0, ``overlap``, 2*``overlap``, ... for as long as a full window
    fits. Each window is emitted unchanged first, followed by one variant per
    (position, nucleotide) pair in which the nucleotide differs from the
    character already at that position.

    Args:
        original_str: Sequence to tile.
        nucleotides: Iterable of replacement characters to try at each position.
        max_length: Window length.
        overlap: Offset between consecutive window starts (despite the name
            it is used as the step size). Must be positive.

    Returns:
        ``(names, substitutions)`` - two parallel lists. Unmodified windows are
        named ``'original::<start>-<end>'``; variants are named
        ``'<position>:<nucleotide>:<start>-<end>'`` where ``position`` is the
        index in ``original_str``.

    Raises:
        ValueError: If ``overlap`` is not positive (the window would never
            advance).
    """
    if overlap <= 0:
        raise ValueError("overlap must be a positive step size, got %r" % (overlap,))

    names = []
    substitutions = []

    # range() stops at the last offset where a complete window still fits.
    for start in range(0, len(original_str) - max_length + 1, overlap):
        window_end = start + max_length
        shift_str = f'{start}-{window_end}'
        substr = original_str[start:window_end]

        # Original strings shifted
        names.append('original::' + shift_str)
        substitutions.append(substr)

        # Variant strings shifted
        for i, current_base in enumerate(substr):
            for nucleotide in nucleotides:
                if nucleotide == current_base:
                    continue  # identical to the unmodified window
                names.append(f'{start + i}:{nucleotide}:{shift_str}')
                substitutions.append(substr[:i] + nucleotide + substr[i + 1:])

    return names, substitutions

albumin_promoter = 'GCAACATTAATCTAAAAACCTCAATGACCCAAATAGAAAAGGAAAAAAGCTTTCTGAACAGCCAAACAGAGATTCCAAAGTTCAGGCACCAAAGTTCAGACCCTAACAGTTATTTACAAGGGTCAGTTAACCTTTTGTTCTAGTGGGATAGGATTCCCATGGTGAGATTTGAAGAACTGTGGAACCACTGTTTCAGGTTC'
albumin_enhancer = 'ATATATCAAAATAAACTTGAGGGGATAGATCATTTTCATGATATATGAGAAAAATTAAAAATCAGATTGAATTATTTGCCTGTCATACAGCTAATAATTGACCATAAGACAATTAGATTTAAATTAGTTTTGAATCTTTCTAATACCAAAGTTCAGTTTACTGTTCCATGTTGCTTCTGAGTGGCTTCACAGACTTATGAAAAAGTAAACGGAATCAGAATTACATCAATGCAAAAGCATTGCTGTGAACTCTGTACTTAGGACTAAACTTTGAGCAATAACACATATAGATTGAGGATTGTTTGCTGTTAGTATACAAACTCTGGTTCAAAGCTCCTCTTTATTGCTTGTCTTGGAAAATTTGCTGTTCTTCATGGTTTCTCTTTTCACTGCTATCTAT'
apo_mouse = 'ATTTGGACAGTGGAGCATCAACTGGGAGGCGCTATGGAGCCAGAGCCACCGAAGGCAGGTAGCAGCACTTACCTCCCCGGAGCTCTCCGACAGTCTGGGTGTCCAGCTCTTCTTCCCTGGTCTATATATGTGTGTGGGCAGGGGCTGGGCTGGGAGACTGATAAGCTCAGCCTGGCCCTGCCACTGCTTACTTTTGCTGGCGATGTGGAACTTAGAGTTCAAGGATCAGCTCTGTTCCTGAGGCTGGGCAAACAGAGTGGGCAAACAGGAAACTGCGGGGGCTGCCGAGCTGGGATCAAGGGTTCAGGTGGGGGCAGCAGGGGGTGGCTGCAGACTCCAGGGGTCCCCAGGGGTAAGGCAGTTGCACCATCCCGAGGTGTATGTCTTCTTCAAGATAGTCTCATGGAGCCCACTCCTGGACTCCTCTGTGTTGCTAGTGCCCTCTGAGCCTGGAACAAGGGAGGGAGGGAGGAATTGTGGGCTCCTTCACTCACAGCCCCACCCCAAAGTTTTTTTTTTGCTTTCCAAGGGGTTCCCCCTTTCACCTCACCCTGGTCCCCCAGGTTTGGGGATTTGTCTTTCCAGATGCCACATGTTTAGCCAGCCTCCCAACCCTAAATCCAAAAACAACTGAGGTGTCAGCCTGAGTGAGATGCAGACAGGCTCATATCCCCAAGCTGAGGATGAAGACCCCTCTGATGCTTCAGAAGAAATGACAGAGGCCAACATGGAGTTGTTAGCTCCTTCCTTGTCTCCCTCCCTCCTTCCCTCCCTTCCTTTCTTCCTTATCTTAGGTACATGAGCGCTCTGTCTTCAGACACACCAGAAGATGGAATCAGATTCCTTTACAGATGGTTGTGAGCCACCATATGGTTTCTGGGAATTGAACTCAGGACCTCT'
apo_human = 'AGGTACCCAGAGGCCCGGCCTGGGGCAAGGCCTGAACCTTGAGCTGGGGAGCCAGAGTGACCGGGGCAGGCAGCAGGACGCACCTCCTTCTCGCAGTCTCTAAGCAGCCAGCTCTTGCAGGGCCTATTTATGTCTGCAGCCAGGGTCTGGGCTGGGAGGCTGATAAGCCCAGCCCCGGCCCTGTTGCTGCTCACTGGTCCTGGCAATGTGGAACTTAAGAGTTCAAGGATCAGCTCTGTCCCTGGGGCTGGGCAAATAGAGTGGGCAAACAGCAAGCTGCGGGGGCTGCAGGGCAGGGGTCAAGGGTTCAGTGGGGGCGGGAGGGGAGTGTCTGCAGGCTTGCAGGTCTCCCGGGTGGGGTCGGGGTTCCCTGCACTCATCCCCTTCCCCTCCATGGGAGTGTGTGGGCAGTTGCCATTGTCCATTGTGTTGGCAGAGGAGGGGAGGGGAGGGACGCTGGGACTCCTCCACCAAGGAGACTGCCTCCCCCACCACCAGCATTCCAGGGAGACTACTTCACTCCCCTCCCCCTTCCCCCGCCCTGTCCTCCCACCAGTGCTCTTCTTTAGTCCCCAGCAGGTCCTCCAGGCCTCTCTCCAAGCCTCCCAAACTGGTAAACCTGGGGAGAGGGGAGAGCCCTCCGTGGCTCCCAGACTGAGGTTTCGGAGACCTCTTGCATTTCAAAACACTCCAGAGATCAATTCGGAGCTGCCAACTTTTAATTTTGTCATGTAAAGATATTGTCCGCCTCCAAAAAACCCTCACCATCTACAGTGACCATCACTTCAAAAAGGAAAGGCTTTAACAAAAAAGGGCATAATCTCAGAATTACATTACAGAATTGAAGCCCCTTAGATTGAAGACGTCTCCCTTTGCATTGTTCACACTTATATTTGATCA'
apoc3 = 'GAGCTCATCTGGGCTGCAGGGCTGGCGGGACAGCAGCGTGGACTCAGTCTCCTAGGGATTTCCCAACTCTCCCGCCCGCTTGCTGCATCTGGACACCCTGCCTCAGGCCCTCATCTCCACTGGTCAGCAGGTGACCTTTGCCCAGCGCCCTGGGTCCTCAGTGCCTGCTGCCCTGGAGATGATATAAAACAGGTCAGAAC'

apo_mod = 'AGGTACCCAGAGGCCCGGCCTGGGGCAAGGCCTGAACCTTGAGCTGGGGAGCCAGAGTGACCGGGGCAGGCAGCAGGACGCACCTCCTTCTCGCAGTCTCTAAGCAGCCAGCTCTTGCAGGGCCTATTTATGTCTGCAGCCAGGGTCTGGGCTGGGAGGCTGATTGGCCCACTTCCGGAAGTGTTGCCAATGATTGGTCCTGGCAATGATTAACTTAAGAGTTCAAGGATCAGCTCTGTCCCATTGGCTGGGCAAATAGAGTGGGCAAACAGCAAGCTGCGGGGGCTGCAGGGCAGGGGTCAAGGGTTCAGTGGGGGCGGGAGGGGAGTGTCTGCAGGCTTGCAGGTCTCCCGGGTGGGGTCGGGGTTCCCTGCACTCATCCCCTTCCCCTCCATGGGAGTGTGTGGGCAGTTGCCATTGTCCATTGTGTTGGCAGAGGAGGGGAGGGGAGGGACGCTGGGACTCCTCCACCAAGGAGACTGCCTCCCCCACCACCAGCATTCCAGGGAGACTACTTCACTCCCCTCCCCCTTCCCCCGCCCTGTCCTCCCACCAGTGCTCTTCTTTAGTCCCCAGCAGGTCCTCCAGGCCTCTCTCCAAGCCTCCCAAACTGGTAAACCTGGGGAGAGGGGAGAGCCCTCCGTGGCTCCCAGACTGAGGTTTCGGAGACCTCTTGCATTTCAAAACACTCCAGAGATCAATTCGGAGCTGCCAACTTTTAATTTTGTCATGTAAAGATATTGTCCGCCTCCAAAAAACCCTCACCATCTACAGTGACCATCACTTCAAAAAGGAAAGGCTTTAACAAAAAAGGGCATAATCTCAGAATTACATTACAGAATTGAAGCCCCTTAGATTGAAGACGTCTCCCTTTGCATTGTTCACACTTATATTTGATCA'

# Sequence to mutagenize in this run.
seq = apoc3

names, variant_seqs = generate_substitutions(seq, ['A','C','T','G'], max_length=200, overlap=100)

# Wrap every window in the fixed 15-character prefix and suffix.
seq = ['AGGACCGGATCAACT' + s + 'CATTGCGTGAACCGA' for s in variant_seqs]
df = pd.DataFrame({'name':names, 'category':'variant', 'seq':seq, 'mean':0})
# generate_substitutions emits one 'original:...' row per window, so label by
# name rather than assuming the only unmodified row sits at index 0.
df.loc[df['name'].str.startswith('original:'), 'category'] = 'original'

## Save data to CSV

In [11]:
from pathlib import Path

# Clean data directories (copied from https://stackoverflow.com/a/57892171)
def rm_tree(pth: Path):
    """Recursively delete the directory ``pth`` and everything inside it.

    Real sub-directories are descended into; every other entry (regular
    files, symbolic links, broken links, special files) is unlinked directly.
    Symbolic links are removed as links and never followed, so the contents
    of a linked directory outside ``pth`` are left untouched.

    Args:
        pth: Directory to remove.
    """
    for child in pth.iterdir():
        # is_dir() follows symlinks, so exclude links explicitly.
        if child.is_dir() and not child.is_symlink():
            rm_tree(child)
        else:
            child.unlink()
    pth.rmdir()

# Start from an empty output directory so stale files never survive a re-run.
for BASE_FILE_PATH in [Path("data/apoa1")]:
    if BASE_FILE_PATH.exists():
        rm_tree(BASE_FILE_PATH)
    # parents=True: the enclosing "data" directory may not exist yet.
    BASE_FILE_PATH.mkdir(parents=True)

# Save data files
df.to_csv(Path('data/apoa1') / 'val.csv', index=False)

# Copy data
!cp -r data/apoa1 /data/code/hyena-dna/data/mpra_agarwal_seq/

## Replace dataset

In [23]:
import itertools
    
def generate_substitutions(original_str, replace_seq, max_length, overlap):
    """Enumerate sliding windows plus every k-mer replacement at every position.

    Windows of ``max_length`` characters are taken from ``original_str`` at
    offsets 0, ``overlap``, 2*``overlap``, ... for as long as a full window
    fits. Each window is emitted unchanged first. Then, for every placement of
    a k-character slot inside the window (k = ``len(replace_seq)``), the slot
    is overwritten with each of the 4**k combinations of A/C/T/G. The
    combination equal to the existing bases is not skipped, so it yields a
    copy of the unmodified window.

    Args:
        original_str: Sequence to tile.
        replace_seq: Placeholder whose *length* sets the size of the replaced
            slot; its characters are not otherwise used.
        max_length: Window length.
        overlap: Offset between consecutive window starts (used as the step
            size). Must be positive.

    Returns:
        ``(names, substitutions)`` - two parallel lists. Unmodified windows are
        named ``'original::<start>-<end>'``; variants are named
        ``'<position>:<kmer>:<start>-<end>'`` where ``position`` is the slot's
        start index in ``original_str``.

    Raises:
        ValueError: If ``overlap`` is not positive (the window would never
            advance).
    """
    if overlap <= 0:
        raise ValueError("overlap must be a positive step size, got %r" % (overlap,))

    slot_len = len(replace_seq)

    # Replace strings: independent of the window, so build them once.
    bases = ["A","C","T","G"]
    combinations = [''.join(comb) for comb in itertools.product(bases, repeat=slot_len)]

    names = []
    substitutions = []

    for start in range(0, len(original_str) - max_length + 1, overlap):
        window_end = start + max_length
        shift_str = f'{start}-{window_end}'
        substr = original_str[start:window_end]

        # Original strings shifted
        names.append('original::' + shift_str)
        substitutions.append(substr)

        # "+ 1" includes the last placement, where the slot ends flush with
        # the end of the window.
        for i in range(len(substr) - slot_len + 1):
            for kmer in combinations:
                names.append(f'{start + i}:{kmer}:{shift_str}')
                substitutions.append(substr[:i] + kmer + substr[i + slot_len:])

    return names, substitutions

apo_human = 'AGGTACCCAGAGGCCCGGCCTGGGGCAAGGCCTGAACCTTGAGCTGGGGAGCCAGAGTGACCGGGGCAGGCAGCAGGACGCACCTCCTTCTCGCAGTCTCTAAGCAGCCAGCTCTTGCAGGGCCTATTTATGTCTGCAGCCAGGGTCTGGGCTGGGAGGCTGATAAGCCCAGCCCCGGCCCTGTTGCTGCTCACTGGTCCTGGCAATGTGGAACTTAAGAGTTCAAGGATCAGCTCTGTCCCTGGGGCTGGGCAAATAGAGTGGGCAAACAGCAAGCTGCGGGGGCTGCAGGGCAGGGGTCAAGGGTTCAGTGGGGGCGGGAGGGGAGTGTCTGCAGGCTTGCAGGTCTCCCGGGTGGGGTCGGGGTTCCCTGCACTCATCCCCTTCCCCTCCATGGGAGTGTGTGGGCAGTTGCCATTGTCCATTGTGTTGGCAGAGGAGGGGAGGGGAGGGACGCTGGGACTCCTCCACCAAGGAGACTGCCTCCCCCACCACCAGCATTCCAGGGAGACTACTTCACTCCCCTCCCCCTTCCCCCGCCCTGTCCTCCCACCAGTGCTCTTCTTTAGTCCCCAGCAGGTCCTCCAGGCCTCTCTCCAAGCCTCCCAAACTGGTAAACCTGGGGAGAGGGGAGAGCCCTCCGTGGCTCCCAGACTGAGGTTTCGGAGACCTCTTGCATTTCAAAACACTCCAGAGATCAATTCGGAGCTGCCAACTTTTAATTTTGTCATGTAAAGATATTGTCCGCCTCCAAAAAACCCTCACCATCTACAGTGACCATCACTTCAAAAAGGAAAGGCTTTAACAAAAAAGGGCATAATCTCAGAATTACATTACAGAATTGAAGCCCCTTAGATTGAAGACGTCTCCCTTTGCATTGTTCACACTTATATTTGATCA'
apo_triple = 'AGGTACCCAGAGGCCCGGCCTGGGGCAAGGCCTGAACCTTGAGCTGGGGAGCCAGAGTGACCGGGGCAGGCAGCAGGACGCACCTCCTTCTCGCAGTCTCTAAGCAGCCAGCTCTTGCAGGGCCTATTTATGTCTGCAGCCAGGGTCTGGGCTGATTGGCTGATTGGCCCACTTCCGGAAGTGTTGCTGCTGATTGGTCCTGGCAATGATTAACTTAAGAGTTCAAGGATCAGCTCTGTCCCTGGGGCTGGGCAAATAGAGTGGGCAAACAGCAAGCTGCGGGGGCTGCAGGGCAGGGGTCAAGGGTTCAGTGGGGGCGGGAGGGGAGTGTCTGCAGGCTTGCAGGTCTCCCGGGTGGGGTCGGGGTTCCCTGCACTCATCCCCTTCCCCTCCATGGGAGTGTGTGGGCAGTTGCCATTGTCCATTGTGTTGGCAGAGGAGGGGAGGGGAGGGACGCTGGGACTCCTCCACCAAGGAGACTGCCTCCCCCACCACCAGCATTCCAGGGAGACTACTTCACTCCCCTCCCCCTTCCCCCGCCCTGTCCTCCCACCAGTGCTCTTCTTTAGTCCCCAGCAGGTCCTCCAGGCCTCTCTCCAAGCCTCCCAAACTGGTAAACCTGGGGAGAGGGGAGAGCCCTCCGTGGCTCCCAGACTGAGGTTTCGGAGACCTCTTGCATTTCAAAACACTCCAGAGATCAATTCGGAGCTGCCAACTTTTAATTTTGTCATGTAAAGATATTGTCCGCCTCCAAAAAACCCTCACCATCTACAGTGACCATCACTTCAAAAAGGAAAGGCTTTAACAAAAAAGGGCATAATCTCAGAATTACATTACAGAATTGAAGCCCCTTAGATTGAAGACGTCTCCCTTTGCATTGTTCACACTTATATTTGATCA'
# Sequence to scan in this run.
seq = apo_triple

names, variant_seqs = generate_substitutions(seq, replace_seq="NNN", max_length=200, overlap=100)

# Wrap every window in the fixed 15-character prefix and suffix.
seq = ['AGGACCGGATCAACT' + s + 'CATTGCGTGAACCGA' for s in variant_seqs]
df = pd.DataFrame({'name':names, 'category':'variant', 'seq':seq, 'mean':0})
# generate_substitutions emits one 'original:...' row per window, so label by
# name rather than assuming the only unmodified row sits at index 0.
df.loc[df['name'].str.startswith('original:'), 'category'] = 'original'

### Save data to CSV

In [24]:
from pathlib import Path

# Clean data directories (copied from https://stackoverflow.com/a/57892171)
def rm_tree(pth: Path):
    """Recursively delete the directory ``pth`` and everything inside it.

    Real sub-directories are descended into; every other entry (regular
    files, symbolic links, broken links, special files) is unlinked directly.
    Symbolic links are removed as links and never followed, so the contents
    of a linked directory outside ``pth`` are left untouched.

    Args:
        pth: Directory to remove.
    """
    for child in pth.iterdir():
        # is_dir() follows symlinks, so exclude links explicitly.
        if child.is_dir() and not child.is_symlink():
            rm_tree(child)
        else:
            child.unlink()
    pth.rmdir()

# Start from an empty output directory so stale files never survive a re-run.
for BASE_FILE_PATH in [Path("data/apoa1")]:
    if BASE_FILE_PATH.exists():
        rm_tree(BASE_FILE_PATH)
    # parents=True: the enclosing "data" directory may not exist yet.
    BASE_FILE_PATH.mkdir(parents=True)

# Save data files
df.to_csv(Path('data/apoa1') / 'val.csv', index=False)

# Copy data
!cp -r data/apoa1 /data/code/hyena-dna/data/mpra_agarwal_seq/

## Targeted replace dataset

In [22]:
import itertools
    
def generate_substitutions(original_str, replace_seq, target_sites, max_length, overlap):
    """Enumerate sliding windows plus one spliced variant per target site.

    Windows of ``max_length`` characters are taken from ``original_str`` at
    offsets 0, ``overlap``, 2*``overlap``, ... for as long as a full window
    fits. Each window is emitted unchanged first. Then, for every row of
    ``target_sites`` whose ``sp1_pos`` is at or after the window start and
    whose ``sp2_pos`` is before the window end, a variant is built with
    ``replace_seq`` spliced in at ``sp1_pos``. Finally every sequence is
    right-padded with ``'N'`` up to ``max_length`` characters.

    Args:
        original_str: Sequence to tile.
        replace_seq: Sequence spliced in at each target site.
        target_sites: DataFrame with ``'sp1_pos'`` and ``'sp2_pos'`` columns
            holding positions in ``original_str``.
        max_length: Window length, also the width sequences are padded to.
        overlap: Offset between consecutive window starts (used as the step
            size). Must be positive.

    Returns:
        ``(names, substitutions)`` - ``names`` is a list, ``substitutions`` is
        the ``.values`` array of the padded pandas Series. Unmodified windows
        are named ``'original::<start>-<end>'``; variants are named
        ``'<sp1_pos>-<sp2_pos>:<replace_seq>:<start>-<end>'``.

    Raises:
        ValueError: If ``overlap`` is not positive (the window would never
            advance).
    """
    if overlap <= 0:
        raise ValueError("overlap must be a positive step size, got %r" % (overlap,))

    names = []
    substitutions = []

    for start in range(0, len(original_str) - max_length + 1, overlap):
        window_end = start + max_length
        shift_str = f'{start}-{window_end}'
        substr = original_str[start:window_end]

        # Original strings shifted
        names.append('original::' + shift_str)
        substitutions.append(substr)

        # Replace strings: only sites lying entirely inside this window.
        in_window = (target_sites['sp1_pos'] >= start) & (target_sites['sp2_pos'] < window_end)
        for _, row in target_sites[in_window].iterrows():
            # Convert sequence coordinates to window-relative offsets.
            site_start = row['sp1_pos'] - start
            site_end = row['sp2_pos'] - start
            # NOTE(review): the tail resumes at site_end + len(replace_seq), so
            # len(replace_seq) bases past sp2_pos are dropped as well and the
            # variant is (site_end - site_start) shorter than the window.
            # Kept as written - confirm this is the intended edit.
            new_str = substr[:site_start] + replace_seq + substr[site_end + len(replace_seq):]
            new_name = str(row['sp1_pos']) + '-' + str(row['sp2_pos']) + ':' + replace_seq + ':' + shift_str
            names.append(new_name)
            substitutions.append(new_str)

    # Pad shortened variants back to the window length (previously hard-coded to 200).
    substitutions = pd.Series(substitutions).str.pad(width=max_length, side='right', fillchar='N').values
    return names, substitutions

# Target-site table passed to generate_substitutions, which reads its
# 'sp1_pos' and 'sp2_pos' columns.
# NOTE(review): absolute, machine-specific path - confirm it exists wherever this notebook is run.
sgrna_sites = pd.read_csv('/data/code/mpra_agarwal/apoa1_sgrna.csv')

apo_human = 'AGGTACCCAGAGGCCCGGCCTGGGGCAAGGCCTGAACCTTGAGCTGGGGAGCCAGAGTGACCGGGGCAGGCAGCAGGACGCACCTCCTTCTCGCAGTCTCTAAGCAGCCAGCTCTTGCAGGGCCTATTTATGTCTGCAGCCAGGGTCTGGGCTGGGAGGCTGATAAGCCCAGCCCCGGCCCTGTTGCTGCTCACTGGTCCTGGCAATGTGGAACTTAAGAGTTCAAGGATCAGCTCTGTCCCTGGGGCTGGGCAAATAGAGTGGGCAAACAGCAAGCTGCGGGGGCTGCAGGGCAGGGGTCAAGGGTTCAGTGGGGGCGGGAGGGGAGTGTCTGCAGGCTTGCAGGTCTCCCGGGTGGGGTCGGGGTTCCCTGCACTCATCCCCTTCCCCTCCATGGGAGTGTGTGGGCAGTTGCCATTGTCCATTGTGTTGGCAGAGGAGGGGAGGGGAGGGACGCTGGGACTCCTCCACCAAGGAGACTGCCTCCCCCACCACCAGCATTCCAGGGAGACTACTTCACTCCCCTCCCCCTTCCCCCGCCCTGTCCTCCCACCAGTGCTCTTCTTTAGTCCCCAGCAGGTCCTCCAGGCCTCTCTCCAAGCCTCCCAAACTGGTAAACCTGGGGAGAGGGGAGAGCCCTCCGTGGCTCCCAGACTGAGGTTTCGGAGACCTCTTGCATTTCAAAACACTCCAGAGATCAATTCGGAGCTGCCAACTTTTAATTTTGTCATGTAAAGATATTGTCCGCCTCCAAAAAACCCTCACCATCTACAGTGACCATCACTTCAAAAAGGAAAGGCTTTAACAAAAAAGGGCATAATCTCAGAATTACATTACAGAATTGAAGCCCCTTAGATTGAAGACGTCTCCCTTTGCATTGTTCACACTTATATTTGATCA'

# Sequence spliced in at every target site.
gabpa = 'GGAACCGGAAGTGG'

names, variant_seqs = generate_substitutions(apo_human, replace_seq=gabpa, target_sites=sgrna_sites, max_length=200, overlap=100)

# Wrap every window in the fixed 15-character prefix and suffix.
seq = ['AGGACCGGATCAACT' + s + 'CATTGCGTGAACCGA' for s in variant_seqs]
df = pd.DataFrame({'name':names, 'category':'variant', 'seq':seq, 'mean':0})
# generate_substitutions emits one 'original:...' row per window, so label by
# name rather than assuming the only unmodified row sits at index 0.
df.loc[df['name'].str.startswith('original:'), 'category'] = 'original'

# Save data files
# This cell does not recreate the output directory itself, so make sure it exists.
Path('data/apoa1').mkdir(parents=True, exist_ok=True)
df.to_csv(Path('data/apoa1') / 'val.csv', index=False)

# Copy data
!cp -r data/apoa1 /data/code/hyena-dna/data/mpra_agarwal_seq/

## Global SSM datasets

In [20]:
def generate_substitutions(original_str, nucleotides, max_length, overlap):
    """Enumerate sliding windows of a sequence plus all single-base substitutions.

    Windows of ``max_length`` characters are taken from ``original_str`` at
    offsets 0, ``overlap``, 2*``overlap``, ... for as long as a full window
    fits. Each window is emitted unchanged first, followed by one variant per
    (position, nucleotide) pair in which the nucleotide differs from the
    character already at that position.

    Args:
        original_str: Sequence to tile.
        nucleotides: Iterable of replacement characters to try at each position.
        max_length: Window length.
        overlap: Offset between consecutive window starts (despite the name
            it is used as the step size). Must be positive.

    Returns:
        ``(names, substitutions)`` - two parallel lists. Unmodified windows are
        named ``'original::<start>-<end>'``; variants are named
        ``'<position>:<nucleotide>:<start>-<end>'`` where ``position`` is the
        index in ``original_str``.

    Raises:
        ValueError: If ``overlap`` is not positive (the window would never
            advance).
    """
    if overlap <= 0:
        raise ValueError("overlap must be a positive step size, got %r" % (overlap,))

    names = []
    substitutions = []

    # range() stops at the last offset where a complete window still fits.
    for start in range(0, len(original_str) - max_length + 1, overlap):
        window_end = start + max_length
        shift_str = f'{start}-{window_end}'
        substr = original_str[start:window_end]

        # Original strings shifted
        names.append('original::' + shift_str)
        substitutions.append(substr)

        # Variant strings shifted
        for i, current_base in enumerate(substr):
            for nucleotide in nucleotides:
                if nucleotide == current_base:
                    continue  # identical to the unmodified window
                names.append(f'{start + i}:{nucleotide}:{shift_str}')
                substitutions.append(substr[:i] + nucleotide + substr[i + 1:])

    return names, substitutions

# FIXME(review): the right-hand side of this assignment is missing, which is
# what raises the SyntaxError for this cell; the intended value is not visible here.
dataset = 

# NOTE(review): `seq` is not assigned in this cell - it relies on whatever an
# earlier cell left in the kernel. Confirm which sequence is meant.
names, variant_seqs = generate_substitutions(seq, ['A','C','T','G'], max_length=200, overlap=100)

# Wrap every window in the fixed 15-character prefix and suffix.
seq = ['AGGACCGGATCAACT' + s + 'CATTGCGTGAACCGA' for s in variant_seqs]
df = pd.DataFrame({'name':names, 'category':'variant', 'seq':seq, 'mean':0})
# Marks only the row at index 0 as the unmodified sequence.
df.loc[0,'category'] = 'original'

SyntaxError: invalid syntax (3829464078.py, line 27)