### Translate fasta containing amino acid sequences to comma-free code:

In [1]:
%load_ext blackcellmagic
from Bio import SeqIO

In [2]:
# Comma-free code amino acid dictionary:
# maps each one-letter amino acid code to the three-nucleotide codeword
# that replaces it in the translated sequence.
cfcode = {
    "F": "ACC",
    "L": "ACA",
    "I": "ATA",
    "M": "ATC",
    "V": "ATT",
    "S": "CTA",
    "P": "CTC",
    "T": "CTT",
    "A": "AGA",
    "Y": "AGC",
    "H": "AGT",
    "Q": "AGG",
    "N": "CGA", 
    "K": "CGC",
    "D": "CGT",
    "E": "CGG",
    "C": "TGA",
    "W": "TGC",
    "R": "TGT",
    "G": "TGG",
    "X": "NNN",  # Amino acid not known
    # Ambiguity codes below reuse the codeword of one of the two residues they stand for.
    "B": "CGT",  # Represents either N or D - will translate as D here (N is only off by one base)
    "J": "ACA",  # Represents either L or I - will translate as L here (I is only off by one base)
    "Z": "CGG"   # Represents either E or Q - will translate as E here (Q is only off by one base)
}

In [3]:
fasta = "../../../Pachter_lab/comma-free/palmdb/2021-03-14/uniques.fa"

# Parallel lists: ids[i] is the FASTA record ID belonging to cfc_seqs[i]
ids, cfc_seqs = [], []
for record in SeqIO.parse(fasta, "fasta"):
    # Swap every residue for its codeword and concatenate the result.
    # A residue that is missing from cfcode raises KeyError.
    cfc_seqs.append("".join(cfcode[residue] for residue in record.seq))
    ids.append(record.id)

In [19]:
# Compare each list's length with the size of its de-duplicated set
n_ids_distinct = len(set(ids))
n_seqs_total = len(cfc_seqs)
n_seqs_distinct = len(set(cfc_seqs))

# IDs: any duplicate would make the set smaller than the list
print("IDs are unique: ", n_ids_distinct == len(ids))
# Sequences: report uniqueness and how many entries repeat an earlier sequence
print("Sequences are unique: ", n_seqs_distinct == n_seqs_total)
print("Number of shared sequences: ", n_seqs_total - n_seqs_distinct)

IDs are unique:  True
Sequences are unique:  False
Number of shared sequences:  62


### Build dna and gtf files

In [27]:
# Directory (relative to this notebook) that receives the generated GTF / FASTA files
# and is passed to the `kb ref` shell command further down.
path_to_folder = "../../../Pachter_lab/comma-free/palmdb/2021-03-14"

In [73]:
# Write two companion files from the translated sequences:
#   * cfc_palmdb_annotation.gtf - one gene / transcript / exon / CDS record set per sequence
#   * cfc_palmdb_genome.fa      - one FASTA entry ("chromosome") per sequence
# Reads cfc_seqs, ids and path_to_folder, which are set in earlier cells.
with open(f"{path_to_folder}/cfc_palmdb_annotation.gtf", "w") as gtf, open(
    f"{path_to_folder}/cfc_palmdb_genome.fa", "w"
) as dna:
    genome_name = "CFCpalmdb1"
    source = "palmdb"

    # Add header lines to GTF
    gtf.write(
        f"#!genome-build {genome_name}.1\n"
        f"#!genome-version {genome_name}\n"
        "#!genome-date 2021-03-14\n"
        f"#!genome-build-accession {genome_name}\n"
        "#!genebuild-last-updated 2021-03-14\n"
    )

    # Each sequence sits alone on its own chromosome, so it always starts at base 1
    start = 1

    # Chromosomes are numbered 1, 2, 3, ... in input order.
    # The loop variable is `seq_id` (not `id`) so the builtin id() is not shadowed in the kernel.
    for chromosome, (cfc_seq, seq_id) in enumerate(zip(cfc_seqs, ids), start=1):
        gene_id = seq_id

        # GTF coordinates are 1-based and inclusive, so the last base is at
        # start + length - 1 (start + length would point past the sequence end).
        end = start + len(cfc_seq) - 1

        # (feature, frame, attribute column) for the four records describing this sequence
        records = [
            (
                "gene",
                ".",
                f'gene_id "{gene_id}"; gene_version "1"; gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding";',
            ),
            (
                "transcript",
                ".",
                f'gene_id "{gene_id}"; gene_version "1"; transcript_id "{gene_id}T"; transcript_version "1"; gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding"; transcript_name "{gene_id}"; transcript_source "palmdb"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS{gene_id}"; tag "basic";',
            ),
            (
                "exon",
                ".",
                f'gene_id "{gene_id}"; gene_version "1"; transcript_id "{gene_id}T"; transcript_version "1"; exon_number "1"; gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding"; transcript_name "{gene_id}"; transcript_source "palmdb"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS{gene_id}"; exon_id "{gene_id}E"; exon_version "1"; tag "basic";',
            ),
            (
                "CDS",
                "0",
                f'gene_id "{gene_id}"; gene_version "1"; transcript_id "{gene_id}T"; transcript_version "1"; exon_number "1"; gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding"; transcript_name "{gene_id}"; transcript_source "palmdb"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS{gene_id}"; protein_id "{gene_id}P"; protein_version "1"; tag "basic";',
            ),
        ]

        for feature, frame, attributes in records:
            # GTF needs exactly nine TAB-separated columns:
            # seqname, source, feature, start, end, score, strand, frame, attributes
            columns = [
                str(chromosome),
                source,
                feature,
                str(start),
                str(end),
                ".",  # score: not available
                "+",  # strand
                frame,
                attributes,
            ]
            gtf.write("\t".join(columns) + "\n")

        # Build dna/genome file: the header carries the same 1-based inclusive range as the GTF
        dna.write(f">{chromosome} dna:chromosome chromosome:{genome_name}:{chromosome}:{start}:{end}:1 REF\n")
        dna.write(f"{cfc_seq}\n")

### Build index with kb ref

In [78]:
# Run `kb ref` on the genome FASTA and GTF annotation (the two positional arguments).
# Going by the file names, -i / -g / -f1 are the index, transcript-to-gene map and
# transcriptome FASTA it writes - confirm against `kb ref --help`.
# NOTE(review): $path_to_folder is interpolated unquoted into the shell command; it works
# here because the path contains no spaces.
!kb ref \
    -i $path_to_folder/cfc_palmdb_index.idx \
    -g $path_to_folder/cfc_palmdb_t2g.txt \
    -f1 $path_to_folder/cfc_palmdb_transcriptome.fa \
    $path_to_folder/cfc_palmdb_genome.fa \
    $path_to_folder/cfc_palmdb_annotation.gtf

[2022-11-02 16:24:21,691]    INFO [ref] Preparing ../../../Pachter_lab/comma-free/palmdb/2021-03-14/cfc_palmdb_genome.fa, ../../../Pachter_lab/comma-free/palmdb/2021-03-14/cfc_palmdb_annotation.gtf
[2022-11-02 16:24:56,948]    INFO [ref] Splitting genome ../../../Pachter_lab/comma-free/palmdb/2021-03-14/cfc_palmdb_genome.fa into cDNA at /Users/lauraluebbert/OneDrive - California Institute of Technology/Data_analysis/kallisto-bf/src/tmp/tmp7gce4cz3


In [77]:
# !kallisto index \
#     -i $path_to_folder/cfc_palmdb_index.idx \
#     $path_to_folder/cfc_palmdb_genome.fa

/bin/bash: kallisto: command not found
