### Translate fasta containing amino acid sequences to comma-free code:

In [17]:
%load_ext blackcellmagic
from Bio import SeqIO

In [None]:
# Comma-free code table: one fixed DNA triplet per one-letter amino acid code.
# The last four entries cover the unknown/ambiguous IUPAC symbols.
cfcode = dict(
    # Standard amino acids
    F="ACC",
    L="ACA",
    I="ATA",
    M="ATC",
    V="ATT",
    S="CTA",
    P="CTC",
    T="CTT",
    A="AGA",
    Y="AGC",
    H="AGT",
    Q="AGG",
    N="CGA",
    K="CGC",
    D="CGT",
    E="CGG",
    C="TGA",
    W="TGC",
    R="TGT",
    G="TGG",
    # Unknown amino acid
    X="NNN",
    # B is N or D: encoded as D (the N triplet differs by one base)
    B="CGT",
    # J is L or I: encoded as L (the I triplet differs by one base)
    J="ACA",
    # Z is E or Q: encoded as E (the Q triplet differs by one base)
    Z="CGG",
)

In [None]:
fasta = "../../../Pachter_lab/comma-free/palmdb/2021-03-14/uniques.fa"

# Parallel lists: ids[i] is the FASTA record ID belonging to cfc_seqs[i].
ids = []
cfc_seqs = []
for record in SeqIO.parse(fasta, "fasta"):
    # Upper-case first so lowercase residues hit the same cfcode entries.
    residues = str(record.seq).upper()

    # Translate the AA sequence to comma-free code, one triplet per residue.
    try:
        cfc_seq = "".join(cfcode[aa] for aa in residues)
    except KeyError as err:
        # Still a KeyError, but now it names the offending record and residue
        # instead of reporting only the missing dictionary key.
        raise KeyError(
            f"Record {record.id!r} contains residue {err.args[0]!r}, "
            "which has no entry in cfcode"
        ) from err

    # Store the translated sequence and its ID at the same list position.
    cfc_seqs.append(cfc_seq)
    ids.append(record.id)

KeyboardInterrupt: 

In [None]:
# Check if all IDs are unique
print("IDs are unique: ",len(ids) == len(set(ids)))
# Check if all sequences are unique
print("Sequences are unique: ", len(cfc_seqs) == len(set(cfc_seqs)))
print("Number of shared sequences: ", len(cfc_seqs) - len(set(cfc_seqs)))

IDs are unique:  True
Sequences are unique:  False
Number of shared sequences:  62


### Build DNA (genome FASTA) and GTF files

In [2]:
# Directory that receives the generated genome FASTA and GTF annotation
# (it is also the directory of the input uniques.fa read earlier).
path_to_folder = "../../../Pachter_lab/comma-free/palmdb/2021-03-14"

In [None]:
# Write one GTF annotation file and one genome FASTA in a single pass.
# Each translated PALM sequence becomes its own "chromosome" (numbered from 1)
# carrying exactly one gene / transcript / exon / CDS spanning the whole sequence.
with open(f"{path_to_folder}/cfc_palmdb_annotation.gtf", "w") as gtf, open(
    f"{path_to_folder}/cfc_palmdb_genome.fa", "w"
) as dna:
    genome_name = "CFCpalmdb1"
    source = "palmdb"

    # Add header lines to GTF
    gtf.write(
        f"#!genome-build {genome_name}.1\n#!genome-version {genome_name}\n#!genome-date 2021-03-14\n#!genome-build-accession {genome_name}\n#!genebuild-last-updated 2021-03-14\n"
    )

    # Every sequence sits on its own chromosome, so features always start at 1.
    start = 1
    chromosome = 1
    for cfc_seq, seq_id in zip(cfc_seqs, ids):
        # GTF coordinates are 1-based and inclusive: the last base of a
        # sequence of length n that starts at `start` is start + n - 1.
        end = start + len(cfc_seq) - 1
        gene_id = seq_id

        # Attribute fragments shared between the four feature records.
        gene_head = f'gene_id "{gene_id}"; gene_version "1";'
        gene_tail = f'gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding";'
        tx_head = f'transcript_id "{gene_id}T"; transcript_version "1";'
        tx_tail = f'transcript_name "{gene_id}"; transcript_source "palmdb"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS{gene_id}";'

        # (feature, frame, attributes) in the order they are written out.
        records = (
            ("gene", ".", f"{gene_head} {gene_tail}"),
            ("transcript", ".", f'{gene_head} {tx_head} {gene_tail} {tx_tail} tag "basic";'),
            (
                "exon",
                ".",
                f'{gene_head} {tx_head} exon_number "1"; {gene_tail} {tx_tail} exon_id "{gene_id}E"; exon_version "1"; tag "basic";',
            ),
            (
                "CDS",
                "0",
                f'{gene_head} {tx_head} exon_number "1"; {gene_tail} {tx_tail} protein_id "{gene_id}P"; protein_version "1"; tag "basic";',
            ),
        )

        for feature, frame, attributes in records:
            # All nine GTF columns must be TAB-separated, including the
            # start / end / score columns.
            gtf.write(
                f"{chromosome}\t{source}\t{feature}\t{start}\t{end}\t.\t+\t{frame}\t{attributes}\n"
            )

        # Build dna/genome file
        dna.write(f">{chromosome} dna:chromosome chromosome:{genome_name}:{chromosome}:{start}:{end}:1 REF\n")
        dna.write(f"{cfc_seq}\n")

        # Add next PALM sequence to new chromosome
        chromosome += 1

Create the transcript-to-gene (t2g) mapping:

In [None]:
# !cat uniques.fa | awk '{if($1~">") print $1"\t"$1}' > cfc_palmdb_t2g.txt
# !sed -i'.original' 's/>u//g' cfc_palmdb_t2g.txt
# !rm cfc_palmdb_t2g.txt.original

## Build index and bus file

In [None]:
# # Generate index with kb (DOES NOT WORK - DELANEY SAYS HE HAS THE SAME PROBLEM WITH KB REF)
# !kb ref \
#     -i $path_to_folder/kb/cfc_palmdb_kb_index.idx \
#     -g $path_to_folder/kb/cfc_palmdb_t2g.txt \
#     -f1 $path_to_folder/kb/cfc_palmdb_transcriptome.fa \
#     $path_to_folder/cfc_palmdb_genome.fa \
#     $path_to_folder/cfc_palmdb_annotation.gtf

In [None]:
# Generate the kallisto index (cfc_palmdb_index.idx) directly from the
# comma-free genome FASTA; both files live in path_to_folder.
!kallisto index \
    -i $path_to_folder/cfc_palmdb_index.idx \
    $path_to_folder/cfc_palmdb_genome.fa


[build] loading fasta file ../../../Pachter_lab/comma-free/palmdb/2021-03-14/cfc_palmdb_genome.fa
[build] k-mer length: 31
        with pseudorandom nucleotides
KmerStream::KmerStream(): Start computing k-mer cardinality estimations (1/2)
KmerStream::KmerStream(): Start computing k-mer cardinality estimations (1/2)
KmerStream::KmerStream(): Finished
CompactedDBG::build(): Estimated number of k-mers occurring at least once: 38457212
CompactedDBG::build(): Estimated number of minimizer occurring at least once: 8030027
CompactedDBG::filter(): Processed 88645809 k-mers in 296623 reads
CompactedDBG::filter(): Found 38301304 unique k-mers
CompactedDBG::filter(): Number of blocks in Bloom filter is 262893
CompactedDBG::construct(): Extract approximate unitigs (1/2)
CompactedDBG::construct(): Extract approximate unitigs (2/2)
CompactedDBG::construct(): Closed all input files

CompactedDBG::construct(): Splitting unitigs (1/2)

CompactedDBG::construct(): Splitting unitigs (2/2)
CompactedDBG::c

In [1]:
# Paths used by the kallisto/bustools cells below.
# path_to_folder: directory holding the index and t2g file (same value as set earlier in this notebook).
path_to_folder = "../../../Pachter_lab/comma-free/palmdb/2021-03-14"
# out_folder: where kallisto bus writes its output and bustools reads it back.
out_folder = '../../../Pachter_lab/comma-free'
# fastq_folder: directory with the test FASTQ files.
fastq_folder = '../../../Pachter_lab/comma-free/test_files'

In [39]:
# Pseudoalign the two test FASTQ files against the comma-free index using the
# 10xv2 technology setting; output is written into out_folder.
# NOTE(review): --cfc and -n are not explained in this notebook — presumably
# --cfc enables comma-free handling of the reads; confirm against the kallisto build in use.
!kallisto bus \
    -i $path_to_folder/cfc_palmdb_index.idx \
    -o $out_folder \
    -x 10xv2 \
    --cfc \
        -n \
    $fastq_folder/SRR9887677_1_modified.fastq \
    $fastq_folder/SRR9887677_2_modified.fastq


[bus] Note: Strand option was not specified; setting it to --fr-stranded for specified technology
[quant] will process sample 1: ../../../Pachter_lab/comma-free/test_files/SRR9887677_1_modified.fastq
                               ../../../Pachter_lab/comma-free/test_files/SRR9887677_2_modified.fastq
[quant] finding pseudoalignments for the reads ...Aligned frame: 0
Aligned frame: 0
Aligned frame: 0
 done
[quant] processed 20 reads, 3 reads pseudoaligned



In [22]:
# Sort output.bus into output_sorted.bus (the file later passed to bustools count).
!bustools sort -o $out_folder/output_sorted.bus $out_folder/output.bus

Read in 3 BUS records


In [40]:
# Print the BUS records as text.
# Note: this reads the unsorted output.bus, not output_sorted.bus.
!bustools text -pf $out_folder/output.bus

TCACTTCGACTCACTA	CAGGGCGGCA	0	1	6
ACAATACGACTCACTA	CAGGGGGGTT	1	1	7
GTGGTACGACTCACTA	CAGGGACGTT	2	1	11
Read in 3 BUS records


In [24]:
# # Read bus file (output from single frame run)
# !bustools text -p $out_folder/single_frame_test/output_sorted.bus

In [31]:
# Build the count matrix from the sorted BUS file. Inputs: the t2g map from
# path_to_folder, plus matrix.ec and transcripts.txt from out_folder.
# Results are written under out_folder/bustools_count/.
!bustools count \
    -o $out_folder/bustools_count/ \
    -g $path_to_folder/cfc_palmdb_t2g.txt \
    -e $out_folder/matrix.ec \
    -t $out_folder/transcripts.txt \
    $out_folder/output_sorted.bus

In [32]:
from scipy.io import mmread

# Load the Matrix Market file from the bustools_count output directory and
# display it as a dense matrix.
count_mtx_file = f"{out_folder}/bustools_count/output.mtx"
mtx = mmread(count_mtx_file)
mtx.todense()

matrix([[0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 0.]])

In [33]:
# Display the sparse matrix object itself (its repr, rather than the dense view).
mtx

<3x3 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in COOrdinate format>