# Extract cancer variant reads for alignment visualization

#### Requirements: kb, samtools, and bowtie2

In [27]:
# ---- Reference inputs ----
# COSMIC data frame used for the vk build (looked up later to annotate the top variants)
cosmic_gget_mut_df = "/home/jrich/data/varseek_data/reference/cosmic/CancerMutationCensus_AllData_Tsv_v100_GRCh37/CancerMutationCensus_AllData_v100_GRCh37_mutation_workflow_with_cdna.csv"

# Files inside the vk build output directory: VCRS FASTA, index passed to kb (-i), and t2g passed to kb (-g)
vk_build_dir = "/home/jrich/data/varseek_data_fresh/vk_build_pipeline_grch37_ensembl93"
vcrs_fa_path = f"{vk_build_dir}/vcrs_filtered.fa"
index = f"{vk_build_dir}/mutation_reference.idx"
t2g = f"{vk_build_dir}/t2g_filtered.txt"
k = "59"  # value passed to kb's -k option

# ---- Sequencing data ----
# RNAseq of OE21_OESOPHAGUS
fastq_dir = "/home/COH_data/cancer_variants_stuff/cancer_cell_lines"
fastq1 = f"{fastq_dir}/SRR8615227_1.fastq.gz"
fastq2 = f"{fastq_dir}/SRR8615227_2.fastq.gz"

# ---- Outputs (everything is written below base_out) ----
base_out = "/home/jrich/data/varseek_data_fresh/alignment_visualization"

# Shortened copies of fastq1 / fastq2 (only used to test this notebook)
fastq1_short = f"{base_out}/SRR8615227_1_short.fastq.gz"
fastq2_short = f"{base_out}/SRR8615227_2_short.fastq.gz"

# kb count output directory, and the kb extract output directory nested inside it
out = f"{base_out}/SRR8615227"
extract_out = f"{out}/extract_out"

# bowtie2 index directory and bowtie2 alignment output directory
b_index = f"{base_out}/bowtie_indices"
bowtie_out = f"{base_out}/bowtie_alignments"

# ---- External tools ----
bowtie2 = "/home/jrich/opt/bowtie2-2.5.4/bowtie2-2.5.4-linux-x86_64/bowtie2"
samtools = "/home/jrich/miniconda3/envs/kvar/bin/samtools"

In [10]:
import anndata
import os
import numpy as np
import pandas as pd
import pyfastx
# pd.set_option('display.max_columns', None)

# Number of threads handed to kb (-t) and bowtie2 (-p) in the cells below
threads = 16

In [None]:
# Create new file keeping only first X reads (to test this workflow)
!zcat $fastq1 | head -8000000 > $fastq1_short
# !zcat $fastq2 | head -8000000 > $fastq2_short

### Align test dataset to cancer index

In [13]:
%%time

# running only 1 fastq and doing single-end mode because kb extract only takes in 1 fastq; also, using short fastq instead of full one so that I extract the top reads within this fastq, not overall (both of these things differ from original run - see git history before 11/11 for original)

!kb count \
    -t $threads \
    -i $index \
    -g $t2g \
    -x bulk \
    -k $k \
    --h5ad \
    --parity single \
    -o $out \
    $fastq1_short

[2024-11-05 16:20:11,794]    INFO [count] Using index /home/jrich/data/varseek_data_fresh/vk_build_pipeline/mutation_reference.idx to generate BUS file to /home/jrich/data/varseek_data_fresh/alignment_visualization/SRR8615227 from
[2024-11-05 16:20:11,794]    INFO [count]         /home/COH_data/cancer_variants_stuff/cancer_cell_lines/SRR8615227_1.fastq.gz
[2024-11-05 16:20:11,794]    INFO [count]         /home/COH_data/cancer_variants_stuff/cancer_cell_lines/SRR8615227_2.fastq.gz
[2024-11-05 16:27:11,115]    INFO [count] Sorting BUS file /home/jrich/data/varseek_data_fresh/alignment_visualization/SRR8615227/output.bus to /home/jrich/data/varseek_data_fresh/alignment_visualization/SRR8615227/tmp/output.s.bus
[2024-11-05 16:27:14,825]    INFO [count] Inspecting BUS file /home/jrich/data/varseek_data_fresh/alignment_visualization/SRR8615227/tmp/output.s.bus
[2024-11-05 16:27:15,928]    INFO [count] Generating count matrix /home/jrich/data/varseek_data_fresh/alignment_visualization/SRR8615

### Pull out reads for top 10 mutants

Identify top 10 mutants:

In [15]:
# Count matrix written by kb count (--h5ad) into the kb count output directory
adata_path = f"{out}/counts_unfiltered/adata.h5ad"
adata = anndata.read_h5ad(adata_path)
adata

AnnData object with n_obs × n_vars = 1 × 5259437

In [15]:
# Counts of the first (only) observation as a flat array, one value per variant
variant_counts = np.asarray(adata.X.todense())[0]

# Positions of the 10 largest counts, highest first, mapped back to variant IDs
top10 = adata.var.index[np.argsort(variant_counts)[::-1][:10]]
top10

Index(['seq_0688768', 'seq_3311555', 'seq_5257480', 'seq_5188714',
       'seq_1338451', 'seq_5257481', 'seq_1668855', 'seq_1668852',
       'seq_4327812', 'seq_1668681'],
      dtype='object', name='gene_id')

Show metadata for top 10 mutations:

In [28]:
# low_memory=False lets pandas infer each column's dtype from the whole file rather than
# chunk by chunk, so columns holding both numbers and text (e.g. chromosome: 12, 19, MT)
# get one consistent dtype.
# NOTE(review): assumes the warning previously emitted by this cell was a mixed-dtype DtypeWarning - confirm
cosmic_df = pd.read_csv(cosmic_gget_mut_df, low_memory=False)

# Join key "<transcript ID>:<mutation>". It is matched against the header_cds column of the
# variant metadata, so it must use the CDS-based mutation (mutation_cds); the `mutation` column
# holds a different coordinate for transcripts such as FTL (c.370T>C vs c.163T>C) and would not match.
cosmic_df["seq_ID:mutation"] = cosmic_df["seq_ID"] + ":" + cosmic_df["mutation_cds"]

  cosmic_df = pd.read_csv(cosmic_gget_mut_df)


In [18]:
# Only the VCRS ID and its CDS-based header are read from the variant table
columns_to_load = ["header_cds", "vcrs_id"]
file_path = "/home/jrich/data/varseek_data_fresh/vk_build_pipeline_grch37_ensembl93/variants_updated_filtered.csv"

metadata_df = pd.read_csv(filepath_or_buffer=file_path, usecols=columns_to_load)

# Function to process each entry
def process_header_cds(s):
    """Convert a stringified list of headers into a single ';'-joined string.

    Args:
        s: String form of a list of headers, e.g. "['ENST1:c.1A>G', 'ENST2:c.5del']".
            A bare header without brackets or quotes is returned unchanged.

    Returns:
        The headers joined with ';', e.g. "ENST1:c.1A>G;ENST2:c.5del".
    """
    inner = s.strip("[]'\"")  # Remove the enclosing brackets and the outermost quotes
    # Every element of a multi-entry list carries its own quotes; strip them per entry so
    # they do not leak into the joined string (previously "['A', 'B']" became "A';'B")
    entries = [entry.strip("'\"") for entry in inner.split(", ")]  # Split by comma and space
    return ';'.join(entries)  # Join with semicolon

# Normalize every stringified header list into a ';'-joined string
metadata_df['header_cds'] = metadata_df['header_cds'].apply(process_header_cds).astype(str)

# VCRS ID -> header lookup, used to translate the top 10 VCRS IDs into headers
id_to_cds_header_dict = dict(zip(metadata_df['vcrs_id'], metadata_df['header_cds']))
top10_headers = [id_to_cds_header_dict[x] for x in top10]

# Same mapping as a two-column DataFrame so it can be joined onto cosmic_df
id_to_cds_header_df = pd.DataFrame(list(id_to_cds_header_dict.items()), columns=['vcrs_id', 'seq_ID:mutation'])

# Attach the VCRS ID to each COSMIC row (left join keeps every COSMIC row).
# A vcrs_id column left over from an earlier run of this cell is dropped first; without that,
# re-running the cell would produce vcrs_id_x / vcrs_id_y columns instead of vcrs_id.
cosmic_df = (
    cosmic_df
    .drop(columns='vcrs_id', errors='ignore')
    .merge(id_to_cds_header_df, on='seq_ID:mutation', how='left')
)

In [31]:
# COSMIC rows whose "seq_ID:mutation" key belongs to one of the top 10 variants
cosmic_df.loc[cosmic_df["seq_ID:mutation"].isin(top10_headers)]

Unnamed: 0,seq_ID,mutation_cds,mutation_aa,GENOMIC_MUTATION_ID,chromosome,strand,mutation_genome,gene_name,mutation_id,mutation,seq_ID:mutation,vcrs_id
689965,ENST00000362079,c.174G>A,p.W58=,COSV104668806,MT,+,g.9380G>A,MT-CO3,37406576,c.174G>A,ENST00000362079:c.174G>A,seq_0688768
1341346,ENST00000331825,c.163T>C,p.L55=,COSV53169473,19,+,g.49469087T>C,FTL,26232420,c.370T>C,ENST00000331825:c.163T>C,seq_1338451
1673880,ENST00000252242,c.351C>T,p.L117=,COSV52862629,12,-,g.52913730G>A,KRT5,22187999,c.742C>T,ENST00000252242:c.351C>T,seq_1668681
1674052,ENST00000252242,c.732G>A,p.L244=,COSV107254094,12,-,g.52912768C>T,KRT5,22187988,c.1123G>A,ENST00000252242:c.732G>A,seq_1668852
1674055,ENST00000252242,c.1065A>C,p.T355=,COSV52861331,12,-,g.52911401T>G,KRT5,22187981,c.1456A>C,ENST00000252242:c.1065A>C,seq_1668855
3331623,ENST00000361390,c.609G>A,p.G203=,COSV62293499,MT,+,g.3915G>A,MT-ND1,38370056,c.609G>A,ENST00000361390:c.609G>A,seq_3311555
4357198,ENST00000330722,c.819T>C,p.D273=,COSV58104299,12,-,g.52884735A>G,KRT6A,29208107,c.888T>C,ENST00000330722:c.819T>C,seq_4327812
5229642,ENST00000361390,c.189dup,p.A64Rfs*47,COSV106517221,MT,,g.3495_3496dup,MT-ND1,38371054,c.189dup,ENST00000361390:c.189dup,seq_5188714
5298953,ENST00000361624,c.516del,p.K172Nfs*?,COSV107450106,MT,,g.6419del,MT-CO1,35129715,c.516del,ENST00000361624:c.516del,seq_5257480
5298954,ENST00000361624,c.795dup,p.E266Rfs*?,COSV104419744,MT,,g.6698_6699dup,MT-CO1,35129625,c.795dup,ENST00000361624:c.795dup,seq_5257481


In [69]:
# TODO: replace this with my bustools df command

targets = " ".join(top10)

# Replace fastq1_short with fastq1 to extract all reads
!kb extract \
    -t $threads \
    -g $t2g \
    -i $index \
    -k $k \
    -ts $targets \
    -ttype transcript \
    -o $extract_out \
    $fastq1_short

[2024-11-05 17:30:34,496]    INFO [extract] Performing alignment using kallisto...
[2024-11-05 17:30:34,497]    INFO [extract] Using index /home/jrich/data/varseek_data_fresh/vk_build_pipeline/mutation_reference.idx to generate BUS file to /home/jrich/data/varseek_data_fresh/alignment_visualization/SRR8615227/extract_out/tmp from
[2024-11-05 17:30:34,497]    INFO [extract]         /home/jrich/data/varseek_data_fresh/alignment_visualization/SRR8615227_1_short.fastq.gz
[2024-11-05 17:31:18,044]    INFO [extract] Alignment complete. Beginning extraction of reads using bustools...
[2024-11-05 18:00:28,579]    INFO [extract] Extracting reads for the following transcript: seq_0688768
[2024-11-05 18:00:28,579]    INFO [extract] Capturing records from BUS file /home/jrich/data/varseek_data_fresh/alignment_visualization/SRR8615227/extract_out/tmp/output.bus to /home/jrich/data/varseek_data_fresh/alignment_visualization/SRR8615227/extract_out/tmp/output_extracted_seq_0688768.bus with capture lis

### Align extracted reads with bowtie2 to create bam file for alignment visualization

#### Get/build bowtie2 index for human genome:

I opted for downloading the pre-generated bowtie2 index for 'Human / GRCh38 no-alt analysis set' from https://benlangmead.github.io/aws-indexes/bowtie (05/24/2024).

In [71]:
os.makedirs(b_index, exist_ok=True)
if not os.path.exists(f"{b_index}/GRCh38_noalt_as"):
    !cd $b_index && curl -O https://genome-idx.s3.amazonaws.com/bt/GRCh38_noalt_as.zip && unzip GRCh38_noalt_as.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3575M  100 3575M    0     0  9690k      0  0:06:17  0:06:17 --:--:--  9.8M75M    2 88.9M    0     0  9107k      0  0:06:42  0:00:09  0:06:33  9.8M35 3575M   35 1256M    0     0  8750k      0  0:06:58  0:02:26  0:04:32  9.7M8 3575M   38 1374M    0     0  8795k      0  0:06:56  0:02:40  0:04:16 9745k3575M   84 3016M    0     0  9532k      0  0:06:24  0:05:23  0:01:01 9462k3575M   91 3260M    0     0  9621k      0  0:06:20  0:05:46  0:00:34 8340kM    0     0  9695k      0  0:06:17  0:06:05  0:00:12 11.1M7M
Archive:  GRCh38_noalt_as.zip
   creating: GRCh38_noalt_as/
  inflating: GRCh38_noalt_as/GRCh38_noalt_as.3.bt2  
  inflating: GRCh38_noalt_as/GRCh38_noalt_as.rev.2.bt2  
  inflating: GRCh38_noalt_as/GRCh38_noalt_as.rev.1.bt2  
  inflating: GRCh38_noalt_as/GRCh38_noalt_as.1.bt2  
  inflating: GRCh38_noalt_as/GRCh38_noalt_as.2.bt

Align the extracted reads to the human genome to generate BAM files:

In [14]:
%%time
import glob
data = extract_out

vcrs_fa = pyfastx.Fasta(vcrs_fa_path, build_index=True)

os.makedirs(bowtie_out, exist_ok=True)

for folder in glob.glob(data + "/*/"):
    variant = folder.split("/")[-2]
    print("\n" + variant)

    outfolder = f"{bowtie_out}/{variant}"
    !mkdir -p $outfolder
    
    # Align reads to human ref using bowtie2
    variant_sam = variant + ".sam"
    !$bowtie2 \
        --very-sensitive \
        -k 1 \
        -x $b_index/GRCh38_noalt_as/GRCh38_noalt_as \
        -p $threads \
        -q $data/$variant/1.fastq.gz \
        -S $outfolder/$variant_sam

    # Convert sam to bam
    variant_bam = variant + ".bam"
    !$samtools view \
        -bS -F4 $outfolder/$variant_sam \
        > $outfolder/$variant_bam
    
    # Sort bam file
    variant_bam_sorted = variant + "_sorted.bam"
    !$samtools sort \
        $outfolder/$variant_bam \
        -o $outfolder/$variant_bam_sorted
    
    # Create an index for the sorted bam file (creates a .bai file)
    !$samtools index $outfolder/$variant_bam_sorted




    # repeat but for vcrs
    outfolder_vcrs = f"{bowtie_out}/{variant}_vcrs"
    !mkdir -p $outfolder_vcrs
    variant_reference_sequence = vcrs_fa[variant].seq.strip()

    # Align reads to human ref using bowtie2
    variant_sam = variant + ".sam"
    !$bowtie2 \
        --very-sensitive \
        -k 1 \
        -x $b_index/GRCh38_noalt_as/GRCh38_noalt_as \
        -p $threads \
        -c $variant_reference_sequence \
        -S $outfolder_vcrs/$variant_sam

    # Convert sam to bam
    variant_bam = variant + ".bam"
    !$samtools view \
        -bS -F4 $outfolder_vcrs/$variant_sam \
        > $outfolder_vcrs/$variant_bam

    # Sort bam file
    variant_bam_sorted = variant + "_sorted_vcrs.bam"
    !$samtools sort \
        $outfolder_vcrs/$variant_bam \
        -o $outfolder_vcrs/$variant_bam_sorted

    # Create an index for the sorted bam file (creates a .bai file)
    !$samtools index $outfolder_vcrs/$variant_bam_sorted


seq_5257481
7681 reads; of these:
  7681 (100.00%) were unpaired; of these:
    35 (0.46%) aligned 0 times
    7646 (99.54%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
99.54% overall alignment rate
1 reads; of these:
  1 (100.00%) were unpaired; of these:
    0 (0.00%) aligned 0 times
    1 (100.00%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
100.00% overall alignment rate

seq_5257480
19 reads; of these:
  19 (100.00%) were unpaired; of these:
    1 (5.26%) aligned 0 times
    18 (94.74%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
94.74% overall alignment rate
1 reads; of these:
  1 (100.00%) were unpaired; of these:
    0 (0.00%) aligned 0 times
    1 (100.00%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
100.00% overall alignment rate

seq_0688768
2458 reads; of these:
  2458 (100.00%) were unpaired; of these:
    199 (8.10%) aligned 0 times
    2259 (91.90%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
91.90% overall alignment rat

#### BAM files were downloaded to a local computer with `scp -r laura@dator:/home/laura/projects/cancer_variants/data/bowtie_alignments ./` and visualized using NCBI Genome Workbench

For me: `scp -r jrich@dator:/home/jrich/data/varseek_data_fresh/alignment_visualization/bowtie_alignments ./`