# Day 2: SQANTI3 Practice Session
see https://github.com/ConesaLab/SQANTI3


# Setup

In [93]:
# imports
import os
import subprocess
import pandas as pd

In [None]:
# initialize integrated IGV viewer in the notebook
# NOTE(review): presumably init() has to run once per kernel session before any
# igv_notebook.Browser(...) cell further down — confirm against the igv-notebook docs
import igv_notebook
igv_notebook.init()

## Installing SQANTI3

In [94]:
# Absolute location of the unpacked SQANTI3 release below the user's home directory
sqanti_path = os.path.abspath(
    os.path.join(os.path.expanduser("~"), "tools", "SQANTI3-5.4")
)

In [95]:
%%script echo Skipping SQANTI3 installation
# NOTE: the `%%script echo ...` line above hands this whole cell to `echo`, so none of
# the commands below are executed (the cell only prints "Skipping SQANTI3 installation").
# Remove that first line to actually perform the installation.
# How to install SQANTI3?
# see also: https://github.com/ConesaLab/SQANTI3/wiki/Dependencies-and-installation
sqanti_zip = "~/downloads/SQANTI3_v5.4.zip"
sqanti_env = "sus_sqanti"
# download the v5.4 release archive
!wget https://github.com/ConesaLab/SQANTI3/releases/download/v5.4/SQANTI3_v5.4.zip -O $sqanti_zip
# WARNING: wipes any existing installation at $sqanti_path before unpacking
!rm -rf $sqanti_path
!mkdir -p $sqanti_path
!unzip $sqanti_zip -d $sqanti_path
# move the contents of the archive's release_sqanti3/ folder up into $sqanti_path
!mv $sqanti_path/release_sqanti3/* $sqanti_path/
!rm -r $sqanti_path/release_sqanti3
# run in parent shell (outside of this notebook)
# !micromamba env create -f $sqanti_path/SQANTI3.conda_env.yml -n $sqanti_env
# !micromamba activate $sqanti_env
# additional requirements for this course
# !micromamba install -y ipykernel

Skipping SQANTI3 installation


## Preparing paths to data ...

In [96]:
# Number of threads handed to every SQANTI3 call via --cpus
n_cores = 8

# Root of the LRGASP input data. The default matches the course machine; set the
# LRGASP_DATA_DIR environment variable to use a different location without editing
# the notebook.
data_dir = os.environ.get("LRGASP_DATA_DIR", "/mnt/c/data/lrgasp/homo_sapiens")
# Root below which all SQANTI3 results are written; "../../data" is resolved against
# the current working directory at the time this cell runs.
base_sus_path = os.path.abspath("../../data")
sample_name = "h1_endo_chr8_isotools"

# reference
ref_genome = f"{data_dir}/reference/chr8/GRCh38.primary_assembly.genome_chr8.fa"
ref_gtf = f"{data_dir}/reference/chr8/gencode.v45.annotation_chr8.gtf"
ref_qc_prefix = "gencode.v45.annotation_chr8_qc"
ref_qc_dir = f"{base_sus_path}/sqanti_qc/gencode.v45.annotation_chr8"

# main data: every step (QC -> filter -> rescue) gets its own output prefix and
# output directory, all derived from sample_name
input_gtf = f"{data_dir}/isotools/h1_endo_chr8_all.gtf"
qc_prefix = f"{sample_name}_qc"
qc_dir = f"{base_sus_path}/sqanti_qc/{sample_name}"
filter_prefix = f"{sample_name}_filter"
filter_dir = f"{base_sus_path}/sqanti_filter/{sample_name}"
rescue_prefix = f"{sample_name}_rescue"
rescue_dir = f"{base_sus_path}/sqanti_rescue/{sample_name}"


In [97]:
# Orthogonal evidence files used by SQANTI3

# ... shipped inside the SQANTI3 release directory
polyA_motifs = sqanti_path + "/data/polyA_motifs/mouse_and_human.polyA_motif.txt"
cage = sqanti_path + "/data/ref_TSS_annotation/human.refTSS_v3.1.hg38.bed"

# ... located below the course data directory
# (`counts` is currently only referenced by commented-out options in the command cells)
counts = data_dir + "/isotools/h1_endo_chr8_all_count.txt"
polyA_peaks = data_dir + "/reference/orthogonal/chr8/atlas.clusters.2.0.GRCh38.96_chr8.bed"
splice_junctions = data_dir + "/endodermal/illumina_cdna/chr8/ENCFF498FDF_ENCFF181VTPSJ.out_chr8.tab"
sr_bam = data_dir + "/endodermal/illumina_cdna/chr8/ENCFF498FDF_ENCFF181VTPAligned.sortedByCoord.out_chr8.bam"

# SQANTI3 Quality Control

## TODO: Motivate SQANTI3 QC
Copy some basic information, graphs, etc. about SQANTI from the docs and tutorials

In [99]:
# How to run SQANTI3 Quality Control?
# see also: https://github.com/ConesaLab/SQANTI3/wiki/Running-SQANTI3-Quality-Control#running
# Shell escape: prints the command-line usage of the QC script;
# $sqanti_path is filled in from the Python variable of the same name.
!$sqanti_path/sqanti3_qc.py --help


      ░██████╗░░█████╗░
      ██╔═══██╗██╔══██╗
      ██║██╗██║██║░░╚═╝
      ╚██████╔╝██║░░██╗
      ░╚═██╔═╝░╚█████╔╝
      ░░░╚═╝░░░░╚════╝░
    
usage: sqanti3_qc.py [-h] --isoforms ISOFORMS --refGTF REFGTF --refFasta
                     REFFASTA [--min_ref_len MIN_REF_LEN] [--force_id_ignore]
                     [--fasta] [--genename]
                     [--novel_gene_prefix NOVEL_GENE_PREFIX] [-s SITES]
                     [-w WINDOW]
                     [--aligner_choice {minimap2,deSALT,gmap,uLTRA}]
                     [-x GMAP_INDEX] [--skipORF] [--orf_input ORF_INPUT]
                     [--short_reads SHORT_READS] [--SR_bam SR_BAM]
                     [--CAGE_peak CAGE_PEAK]
                     [--polyA_motif_list POLYA_MOTIF_LIST]
                     [--polyA_peak POLYA_PEAK] [--phyloP_bed PHYLOP_BED]
                     [-e EXPRESSION] [-c COVERAGE] [-fl FL_COUNT]
                     [--isoAnnotLite] [--gff3 GFF3] [-o OUTPUT] [-d DIR]
                     [--s

## Running SQ3 Quality Control

In [100]:
# Build the SQANTI3 QC command.
# The command is an argument list (no shell involved), so paths need no quoting.
cmd = [
    "/usr/bin/time", "-v",
    f"{sqanti_path}/sqanti3_qc.py",         # SQANTI3 QC script
    "--isoforms", input_gtf,                # GTF to Quality Control
    "--refGTF", ref_gtf,                    # Reference GTF
    "--refFasta", ref_genome,               # Reference Genome
    "--polyA_motif_list", polyA_motifs,     # PolyA Motif List
    "--polyA_peak", polyA_peaks,            # PolyA Peaks
    # "--fl_count", counts,                   # Counts file
    "--coverage", splice_junctions,         # splice junction short-read coverage file (from STAR)
    "--CAGE_peak", cage,                    # CAGE Peak file
    "--SR_bam", sr_bam,                     # Short-read BAM file
    "--output", qc_prefix,                  # Output Prefix
    "--dir", qc_dir,                        # Output Location
    "--skipORF",                            # Skip ORF Prediction (takes longer)
    "--cpus", str(n_cores)                  # Number of Threads
]

# Print the command for reference
print("Running command:")
print(" ".join(cmd))

# Run the command; both streams are captured as text and shown once it has finished
result = subprocess.run(cmd, capture_output=True, text=True)

# Print output and errors.
# SQANTI3 writes its INFO log lines to stderr, so text under "Standard Error"
# does not by itself indicate a failure.
print("Standard Output:")
print(result.stdout)
print("Standard Error:")
print(result.stderr)

# Stop here if the run failed, instead of letting a later cell trip over missing
# output files with a less helpful error.
if result.returncode != 0:
    raise RuntimeError(
        f"SQANTI3 QC exited with code {result.returncode}; see the Standard Error output above."
    )

Running command:
/usr/bin/time -v /home/fjetzinger/tools/SQANTI3-5.4/sqanti3_qc.py --isoforms /mnt/c/data/lrgasp/homo_sapiens/isotools/h1_endo_chr8_all.gtf --refGTF /mnt/c/data/lrgasp/homo_sapiens/reference/chr8/gencode.v45.annotation_chr8.gtf --refFasta /mnt/c/data/lrgasp/homo_sapiens/reference/chr8/GRCh38.primary_assembly.genome_chr8.fa --polyA_motif_list /home/fjetzinger/tools/SQANTI3-5.4/data/polyA_motifs/mouse_and_human.polyA_motif.txt --polyA_peak /mnt/c/data/lrgasp/homo_sapiens/reference/orthogonal/chr8/atlas.clusters.2.0.GRCh38.96_chr8.bed --coverage /mnt/c/data/lrgasp/homo_sapiens/endodermal/illumina_cdna/chr8/ENCFF498FDF_ENCFF181VTPSJ.out_chr8.tab --CAGE_peak /home/fjetzinger/tools/SQANTI3-5.4/data/ref_TSS_annotation/human.refTSS_v3.1.hg38.bed --SR_bam /mnt/c/data/lrgasp/homo_sapiens/endodermal/illumina_cdna/chr8/ENCFF498FDF_ENCFF181VTPAligned.sortedByCoord.out_chr8.bam --output h1_endo_chr8_isotools_qc --dir /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_qc/h1_endo

## Investigating SQANTI3 QC results

see also: https://github.com/ConesaLab/SQANTI3/wiki/Understanding-the-output-of-SQANTI3-QC

### Can you find answers to the following Questions in the SQANTI3 Quality Control report?

SQANTI3 provides a detailed report with a great variety of information, statistics, and plots that detail the results of the Quality Control process. You can find it under `data/sqanti_qc/h1_endo_chr8_isotools/h1_endo_chr8_isotools_qc_SQANTI3_report.html`

1. **Splice Junction Classification:**
    
    a. How many non-canonical splice junctions exist in the transcriptome? How many of them are known vs. novel?

    b. Which structural categories do you think the majority of transcripts with novel non-canonical splice junctions belong to? 

2. **Exon Structure:**

    a. How many transcripts are multi- vs. mono-exonic? How does this differ between reference and novel transcripts?

    b. Can you think of some possible reasons for novel transcripts to contain more mono-exons than reference transcripts?

3. **Features of Good Quality:**

    a. Examine how the support levels of annotation, canonical splice junctions, splice junction coverage, and CAGE coverage differ across the FSM, ISM, NIC, and NNC structural categories. Then, try to think of possible explanations for what you are observing.

    b. Can you explain the following observations?

    1. ISM transcripts show noticeably lower levels of CAGE support.

    2. NNC transcripts show noticeably lower levels of canonical splice junctions as well as splice junction coverage.


### Investigate in Genome Viewer...

In [135]:
# Embedded IGV browser showing the QC-corrected IsoTools transcriptome next to the
# QC-corrected reference annotation.
# Track file names are derived from the configuration variables (sample_name, qc_prefix,
# ref_qc_prefix) so they follow the outputs written by the QC cells. The "../../data"
# prefix is kept relative on purpose.
# NOTE(review): presumably igv_notebook resolves "path" entries relative to the
# notebook location — confirm against the igv-notebook docs.
igv_browser_qc = igv_notebook.Browser(
    {
        "reference": {
            "id": "hg38",
            "name": "Human (GRCH38/hg38)",
            "fastaPath": "../../data/reference/GRCh38.primary_assembly.genome_chr8.fa",
            "indexPath": "../../data/reference/GRCh38.primary_assembly.genome_chr8.fa.fai"
        },
        "locus": "chr8:75,391,802-75,577,425",
        "tracks": [
            {
                "name": "IsoTools",
                "path": f"../../data/sqanti_qc/{sample_name}/{qc_prefix}_corrected.gtf",
                "format": "gtf",
                "type": "annotation",
                "displayMode": "SQUISHED"
            },
            {
                "name": "Reference",
                "path": f"../../data/sqanti_qc/gencode.v45.annotation_chr8/{ref_qc_prefix}_corrected.gtf",
                "format": "gtf",
                "type": "annotation",
                "displayMode": "SQUISHED"
            }
        ]
    }
)

<IPython.core.display.Javascript object>

# SQANTI3 Filter

## TODO: Motivate SQANTI3 Filter
Copy some basic information, graphs, etc. about SQANTI from the docs and tutorials

In [None]:
# How to run SQ3 Filter?
# see also: https://github.com/ConesaLab/SQANTI3/wiki/Running-SQANTI3-filter
# specifically for the Machine Learning Filter: https://github.com/ConesaLab/SQANTI3/wiki/Running-SQANTI3-filter#ml
# Shell escape: prints the usage of the filter script's `ml` sub-command;
# $sqanti_path is filled in from the Python variable of the same name.
!$sqanti_path/sqanti3_filter.py ml --help


      ███████╗██╗██╗░░░░░████████╗███████╗██████╗░
      ██╔════╝██║██║░░░░░╚══██╔══╝██╔════╝██╔══██╗
      █████╗░░██║██║░░░░░░░░██║░░░█████╗░░██████╔╝
      ██╔══╝░░██║██║░░░░░░░░██║░░░██╔══╝░░██╔══██╗
      ██║░░░░░██║███████╗░░░██║░░░███████╗██║░░██║
      ╚═╝░░░░░╚═╝╚══════╝░░░╚═╝░░░╚══════╝╚═╝░░╚═╝
    
usage: sqanti3_filter.py ml [-h] --sqanti_class SQANTI_CLASS
                            [--isoAnnotGFF3 ISOANNOTGFF3]
                            [--filter_isoforms FILTER_ISOFORMS]
                            [--filter_gtf FILTER_GTF]
                            [--filter_sam FILTER_SAM]
                            [--filter_faa FILTER_FAA] [-o OUTPUT] [-d DIR]
                            [--skip_report] [-e] [-v] [-c CPUS]
                            [-t PERCENT_TRAINING] [-p TP] [-n TN]
                            [-j THRESHOLD] [-f] [--intermediate_files]
                            [-r REMOVE_COLUMNS] [-z MAX_CLASS_SIZE]
                            [-i INTRAPRIMING]

ML fil

## Running SQ3 Filter

### Preprocessing: Choosing True Positive / True Negative Transcript Sets 
While this can be done automatically, we have found the following set of criteria to yield more stringent filtering results.
The SQANTI3 Machine Learning Filter then trains a Random Forest model on the basis of these True Positive / True Negative transcripts, which is then applied to filter the entire transcriptome.

In [105]:
# Build the True Positive / True Negative training sets and the feature exclusion
# list for the SQANTI3 ML filter from the QC classification table.
classification_file = os.path.join(qc_dir, f"{qc_prefix}_classification.txt")
filter_sets_dir = os.path.join(filter_dir, "filter_sets")
max_set_size = 3000   # upper bound on the number of isoforms per training set
min_set_size = 250    # lower bound below which this cell refuses to continue

# Load classification file
df = pd.read_csv(classification_file, sep='\t', low_memory=False)

# Ensure required columns are present.
# 'exons' is included because both the TP and the TN definition below select on it.
required_columns = [
    'structural_category', 'all_canonical', 'within_CAGE_peak',
    'within_polyA_site', 'exons', 'isoform',
]
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"Required column '{col}' not found in the classification file.")

# Define True Positives (TP):
# multi-exon full-splice matches with only canonical junctions that are supported by
# both a CAGE peak and a polyA site
tp_conditions = (
    (df['structural_category'] == 'full-splice_match') &
    (df['all_canonical'] == "canonical") &
    (df['within_CAGE_peak'] == True) &
    (df['within_polyA_site'] == True) &
    (df['exons'] > 1)
)
tp_df = df[tp_conditions]

# Define True Negatives (TN):
# multi-exon non-FSM transcripts that lack at least one of the supports required above.
# `!= True` is also satisfied by missing values, so transcripts without CAGE/polyA
# information count as unsupported.
tn_conditions = (
    (df['structural_category'] != 'full-splice_match') &
    (
        (df['all_canonical'] == "non_canonical") |
        (df['within_CAGE_peak'] != True) |
        (df['within_polyA_site'] != True)
    ) &
    (df['exons'] > 1)
)
tn_df = df[tn_conditions]

# Sample up to max_set_size transcripts for each set (fixed seed for reproducibility)
tp_sample = tp_df.sample(n=min(len(tp_df), max_set_size), random_state=42)
tn_sample = tn_df.sample(n=min(len(tn_df), max_set_size), random_state=42)

if len(tp_sample) < min_set_size:
    raise ValueError(f"Not enough TP transcripts found. Found {len(tp_sample)}, required {min_set_size}.")
if len(tn_sample) < min_set_size:
    raise ValueError(f"Not enough TN transcripts found. Found {len(tn_sample)}, required {min_set_size}.")

# Save isoform IDs to files (one ID per line, no header)
os.makedirs(filter_sets_dir, exist_ok=True)
tp_file = os.path.join(filter_sets_dir, "TP_list.txt")
tn_file = os.path.join(filter_sets_dir, "TN_list.txt")
tp_sample['isoform'].to_csv(tp_file, index=False, header=False)
tn_sample['isoform'].to_csv(tn_file, index=False, header=False)

print(f"Saved {len(tp_sample)} TP isoforms to {tp_file}")
print(f"Saved {len(tn_sample)} TN isoforms to {tn_file}")

# Exclusion file for variables that shouldn't be used in ML filtering:
# the columns used to define TP/TN above, plus the distance columns that belong to
# the same CAGE / polyA evidence
exclusion_file = os.path.join(filter_sets_dir, "exclusion_list.txt")
with open(exclusion_file, 'w') as f:
    f.write("all_canonical\n")
    f.write("within_CAGE_peak\n")
    f.write("within_polyA_site\n")
    f.write("dist_to_CAGE_peak\n")
    f.write("dist_to_polyA_site\n")
print(f"Exclusion list saved to {exclusion_file}")


Saved 1036 TP isoforms to /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/filter_sets/TP_list.txt
Saved 3000 TN isoforms to /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/filter_sets/TN_list.txt
Exclusion list saved to /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/filter_sets/exclusion_list.txt


In [106]:
# Run SQ3 Filter

# Inputs produced by the SQANTI3 QC step
corrected_fasta = os.path.join(qc_dir, f"{qc_prefix}_corrected.fasta")
corrected_gtf = os.path.join(qc_dir, f"{qc_prefix}_corrected.gtf")

# Build the command

# Default execution with automatic definition of TP and TN sets:
# TP: all multi-exon Reference Match (RM, subcategory of FSM where TTS/TSS within 50bp); if <250, all FSM
# TN: all multi-exon Novel Not in Catalog (NNC) that have at least one non-canonical junction; if <250, all NNC
# (cmd_default is shown for comparison only; it is not executed in this cell)
cmd_default = [
    "/usr/bin/time", "-v",
    f"{sqanti_path}/sqanti3_filter.py",         # SQANTI3 Filter script
    "ml",                                       # Mode: Machine Learning
    "--sqanti_class", classification_file,      # Classification file
    "--dir", filter_dir,                        # Output Location
    "--filter_isoforms", corrected_fasta,       # Corrected fasta (isoform sequences to filter)
    "--filter_gtf", corrected_gtf,              # GTF to filter
    "--output", filter_prefix,                  # Output prefix
    "--cpus", str(n_cores),                     # Number of cores
]

# Command with manual definition of TP and TN sets
cmd = [
    "/usr/bin/time", "-v",
    f"{sqanti_path}/sqanti3_filter.py",         # SQANTI3 Filter script
    "ml",                                       # Mode: Machine Learning
    "--sqanti_class", classification_file,      # Classification file
    "--dir", filter_dir,                        # Output Location
    "--filter_isoforms", corrected_fasta,       # Corrected fasta (isoform sequences to filter)
    "--filter_gtf", corrected_gtf,              # GTF to filter
    "--threshold", "0.7",                       # Threshold for Machine Learning
    "--TP", tp_file,                            # TP list
    "--TN", tn_file,                            # TN list
    "--remove_columns", exclusion_file,         # Exclusion list
    "--output", filter_prefix,                  # Output prefix
    "--cpus", str(n_cores),                     # Number of cores
]
# Print the command for reference
print("Running command:")
print(" ".join(cmd))

# Run the command; both streams are captured as text and shown once it has finished
result = subprocess.run(cmd, capture_output=True, text=True)

# Print output and errors
print("Output:")
print(result.stdout)
print("Errors:")
print(result.stderr)

# Stop here if the filter failed, instead of letting the rescue step trip over
# missing output files with a less helpful error.
if result.returncode != 0:
    raise RuntimeError(
        f"SQANTI3 filter exited with code {result.returncode}; see the Errors output above."
    )

Running command:
/usr/bin/time -v /home/fjetzinger/tools/SQANTI3-5.4/sqanti3_filter.py ml --sqanti_class /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_qc/h1_endo_chr8_isotools/h1_endo_chr8_isotools_qc_classification.txt --dir /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools --filter_isoforms /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_qc/h1_endo_chr8_isotools/h1_endo_chr8_isotools_qc_corrected.fasta --filter_gtf /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_qc/h1_endo_chr8_isotools/h1_endo_chr8_isotools_qc_corrected.gtf --threshold 0.7 --TP /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/filter_sets/TP_list.txt --TN /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/filter_sets/TN_list.txt --remove_columns /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/filter_sets/exclusion_list.txt --output h1_endo_chr8_

## Investigating SQANTI3 Filter Results

### Can you find answers to the following Questions in the SQANTI3 Filter report?

Similarly to the report of the Quality Control process, SQANTI3 also provides a report for the Filter process. You can find it under `data/sqanti_filter/h1_endo_chr8_isotools/h1_endo_chr8_isotools_filter_SQANTI3_filter_report.html`

1. **Q1:**
    
    a. Q1a?

    b. Q1b? 

2. **Q2:**

    a. Q2a?

    b. Q2b?

3. **Q3:**

    a. Q3a

    b. Q3b



In [134]:
# Embedded IGV browser showing the transcriptome after the ML filter next to the
# QC-corrected reference annotation.
# Track file names are derived from the configuration variables (sample_name,
# filter_prefix, ref_qc_prefix) so they follow the outputs written by the filter and
# reference-QC cells. The "../../data" prefix is kept relative on purpose.
# NOTE(review): presumably igv_notebook resolves "path" entries relative to the
# notebook location — confirm against the igv-notebook docs.
igv_browser_filter = igv_notebook.Browser(
    {
        "reference": {
            "id": "hg38",
            "name": "Human (GRCH38/hg38)",
            "fastaPath": "../../data/reference/GRCh38.primary_assembly.genome_chr8.fa",
            "indexPath": "../../data/reference/GRCh38.primary_assembly.genome_chr8.fa.fai"
        },
        "locus": "chr8:75,391,802-75,577,425",
        "tracks": [
            {
                "name": "IsoTools (after filter)",
                "path": f"../../data/sqanti_filter/{sample_name}/{filter_prefix}.filtered.gtf",
                "format": "gtf",
                "type": "annotation",
                "displayMode": "SQUISHED"
            },
            {
                "name": "Reference",
                "path": f"../../data/sqanti_qc/gencode.v45.annotation_chr8/{ref_qc_prefix}_corrected.gtf",
                "format": "gtf",
                "type": "annotation",
                "displayMode": "SQUISHED"
            }
        ]
    }
)

<IPython.core.display.Javascript object>

# SQANTI3 Rescue

## TODO: Motivate SQANTI3 Rescue
Copy some basic information, graphs, etc. about SQANTI from the docs and tutorials

In [130]:
# How to run SQ3 Rescue?
# see also: https://github.com/ConesaLab/SQANTI3/wiki/Running-SQANTI3-rescue
# Shell escape: prints the usage of the rescue script's `ml` sub-command;
# $sqanti_path is filled in from the Python variable of the same name.
!$sqanti_path/sqanti3_rescue.py ml --help


      ██████╗░███████╗░██████╗░█████╗░██╗░░░██╗███████╗
      ██╔══██╗██╔════╝██╔════╝██╔══██╗██║░░░██║██╔════╝
      ██████╔╝█████╗░░╚█████╗░██║░░╚═╝██║░░░██║█████╗░░
      ██╔══██╗██╔══╝░░░╚═══██╗██║░░██╗██║░░░██║██╔══╝░░
      ██║░░██║███████╗██████╔╝╚█████╔╝╚██████╔╝███████╗
      ╚═╝░░╚═╝╚══════╝╚═════╝░░╚════╝░░╚═════╝░╚══════╝
    
INFO:art_logger:
      ██████╗░███████╗░██████╗░█████╗░██╗░░░██╗███████╗
      ██╔══██╗██╔════╝██╔════╝██╔══██╗██║░░░██║██╔════╝
      ██████╔╝█████╗░░╚█████╗░██║░░╚═╝██║░░░██║█████╗░░
      ██╔══██╗██╔══╝░░░╚═══██╗██║░░██╗██║░░░██║██╔══╝░░
      ██║░░██║███████╗██████╔╝╚█████╔╝╚██████╔╝███████╗
      ╚═╝░░╚═╝╚══════╝╚═════╝░░╚════╝░░╚═════╝░╚══════╝
    
usage: sqanti3_rescue.py ml [-h] --filter_class FILTER_CLASS --refGTF REFGTF
                            --refFasta REFFASTA
                            [--rescue_isoforms RESCUE_ISOFORMS]
                            [--rescue_gtf RESCUE_GTF] [-k REFCLASSIF]
                            [--counts COU

## Running SQANTI3 QC to classify the reference transcriptome

In [None]:
# %%script echo Skipping SQANTI3 QC on reference
# Run SQ3 QC on reference with the same orthogonal data
# (the reference GTF is passed both as the transcriptome to classify and as the reference)
cmd = [
    "/usr/bin/time", "-v",
    f"{sqanti_path}/sqanti3_qc.py",         # SQANTI3 QC script
    "--isoforms", ref_gtf,                  # GTF to Quality Control
    "--refGTF", ref_gtf,                    # Reference GTF
    "--refFasta", ref_genome,               # Reference Genome
    "--polyA_motif_list", polyA_motifs,     # PolyA Motif List
    "--polyA_peak", polyA_peaks,            # PolyA Peaks
    # "--fl_count", counts,                 # Counts file
    "--coverage", splice_junctions,         # SJ file
    "--CAGE_peak", cage,                    # CAGE Peak file
    "--SR_bam", sr_bam,                     # SR BAM file
    "--output", ref_qc_prefix,              # Output Prefix
    "--dir", ref_qc_dir,                    # Output Location
    "--skipORF",                            # Skip ORF Prediction (takes longer)
    "--cpus", str(n_cores)                  # Number of Threads
]

# Classification table this run writes; needed by the rescue step.
# Defined before the run so the name exists even if the run is interrupted.
ref_classification = os.path.join(ref_qc_dir, f"{ref_qc_prefix}_classification.txt")

# Print the command for reference
print("Running command:")
print(" ".join(cmd))

# Run the command; both streams are captured as text and shown once it has finished
result = subprocess.run(cmd, capture_output=True, text=True)

# Print output and errors.
# SQANTI3 writes its INFO log lines to stderr, so text under "Errors" does not by
# itself indicate a failure.
print("Output:")
print(result.stdout)
print("Errors:")
print(result.stderr)

# Stop here if the run failed, instead of letting the rescue step trip over a
# missing classification file with a less helpful error.
if result.returncode != 0:
    raise RuntimeError(
        f"SQANTI3 QC on the reference exited with code {result.returncode}; see the Errors output above."
    )

Running command:
/usr/bin/time -v /home/fjetzinger/tools/SQANTI3-5.4/sqanti3_qc.py --isoforms /mnt/c/data/lrgasp/homo_sapiens/reference/chr8/gencode.v45.annotation_chr8.gtf --refGTF /mnt/c/data/lrgasp/homo_sapiens/reference/chr8/gencode.v45.annotation_chr8.gtf --refFasta /mnt/c/data/lrgasp/homo_sapiens/reference/chr8/GRCh38.primary_assembly.genome_chr8.fa --polyA_motif_list /home/fjetzinger/tools/SQANTI3-5.4/data/polyA_motifs/mouse_and_human.polyA_motif.txt --polyA_peak /mnt/c/data/lrgasp/homo_sapiens/reference/orthogonal/chr8/atlas.clusters.2.0.GRCh38.96_chr8.bed --coverage /mnt/c/data/lrgasp/homo_sapiens/endodermal/illumina_cdna/chr8/ENCFF498FDF_ENCFF181VTPSJ.out_chr8.tab --CAGE_peak /home/fjetzinger/tools/SQANTI3-5.4/data/ref_TSS_annotation/human.refTSS_v3.1.hg38.bed --SR_bam /mnt/c/data/lrgasp/homo_sapiens/endodermal/illumina_cdna/chr8/ENCFF498FDF_ENCFF181VTPAligned.sortedByCoord.out_chr8.bam --output gencode.v45.annotation_chr8_qc --dir /mnt/c/Users/jetzi/other_repos/summer_school

Output:

Errors:

      ░██████╗░░█████╗░
      ██╔═══██╗██╔══██╗
      ██║██╗██║██║░░╚═╝
      ╚██████╔╝██║░░██╗
      ░╚═██╔═╝░╚█████╔╝
      ░░░╚═╝░░░░╚════╝░
    
[INFO:2025-05-14 15:32:58,154] Write arguments to /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_qc/gencode.v45.annotation_chr8/gencode.v45.annotation_chr8_qc.qc_params.txt...
[INFO:2025-05-14 15:32:58,225] Initialising QC pipeline.
[INFO:2025-05-14 15:32:58,226] Parsing provided files
[INFO:2025-05-14 15:32:58,259] Reading genome fasta /mnt/c/data/lrgasp/homo_sapiens/reference/chr8/GRCh38.primary_assembly.genome_chr8.fa
[INFO:2025-05-14 15:33:08,794] **** Correcting sequences
[INFO:2025-05-14 15:33:08,795] Correcting fasta
[INFO:2025-05-14 15:33:08,796] Skipping aligning of sequences because GTF file was provided.
[INFO:2025-05-14 15:33:15,329] Indels will be not calculated since you ran SQANTI3 without alignment step (SQANTI3 with gtf format as transcriptome input).
[INFO:2025-05-14 15:33:31,637] **** Predicti

## Running SQANTI3 Rescue

In [113]:
# Inputs produced by the SQANTI3 filter step
filter_classification = os.path.join(filter_dir, f"{filter_prefix}_ML_result_classification.txt")
filtered_fasta = os.path.join(filter_dir, f"{filter_prefix}.filtered.fasta")
filtered_gtf = os.path.join(filter_dir, f"{filter_prefix}.filtered.gtf")
random_forest_model = os.path.join(filter_dir, f"{filter_prefix}_randomforest.RData")

# Classification of the reference transcriptome written by the reference QC run.
# Derived here from the configuration variables so this cell also works in a fresh
# kernel where the (long-running, optionally skipped) reference QC cell was not re-run.
ref_classification = os.path.join(ref_qc_dir, f"{ref_qc_prefix}_classification.txt")

# Run SQ3 Rescue
cmd = [
    "/usr/bin/time", "-v",
    f"{sqanti_path}/sqanti3_rescue.py",         # SQANTI3 Rescue script
    "ml",                                       # Filter Mode: Machine Learning
    "--rescue_isoforms", filtered_fasta,        # Filtered fasta
    "--rescue_gtf", filtered_gtf,               # Filtered GTF
    "--filter_class", filter_classification,    # Filter Classification
    "--refGTF", ref_gtf,                        # Reference GTF
    "--refFasta", ref_genome,                   # Reference Genome
    "--refClassif", ref_classification,         # Reference Classification
    # "--requant",                              # Requantify
    # "--counts", counts,                       # Counts file
    "--random_forest", random_forest_model,     # Random Forest Model from Filter
    "--threshold", "0.7",                       # Threshold for Machine Learning
    "--mode", "full",                           # Rescue Mode: Full (extend rescue to non-FSM isoforms)
    "--output", rescue_prefix,                  # Output Prefix
    "--dir", rescue_dir,                        # Output Location
    "--cpus", str(n_cores)                      # Number of Threads
]

# Print the command for reference
print("Running command:")
print(" ".join(cmd))

# Run the command; both streams are captured as text and shown once it has finished
result = subprocess.run(cmd, capture_output=True, text=True)

# Print output and errors
print("Output:")
print(result.stdout)
print("Errors:")
print(result.stderr)

# Stop here if the rescue failed, so the failure is not mistaken for a finished run.
if result.returncode != 0:
    raise RuntimeError(
        f"SQANTI3 rescue exited with code {result.returncode}; see the Errors output above."
    )

Running command:
/usr/bin/time -v /home/fjetzinger/tools/SQANTI3-5.4/sqanti3_rescue.py ml --rescue_isoforms /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/h1_endo_chr8_isotools_filter.filtered.fasta --rescue_gtf /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/h1_endo_chr8_isotools_filter.filtered.gtf --filter_class /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/h1_endo_chr8_isotools_filter_ML_result_classification.txt --refGTF /mnt/c/data/lrgasp/homo_sapiens/reference/chr8/gencode.v45.annotation_chr8.gtf --refFasta /mnt/c/data/lrgasp/homo_sapiens/reference/chr8/GRCh38.primary_assembly.genome_chr8.fa --refClassif /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_qc/gencode.v45.annotation_chr8/gencode.v45.annotation_chr8_qc_classification.txt --random_forest /mnt/c/Users/jetzi/other_repos/summer_school/data/sqanti_filter/h1_endo_chr8_isotools/h1_endo_chr8_isotools_fi

## Investigating SQANTI3 Rescue Results

### Can you find answers to the following Questions in the SQANTI3 Rescue results?

While SQANTI3 Rescue does not provide a report, we can explore the results to find answers to the following questions.

1. **Q1:**
    
    a. Q1a?

    b. Q1b? 

2. **Q2:**

    a. Q2a?

    b. Q2b?

3. **Q3:**

    a. Q3a

    b. Q3b



In [139]:
# Embedded IGV browser showing the transcriptome after the rescue step next to the
# QC-corrected reference annotation.
# Track file names are derived from the configuration variables (sample_name,
# rescue_prefix, ref_qc_prefix) so they follow the outputs written by the rescue and
# reference-QC cells. The "../../data" prefix is kept relative on purpose.
# NOTE(review): presumably igv_notebook resolves "path" entries relative to the
# notebook location — confirm against the igv-notebook docs.
igv_browser_rescue = igv_notebook.Browser(
    {
        "reference": {
            "id": "hg38",
            "name": "Human (GRCH38/hg38)",
            "fastaPath": "../../data/reference/GRCh38.primary_assembly.genome_chr8.fa",
            "indexPath": "../../data/reference/GRCh38.primary_assembly.genome_chr8.fa.fai"
        },
        "locus": "chr8:28701580-28753690",
        "tracks": [
            {
                "name": "IsoTools (after rescue)",
                "path": f"../../data/sqanti_rescue/{sample_name}/{rescue_prefix}_rescued.gtf",
                "format": "gtf",
                "type": "annotation",
                "displayMode": "SQUISHED"
            },
            {
                "name": "Reference",
                "path": f"../../data/sqanti_qc/gencode.v45.annotation_chr8/{ref_qc_prefix}_corrected.gtf",
                "format": "gtf",
                "type": "annotation",
                "displayMode": "SQUISHED"
            }
        ]
    }
)

<IPython.core.display.Javascript object>

# Where do we go from here?

After using SQANTI3 to perform Quality Control on a custom transcriptome, Filter out spurious transcripts, and Rescue related reference transcripts, further downstream analyses can be conducted.

Some examples of further analyses:

* Differential Gene Expression; Differential Transcript Expression/Usage; Differential Exon Expression/Usage

* Functional Annotation, Enrichment Analysis, Pathway Analysis