# Data

In [1]:
reference_assembly = '../data/NC_059190.1_ref.fna'
reference_annotaton = '../data/NC_059190.1_RefSeq.gff' 
# nanopore_reads 
geneid_parameters = "../data/Daphnia_magna.param"
isoquant_models = "../data/NC_059190.1_isquant_models.gff"

# Predict genes with geneid

In [2]:
! time geneid -3P $geneid_parameters $reference_assembly > ../results/geneid_simple.gff3

geneid -3P ../data/Daphnia_magna.param ../data/NC_059190.1_ref.fna >   7.46s user 0.22s system 99% cpu 7.756 total


#### Check simple stats

In [3]:
! grep -v '#' ../results/geneid_simple.gff3 | cut -f 3 | sort | uniq -c 

13296 CDS
13296 exon
2914 gene
2914 mRNA


In [4]:
! bash ../scripts/extract_features.sh 

Usage: ../scripts/extract_features.sh <gff_file> <fasta_file/genome_size_in_bp>


In [5]:
! bash ../scripts/extract_features.sh ../results/geneid_simple.gff3 $reference_assembly

Feature	Features_Count	Total_Feature_Length	Average_Feature_Length	Genome_Percentage
mRNA	2914	7203206	2471.93	69.98
exon	13296	3463047	260.46	33.65
CDS	13296	3463047	260.46	33.65
gene	2914	7203206	2471.93	69.98


In [26]:
13296/2914

4.562800274536719

# Build Transcitps models with IsoQuant

#### The Isoquant Output

#### Check simple stats

In [6]:
! grep -v '#' $isoquant_models  | cut -f 3 | sort | uniq -c 

5718 exon
 797 gene
1349 transcript


In [10]:
! bash ../scripts/extract_features.sh $isoquant_models $reference_assembly

Feature	Features_Count	Total_Feature_Length	Average_Feature_Length	Genome_Percentage
exon	5718	1185519	207.33	11.52
transcript	1349	5689555	4217.61	55.28
gene	797	3950119	4956.23	38.38


# Inform geneid with IsoQuant models 

### Prepare models

#### Get longest transcript (check --no_check)

In [33]:
! agat_sp_keep_longest_isoform.pl --gff $isoquant_models -o ../intermediate/isoquant_longest.gff

docker: Error response from daemon: invalid volume specification: '/host_mnt/Users/fzanarello/work/projects/summer_school/practicals/day2/fabio/notebook:~': invalid mount config for type "bind": invalid mount path: '~' mount path must be absolute.
See 'docker run --help'.


In [44]:
! bash ../scripts/extract_features.sh ../intermediate/isoquant_longest.gff $reference_assembly

Feature	Features_Count	Total_Feature_Length	Average_Feature_Length	Genome_Percentage
exon	3546	783245	220.88	7.61
transcript	797	3294312	4133.39	32.01
gene	797	3950119	4956.23	38.38


#### Get exons

In [42]:
! awk '$3 == "exon"' ../intermediate/isoquant_longest.gff > ../intermediate/isoquant_exons.gff

In [43]:
! gawk 'BEGIN { FS=OFS="\t" } /^#/ { print; next } { match($9, /transcript_id=([^;]+)/, a); $9 = a[1]; print }' ../intermediate/isoquant_exons.gff > ../intermediate/isoquant_exons_geneid.gff 

#### Label with geneid categories

In [64]:
from collections import defaultdict
import csv

# Input/output files
input_file = "../intermediate/isoquant_exons_geneid.gff"
output_file = "../intermediate/isoquant_exons_geneid_class.gff"

# Group exons by transcript
transcript_exons = defaultdict(list)

with open(input_file) as f:
    for line in f:
        if line.startswith("#") or not line.strip():
            continue
        parts = line.strip().split('\t')
        transcript_id = parts[8].strip()  # Assuming 9th column is just the transcript name
        transcript_exons[transcript_id].append(parts)

# Process and label exons
with open(output_file, 'w', newline='') as out:
    writer = csv.writer(out, delimiter='\t', quoting=csv.QUOTE_NONE, escapechar='\\')

    for transcript, exons in transcript_exons.items():

        strand = exons[0][6]
        
        for i, exon in enumerate(exons):
            if i == 0 :
                if strand == '+':
                    exon[2] = "First"
                else:
                    exon[2] = "Terminal"                 
            elif i == len(exons) - 1:
                if strand == '+':
                    exon[2] = "Terminal"
                else:
                    exon[2] = "First"
            else:
                exon[2] = "Internal"
            writer.writerow(exon)

In [71]:
! grep '+' ../intermediate/isoquant_exons_geneid_class.gff > ../intermediate/isoquant_exons_geneid_class_FW.gff

In [72]:
! grep '' ../intermediate/isoquant_exons_geneid_class.gff > ../intermediate/isoquant_exons_geneid_class_RW.gff

### Predict with geneid and -R parameter

In [73]:
! time geneid -3P $geneid_parameters -R ../intermediate/isoquant_exons_geneid_class_FW.gff $reference_assembly > ../results/geneid_fw.gff3

Error: Order violation: annotations (starting position 149468):
-->NC_059190.1	IsoQuant	First	148639	148954	.	+	.	transcript8984.NC_059190.1.nnic


geneid -3P ../data/Daphnia_magna.param -R  ../data/NC_059190.1_ref.fna >   0.06s user 0.08s system 91% cpu 0.155 total


#### Check simple stats

# Run full comparison with compann-nf

### Get reference annotation

#### Get info about the taxon

#### Get annotation for the species of iterest

#### Prepare input

### Run Pipeline

### Inspect results