In [3]:
!pip install scikit-allel

import allel
import pandas as pd

gff_fn = '/lustre/scratch124/gsu/legacy/pfalciparum/resources/snpEff/data/Pfalciparum_GeneDB_Feb2020/Pfalciparum_replace_Pf3D7_MIT_v3_with_Pf_M76611.gff'
df_gff = allel.gff3_to_dataframe(gff_fn, attributes=['ID', 'Name', "Parent"])     

Successfully installed cloudpickle-3.0.0 dask-2024.6.2 fsspec-2024.6.0 locket-1.0.0 partd-1.4.2 pyyaml-6.0.1 scikit-allel-1.3.8


In [86]:
# Keep only features whose Parent contains a literal "." but is not just ".".
# A bare "." is excluded explicitly (presumably the placeholder for features
# without a Parent attribute -- confirm against the GFF loader).
#
# `str.contains` interprets its pattern as a regular expression by default, and
# the regex "." matches any character; `regex=False` makes it a literal
# substring test, which is what the previous `apply(lambda ...)` emulated.
has_dotted_parent = (
    (df_gff.Parent != ".")
    & df_gff.Parent.str.contains(".", regex=False, na=False)
)

# One row per Parent (sorted by Parent, as groupby does by default):
#   parent_name - the Parent value itself
#   gene_family - the part of parent_name before the first "."
#   exons       - tuple of the IDs of every feature with that Parent
#
# The aggregation is done on the ID column only, so it does not rely on the
# grouping column being visible inside `groupby(...).apply`.
#
# NOTE(review): despite its name, `exons` holds *all* child feature IDs; the
# displayed output includes IDs ending in :3UTR / :5UTR as well as :exon:N.
# Confirm whether downstream length comparisons should count only exons.
grouped_df = (
    df_gff.loc[has_dotted_parent]
    .groupby("Parent")["ID"]
    .apply(tuple)
    .rename("exons")
    .rename_axis("parent_name")
    .reset_index()
    .assign(gene_family=lambda d: d["parent_name"].str.split(".").str[0])
    .loc[:, ["parent_name", "gene_family", "exons"]]
)

# Discard parents that have exactly one child feature, then collapse the
# remaining rows to one row per gene_family. Each output row carries:
#   parent_name - tuple of the parent IDs belonging to that gene_family
#   exons       - tuple of the per-parent feature-ID tuples, in the same order
# The gene_family key itself is dropped by the final reset_index(drop=True).
has_many_features = grouped_df["exons"].map(len) != 1
per_gene = grouped_df.loc[has_many_features].groupby("gene_family")

isoforms_df = pd.DataFrame(
    {
        "parent_name": per_gene["parent_name"].apply(tuple),
        "exons": per_gene["exons"].apply(tuple),
    }
).reset_index(drop=True)

isoforms_df

Unnamed: 0,parent_name,exons
0,"(PF3D7_0100100.1,)","((PF3D7_0100100.1:exon:1, PF3D7_0100100.1:exon..."
1,"(PF3D7_0100200.1,)","((PF3D7_0100200.1:exon:1, PF3D7_0100200.1:moti..."
2,"(PF3D7_0100300.1,)","((PF3D7_0100300.1:exon:1, PF3D7_0100300.1:exon..."
3,"(PF3D7_0100400.1,)","((PF3D7_0100400.1:exon:1, PF3D7_0100400.1:exon..."
4,"(PF3D7_0100600.1,)","((PF3D7_0100600.1:exon:1, PF3D7_0100600.1:exon..."
...,...,...
5113,"(PF3D7_1479700.1,)","((PF3D7_1479700.1:exon:1, PF3D7_1479700.1:exon..."
5114,"(PF3D7_1479800.1,)","((PF3D7_1479800.1:exon:1, PF3D7_1479800.1:exon..."
5115,"(PF3D7_1479900.1,)","((PF3D7_1479900.1:exon:1, PF3D7_1479900.1:exon..."
5116,"(PF3D7_1480000.1,)","((PF3D7_1480000.1:exon:1, PF3D7_1480000.1:exon..."


#### Pandas manipulation: group the isoforms of each gene together and collect each isoform's child feature IDs (exons, UTRs, etc.) into tuples.

In [79]:
# Tallies over genes that have more than one isoform.
n_total_genes = 0
n_genes_first_isoform_is_longest = 0
l_genes_first_isoform_is_not_longest = []

# Restrict to rows whose parent_name tuple does not have exactly one entry.
multi_isoform_df = isoforms_df.loc[
    isoforms_df["parent_name"].map(len) != 1
].reset_index(drop=True)

# NOTE(review): "longest" here means "has the most entries in its `exons`
# tuple"; it is a count of feature IDs, not a base-pair length -- confirm this
# is the intended measure.
for transcript_ids, features_per_isoform in zip(
    multi_isoform_df["parent_name"], multi_isoform_df["exons"]
):
    feature_counts = [len(isoform) for isoform in features_per_isoform]

    print(transcript_ids)
    print(features_per_isoform)
    print(feature_counts)

    n_total_genes += 1

    # Ties count as "first is longest": the first isoform only has to match
    # the maximum, not exceed every other isoform.
    if feature_counts[0] == max(feature_counts):
        n_genes_first_isoform_is_longest += 1
    else:
        # Record the gene ID, i.e. the first isoform's ID up to the first ".".
        gene_id = transcript_ids[0].split(".")[0]
        l_genes_first_isoform_is_not_longest.append(gene_id)

print("=" * 20)
print(f"In the {n_total_genes} genes with multiple isoforms, the first isoform is the longest for only {n_genes_first_isoform_is_longest} genes.")

('PF3D7_0105400.1', 'PF3D7_0105400.2')
(('PF3D7_0105400.1:3UTR', 'PF3D7_0105400.1:exon:1', 'PF3D7_0105400.1:exon:2', 'PF3D7_0105400.1:5UTR'), ('PF3D7_0105400.2:exon:2', 'PF3D7_0105400.2:exon:1'))
[4, 2]
('PF3D7_0108400.1', 'PF3D7_0108400.2')
(('PF3D7_0108400.1:3UTR', 'PF3D7_0108400.1:exon:1', 'PF3D7_0108400.1:exon:2', 'PF3D7_0108400.1:exon:3', 'PF3D7_0108400.1:exon:4', 'PF3D7_0108400.1:exon:5', 'PF3D7_0108400.1:exon:6', 'PF3D7_0108400.1:exon:7', 'PF3D7_0108400.1:exon:8', 'PF3D7_0108400.1:exon:9', 'PF3D7_0108400.1:5UTR'), ('PF3D7_0108400.2:exon:6', 'PF3D7_0108400.2:exon:5', 'PF3D7_0108400.2:exon:3', 'PF3D7_0108400.2:exon:2', 'PF3D7_0108400.2:exon:1'))
[11, 5]
('PF3D7_0202600.1', 'PF3D7_0202600.2')
(('PF3D7_0202600.1:3UTR', 'PF3D7_0202600.1:exon:1', 'PF3D7_0202600.1:exon:2', 'PF3D7_0202600.1:exon:3', 'PF3D7_0202600.1:5UTR'), ('PF3D7_0202600.2:3UTR', 'PF3D7_0202600.2:exon:3', 'PF3D7_0202600.2:exon:2', 'PF3D7_0202600.2:exon:1', 'PF3D7_0202600.2:5UTR'))
[5, 5]
('PF3D7_0205700.1', 'PF3D7_020

In [85]:
import json

# Read the core-genes JSON with an explicit encoding so the result does not
# depend on the platform's default text encoding.
with open('../../app/files/core_genes.json', 'r', encoding='utf-8') as fh:
    core_genes_json = json.load(fh)

# Despite the name this is a dict keys view, not a list; membership tests
# against it are hash lookups. Assumes the JSON top level is an object.
core_genes_list = core_genes_json.keys()

# For each gene whose first isoform was not the longest, report whether its
# ID is one of the keys of the core-genes JSON.
for gene in l_genes_first_isoform_is_not_longest:
    print(gene, gene in core_genes_list)

PF3D7_0205700 True
PF3D7_0206900 True
PF3D7_0316300 True
PF3D7_1136500 True


I have also manually checked that the above genes are in Pf-HaploAtlas: PF3D7_0205700, PF3D7_0206900, PF3D7_0316300, PF3D7_1136500

They are. Therefore, we cannot claim that, by using the `.1` isoform, we are always using the longest isoform of each gene.