# Overview

In [3]:
import pandas as pd

platys_data = pd.read_csv('../data/species_Platyhelminthes--___w_labels.csv')

In [4]:
platys_data.head()

Unnamed: 0,Species Name,Provider,Assembly,BioProject ID,Clade,Genome Browser,N50,Genome Size,Number of Scaffolds,Number of Coding Genes,label
0,Atriophallophorus winterbourni,Swiss Federal Institute of Technology in Zurich,ASM1340708v1,PRJNA636673,Trematoda (Flukes),JBrowse | Ensembl,39978,601728533,26114,11499,atriophallophorus_winterbourni_PRJNA636673
1,Clonorchis sinensis,Sun Yat-sen University,C_sinensis-2.0,PRJDA72781,Trematoda (Flukes),JBrowse | Ensembl,415842,547288241,4348,13634,clonorchis_sinensis_PRJDA72781
2,Clonorchis sinensis,The University of Melbourne,CSKR.v2,PRJNA386618,Trematoda (Flukes),JBrowse | Ensembl,168711085,558124894,78,13489,clonorchis_sinensis_PRJNA386618
3,Dibothriocephalus latus,Wellcome Sanger Institute,D_latum_Geneva_0011_upd,PRJEB1206,Cestoda (Tapeworms),JBrowse | Ensembl,6726,531434409,140294,19966,dibothriocephalus_latus_PRJEB1206
4,Dicrocoelium dendriticum,Wellcome Sanger Institute,tdDicDend1.1,PRJEB44434,Trematoda (Flukes),JBrowse | Ensembl,117106881,1889995958,19456,13685,dicrocoelium_dendriticum_PRJEB44434


# Renaming set of proteins

In [None]:
# create species to code dictionary
species2code = {species: species.split(' ')[0][0]+species.split(' ')[1][0:3].lower() for species in platys_data['Species Name'].to_list()} # create a dictionary species-code
# load all proteins into a dictionary (label as key, Biopython SeqRecord as object)
# rename all proteins using the codes in species2code dic
# each protein is named as <code>.<num>, increasing number from 1 to n (n = proteome size)

In [16]:
import glob, gzip
from Bio import SeqIO

# create species to code dictionary
species2code = {species: species.split(' ')[0][0]+species.split(' ')[1][0:3].lower() for species in platys_data['Species Name'].to_list()}
# Create an empty list to store original and new protein names
correlation_data = []
# Load all proteins into a dictionary
protein_dict = {}

protein_files = glob.glob('../data/platyhelminthes_dataset_vfinal/*.fa.gz')
for file in protein_files:
    species_name = file.rpartition('/')[2].split('.')[0].split('_')[0].title() + ' ' + file.rpartition('/')[2].split('.')[0].split('_')[1]
    code = species2code.get(species_name)
    if code:
        with gzip.open(file, 'rt') as f:
            records = SeqIO.parse(f, "fasta")
            for i, record in enumerate(records, start=1):
                original_name = record.id
                new_name = f"{code}.{i}"
                correlation_data.append({'Original Name': original_name, 'New Name': new_name})
                record.id = new_name
                record.name = new_name
                record.description = new_name
                protein_dict[new_name] = record


In [19]:
# creating a correlation table to save
original2new = pd.DataFrame.from_dict(correlation_data)

# Filtering by isoform length

In [36]:
import glob, tqdm
import pandas as pd
from BCBio import GFF

# Create a dictionary to store the longest isoforms for each protein
longest_isoforms = {}

# Iterate over GFF3 files
gff3_files = glob.glob('../data/platyhelminthes_dataset_vfinal/*gff3*')

for file in tqdm.tqdm(gff3_files):
    try:
        with gzip.open(file, 'rt', encoding='latin1') as f:  # Open with 'latin1' encoding
            for rec in GFF.parse(f):
                for feature in rec.features:
                    if 'mRNA' in feature.type:
                        attributes = feature.qualifiers
                        protein_id = attributes.get('protein_id')
                        if protein_id:
                            # Convert protein_id to new code
                            species_name = attributes.get('species')
                            species_code = species2code.get(species_name)
                            new_protein_id = f"{species_code}.{protein_id}"
                            
                            start = feature.location.start.position
                            end = feature.location.end.position
                            length = end - start + 1
                            if new_protein_id not in longest_isoforms or length > longest_isoforms[new_protein_id]['length']:
                                longest_isoforms[new_protein_id] = {'start': start, 'end': end, 'length': length}
    except EOFError as e:
        print(f"Error reading file {file}: {e}")

print('Done with selecting isoforms!')

# Create a DataFrame for original to new protein IDs correlation
original2new = pd.DataFrame.from_dict(correlation_data)

# Create a dictionary to store the selected isoforms
selected_isoforms = {}

# Iterate over longest isoforms and get the corresponding protein names
for protein_id, info in longest_isoforms.items():
    original_protein_id = original2new.loc[original2new['New Code'] == protein_id.split('.')[0], 'Original Code'].values[0]
    selected_isoforms[original_protein_id] = protein_id

# Now 'selected_isoforms' dictionary contains the selected isoforms for each protein name.


  4%|████▌                                                                                                         | 1/24 [00:17<06:53, 17.96s/it]

Error reading file ../data/platyhelminthes_dataset_vfinal/clonorchis_sinensis.PRJDA72781.WBPS18.annotations.gff3.gz: Compressed file ended before the end-of-stream marker was reached


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [16:33<00:00, 41.40s/it]


Done with selecting isoforms!


In [40]:
# Create a dictionary to store the longest isoforms for each protein
longest_isoforms = {}

# Iterate over GFF3 files
gff3_files = glob.glob('../data/platyhelminthes_dataset_vfinal/*gff3*')

for file in tqdm.tqdm(gff3_files):
    try:
        with gzip.open(file, 'rt', encoding='latin1') as f:  # Open with 'latin1' encoding
            for rec in GFF.parse(f):
                for feature in rec.features:

{}

In [1]:
import rpy2

https://drostlab.github.io/orthologr/reference/retrieve_longest_isoforms.html

In [2]:
%load_ext rpy2.ipython

In [4]:
%%R 

# Install Bioconductor
library(BiocManager)

# Install package dependencies
BiocManager::install(c(
        "Biostrings",
        "GenomicRanges",
        "GenomicFeatures",
        "Rsamtools",
        "rtracklayer"
))

# install CRAN dependencies
install.packages(c("doParallel", "foreach", "ape", "Rdpack", "benchmarkme", "devtools"))

# install BLAST dependency metablastr from GitHub
devtools::install_github("drostlab/metablastr")

# install DIAMOND dependency rdiamond from GitHub
devtools::install_github("drostlab/rdiamond")

# install orthologr from GitHub
devtools::install_github("drostlab/orthologr")

R[write to console]: Error in library(BiocManager) : there is no package called ‘BiocManager’




Error in library(BiocManager) : there is no package called ‘BiocManager’


RInterpreterError: Failed to parse and evaluate line '\n# Install Bioconductor\nlibrary(BiocManager)\n\n# Install package dependencies\nBiocManager::install(c(\n        "Biostrings",\n        "GenomicRanges",\n        "GenomicFeatures",\n        "Rsamtools",\n        "rtracklayer"\n))\n\n# install CRAN dependencies\ninstall.packages(c("doParallel", "foreach", "ape", "Rdpack", "benchmarkme", "devtools"))\n\n# install BLAST dependency metablastr from GitHub\ndevtools::install_github("drostlab/metablastr")\n\n# install DIAMOND dependency rdiamond from GitHub\ndevtools::install_github("drostlab/rdiamond")\n\n# install orthologr from GitHub\ndevtools::install_github("drostlab/orthologr")\n'.
R error message: 'Error in library(BiocManager) : there is no package called ‘BiocManager’'

# Homologues group inference
Criteria: -