In [8]:
import json
import pandas as pd


## TODO
* Make `add_rna_seq_data` and `create_or_update` more efficient
* Add arabidopsis orthologues to DB
  * Get from accession numbers (NIH and )
* Connect front end to db for RNA-seq
  * search by:
    * At gene
    * [X] Xe gene
    * [X] GO terms
  * Select experiment
* add species.id to experiments model so that i can filter by exp for specific species

* Fix up `gene_query_page.py`
  * move the functions to the database
* Maybe clear the textbox on expression_page when user changes something in sidebar and add text in box physically rather than placeholder
* display genes for different GO terms in different panels?
  
## Done
* Show which queried Xe genes aren't in the DB
* plot expression values for Xerophyta genes
  * log or normalised
* Query expression data by experiment in DB (currently searches all of expression table)
* Fix button stopping showing raw data
* add DEGs table and DEG data
  * filter expression data by DEGs
  
## Questions
- How should I handle the clusters, given that they aren't consistent?
- I need the humilis FASTA file from Nicci
- Can't add the schlechteri seedlings RNA-seq data because I don't know what half the columns are
- One of Mamosa's files (Xs) has gene counts that are all 0
- What are Mamosa's files?
- Michael's adult leaf tissues don't have any info in the columns
- Does it make sense that X. humilis has 99000 genes and the other two have 28000?

## 01-01-2025 Add RNAseq data to DB
Need to add the X. elegans RNA-seq data to the db. 

Steps:
1. Add tables to `models.py`  and add info to DB schema with Alembic  
2. Format data into long data format  

### 1. Add table
**Set up alembic** 

`alembic init alembic`  
... chatGPT for other steps  

**Create a migration script**   

`alembic revision --autogenerate -m "Your migration description here"`  
Fill in the generated migration script, then apply it:  
`alembic upgrade head`  


Added two tables: `Gene_expressions` and `Experiments`

### 2. Format the data

Use the `data_tidier.py` script and the below python cells



In [None]:
import data_tidier as dt
import pandas as pd

from pathlib import Path

# Load the wide RNA expression table (DESeq2 normalised counts).
file_path = "all_data/Michael_RNAseq/Xs seedlings/Xs_seedlings_DESeq2 normalised counts table.csv"
df = pd.read_csv(file_path)

# Transform the data to long format.
long_df = dt.transform_to_long(df)

# Save the tidy table next to the input as "<input name>_tidy.csv".
# Appending "_tidy" to the whole path would yield a ".csv_tidy" file, which
# loses the CSV extension.
source_path = Path(file_path)
tidy_path = source_path.with_name(f"{source_path.stem}_tidy.csv")
long_df.to_csv(tidy_path, index=False)
print(long_df.head())




In [None]:
# Build the table that gets loaded into the DB from the long-format frame.
df = dt.add_log2(long_df)  # add log2 transformation
df = dt.format_time_points(df)  # format timepoint
# rename() returns a new frame. Not using inplace=True means the rename cannot
# leak back into `long_df` if the helpers above hand back their input object,
# so this cell can be re-run without "expression" having disappeared.
df = df.rename(columns={"expression": "normalised_expression"})
print(df.head())

# NOTE(review): this writes into the Xe (X. elegans) folder, while the loading
# cell reads an "Xs seedlings" file — confirm the input and output species match.
df.to_csv("all_data/Michael_RNAseq/Xe_seedlings (updated)/Xe_seedlings_DESeq2_normalised_counts_table_tidy_for_db.csv", index=False)

### 3. Add to database
Added X. elegans seedling time course data.  



In [None]:
import db_manager as dbm


# Register the X. elegans seedling time-course experiment and pass its tidy
# expression table to db_manager for insertion.
# NOTE(review): despite its name, `species_id` holds a species *name* string
# here — confirm that add_rna_seq_data expects a name rather than a numeric id.
species_id = "X. elegans"
experiment_name = "xe_seedlings_time_course"
rna_seq_data = pd.read_csv("all_data/Michael_RNAseq/Xe_seedlings (updated)/Xe_seedlings_DESeq2_normalised_counts_table_tidy_for_db.csv")

# The experiment is created first and then referenced by name when the
# expression rows are added (presumably the rows are linked to it — confirm).
dbm.add_experiment(experiment_name, "time course of X. elegans seedlings")
dbm.add_rna_seq_data(rna_seq_data, species_id, experiment_name )


## 02.01.2025 Connect frontend to DB

In [None]:
import db_manager as dbm
import db

# Associate the seedling time-course experiment with its species through
# DB.link_experiment_to_species.
# NOTE(review): `species_id` holds the species name string, not a numeric id —
# confirm this is what link_experiment_to_species expects.
species_id = "X. elegans"
experiment_name = "xe_seedlings_time_course"
database = db.DB()

database.link_experiment_to_species(experiment_name, species_id)

## 5-01-2025 Add DEG table

- added DEG tables and data for X elegans seedlings.
- can filter by DEGs now
- Can also query expression data based on GO terms

#### Testing DEG implementation
"12","Xele.ptg000001l.15","ReT48","Up-regulated","None","None"
"46","Xele.ptg000001l.50","None","None","DeT09","Down-regulated"
"137","Xele.ptg000001l.159","ReT04","Up-regulated","DeT12","Down-regulated"

Xele.ptg000001l.15
Xele.ptg000001l.50
Xele.ptg000001l.159

nice example: nuclear ubiquitin ligase complex
  - 12 genes
  - 6 degs
  - 3 up 
  - 3 down

In [None]:
import db 

# Manual check: look up X. elegans genes by a GO term / description, then
# fetch their expression rows for the seedling time-course experiment.
database = db.DB()

# Kept from the original cell; it is not used by the queries below.
gene = "Xsch.v2.MJHO01000001.1.1119"
genes = database.get_genes_by_go_term_or_description(
    ["jasmonic acid and ethylene-dependent systemic resistance"],
    "X. elegans",
)
gene_names = [matched_gene.gene_name for matched_gene in genes]

# Number of genes matched by the search term.
print(len(gene_names))

data = database.get_gene_expression_data(gene_names, "xe_seedlings_time_course")
print(data)

In [None]:
import models
from sqlalchemy import func

def get_go_terms_with_fewest_genes(threshold=0, species_name="X. elegans"):
    """
    List the GO terms of one species, ordered from fewest to most linked genes.

    A new ``db.DB()`` is created on every call and its ``session`` is used to
    run the query.

    Parameters:
        threshold: Only GO terms linked to strictly more than this many genes
            are returned. Defaults to 0.
        species_name: Value matched against ``Species.name``. Defaults to
            "X. elegans", the species this query was originally written for.

    Returns:
        List of rows of (GO term ID, GO term name, gene count), in ascending
        order of gene count.
    """
    database = db.DB()
    session = database.session

    # DISTINCT so that a gene reached through several annotation rows for the
    # same GO term is counted once rather than once per joined row.
    gene_count = func.count(models.Gene.id.distinct())

    query = (
        session.query(
            models.GO.go_id,                 # GO term ID
            models.GO.go_name,               # GO term name
            gene_count.label("gene_count"),  # number of distinct genes
        )
        # GO -> annotations_go -> annotations -> genes -> species
        .join(models.annotations_go, models.annotations_go.c.go_id == models.GO.id)
        .join(models.Annotation, models.Annotation.id == models.annotations_go.c.annotation_id)
        .join(models.Gene, models.Gene.id == models.Annotation.gene_id)
        .join(models.Species, models.Species.id == models.Gene.species_id)
        .filter(models.Species.name == species_name)
        .group_by(models.GO.go_id, models.GO.go_name)
        .having(gene_count > threshold)  # keep terms with more than `threshold` genes
        .order_by(gene_count.asc())      # fewest genes first
    )

    return query.all()

# Example: X. elegans GO terms linked to more than 13 genes, fewest genes first.
print(get_go_terms_with_fewest_genes(13))

## 24-01-2025 Add Humilis Fasta data and neaten up interface
- add humilis species, data
- TODO make adding fasta info more efficient 
- 

In [1]:
# Add the species and gene sequences to the database

import db as db
import db_manager as dbm



# Register X. humilis through DB.add_species and keep the id of the returned
# species object for the loading steps that follow.
database = db.DB()
species_name = "X. humilis"
fasta_file = "all_data/Xhumilis_Nov2024/Xhum_CDS_annot150424.fasta"

# NOTE(review): presumably add_species returns the existing row when the
# species is already present — confirm before re-running this cell.
species = database.add_species(species_name)
species_id = species.id  # reused by the annotation-loading cell
# The FASTA import is currently disabled; uncomment to load the gene sequences.
# dbm.add_gene_sequence_from_fasta(fasta_file, species_id)

In [None]:
# Add the gene annotations to the database.
# Relies on `dbm` and `species_id` (the X. humilis species id) being defined
# by the preceding cell, so that cell must be run first.

annotation_file = "all_data/Xhumilis_Nov2024/20241111_Xhumilis_annotation_76405_export table_Oliver.csv"
dbm.add_gene_annotations(annotation_file, species_id)


In [None]:
import db as db
from sqlalchemy.orm import aliased
from sqlalchemy.sql import func
import models
def get_counts_by_species(species_id):
    """
    Count GO links, InterPro links and annotations for one species.

    Each figure is a COUNT over the id column of the joined result, restricted
    to genes whose ``species_id`` matches; no DISTINCT is applied, so a term
    joined to several annotations contributes one count per joined row.

    Parameters:
        species_id: Value compared against ``Gene.species_id``.

    Returns:
        Dict with the keys "GO_terms_count", "InterPro_count" and
        "Gene_annotations_count".
    """
    database = db.DB()
    session = database.session

    def _count(id_column, *join_path):
        # COUNT(id_column) after joining through `join_path` in order,
        # filtered down to the requested species.
        counting_query = session.query(func.count(id_column))
        for join_target in join_path:
            counting_query = counting_query.join(join_target)
        return counting_query.filter(models.Gene.species_id == species_id).scalar()

    # Dict entries are evaluated top to bottom, so the three queries run in
    # the order GO, InterPro, annotations.
    return {
        "GO_terms_count": _count(
            models.GO.id, models.annotations_go, models.Annotation, models.Gene
        ),
        "InterPro_count": _count(
            models.InterPro.id, models.annotations_interpro, models.Annotation, models.Gene
        ),
        "Gene_annotations_count": _count(models.Annotation.id, models.Gene),
    }

# Example usage
# Print the count summary for species ids 1, 2 and 3.
# NOTE(review): the id range is hard-coded; presumably these are the three
# species currently stored in the DB — confirm.
for i in range(1, 4):
    counts = get_counts_by_species(i)
    print(counts)

293425 +312558 +1017868 = 1,623,851

## 14-02-2025 Add X elegans arabidopsis genes to database


In [None]:

def add_at_homologues():
    # add at common name
    # add locus
    # make all lower case
    # retreive gene ID, 