In [7]:
import pandas as pd
import numpy as np
import os
import json
from Bio.PDB.PDBParser import PDBParser
from io import StringIO
from Bio.PDB import PDBIO
from Bio.PDB.NeighborSearch import NeighborSearch
import h5py

### Cell exploration - Cell 7

### Gene annotation

In [8]:
# Read the GFF annotation file using pandas.
# Columns, in file order: chromosome/sequence name, annotation source, feature type,
# genomic start, genomic end, score, strand, phase, attributes as key-value pairs,
# and one trailing extra column ("other").
# NOTE(review): "other" presumably holds the chromosome label (the preview shows values
# such as 1 and Y) — confirm against the file.
column_names = ['chrName', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'key-value-pairs', "other"]

# low_memory=False makes pandas infer every column's dtype from the whole file rather
# than chunk by chunk, which prevents mixed-type columns and the DtypeWarning they raise.
gencode = pd.read_csv(
    'data/GSE248049/annotation_files/Chlorocebus_sabaeus_genome.gff',
    sep='\t',
    comment='#',
    header=None,
    names=column_names,
    low_memory=False,
)

# View the DataFrame
gencode.head()

  gencode = pd.read_csv('data/GSE248049/annotation_files/Chlorocebus_sabaeus_genome.gff', sep='\t', comment='#', header=None)


Unnamed: 0,chrName,source,feature,start,end,score,strand,phase,key-value-pairs,other
0,NC_023642.1,RefSeq,region,1,126035930,.,+,.,ID=NC_023642.1:1..126035930;Name=1;chromosome=...,1
1,NC_023642.1,Gnomon,gene,2628,4911,.,+,.,ID=gene-SCGB1C1;Dbxref=GeneID:103214198;Name=S...,1
2,NC_023642.1,Gnomon,mRNA,2628,4911,.,+,.,ID=rna-XM_007993457.1;Parent=gene-SCGB1C1;Dbxr...,1
3,NC_023642.1,Gnomon,exon,2628,3883,.,+,.,ID=exon-XM_007993457.1-1;Parent=rna-XM_0079934...,1
4,NC_023642.1,Gnomon,exon,4414,4911,.,+,.,ID=exon-XM_007993457.1-2;Parent=rna-XM_0079934...,1


In [12]:
# Show the full attribute string of the first annotation record
gencode['key-value-pairs'].iloc[0]

'ID=NC_023642.1:1..126035930;Name=1;chromosome=1;country=USA: NC%2C Wake Forest University;dev-stage=adult;gbkey=Src;genome=chromosome;isolate=1994-021;mol_type=genomic DNA;note=housed at the Wake Forest Primate Facility as part of the Vervet Research Colony;sex=male'

In [9]:
# Keep only the 'gene' features, remember each row's position in the full
# annotation table as 'id', then renumber the rows from 0.
is_gene = gencode['feature'] == 'gene'
gene_rows = (
    gencode.loc[is_gene]
    .assign(id=lambda frame: frame.index)
    .reset_index(drop=True)
)

gene_rows

Unnamed: 0,chrName,source,feature,start,end,score,strand,phase,key-value-pairs,other,id
0,NC_023642.1,Gnomon,gene,2628,4911,.,+,.,ID=gene-SCGB1C1;Dbxref=GeneID:103214198;Name=S...,1,1
1,NC_023642.1,Gnomon,gene,4916,10379,.,+,.,ID=gene-ODF3;Dbxref=GeneID:103237012;Name=ODF3...,1,10
2,NC_023642.1,Gnomon,gene,14423,18444,.,-,.,ID=gene-BET1L;Dbxref=GeneID:103216145;Name=BET...,1,49
3,NC_023642.1,Gnomon,gene,18597,25398,.,+,.,ID=gene-RIC8A;Dbxref=GeneID:103222128;Name=RIC...,1,60
4,NC_023642.1,Gnomon,gene,25740,49471,.,-,.,ID=gene-SIRT3;Dbxref=GeneID:103234991;Name=SIR...,1,165
...,...,...,...,...,...,...,...,...,...,...,...
29346,NC_023672.1,Gnomon,gene,4229367,4242824,.,-,.,ID=gene-LOC103246999;Dbxref=GeneID:103246999;N...,Y,1790656
29347,NC_023672.1,Gnomon,gene,4486881,4489237,.,-,.,ID=gene-LOC103247001;Dbxref=GeneID:103247001;N...,Y,1790673
29348,NC_023672.1,Gnomon,gene,4503583,4516685,.,-,.,ID=gene-LOC103247020;Dbxref=GeneID:103247020;N...,Y,1790679
29349,NC_023672.1,Gnomon,gene,5352628,5364843,.,+,.,ID=gene-LOC103247005;Dbxref=GeneID:103247005;N...,Y,1790696


In [11]:
# Show the full attribute string of the first gene record
gene_rows['key-value-pairs'].iloc[0]

'ID=gene-SCGB1C1;Dbxref=GeneID:103214198;Name=SCGB1C1;gbkey=Gene;gene=SCGB1C1;gene_biotype=protein_coding'

In [6]:
# Parse the PDB genome structure model of cell 7 with Biopython
folder_path = 'data/GSE80280/GSE80280-cell/'
pdb_path = folder_path + 'cell7/GSM2219503_Cell_7_genome_structure_model.pdb'

parser = PDBParser()
genome_structure_model = parser.get_structure('genome_structure', pdb_path)

In [8]:
# Build a node-link graph per structure model: every atom becomes a node annotated
# with gene information, and every pair of atoms closer than NEIGHBOR_RADIUS
# becomes a link.

# Factor applied to the PDB residue number before comparing it with the GFF
# start/end coordinates.
# NOTE(review): presumably the residue number is a bin index at 100 kb resolution — confirm.
RESIDUE_SCALE = 100000

# Search radius (in PDB coordinate units) used to connect neighbouring atoms.
NEIGHBOR_RADIUS = 1.5  # TODO : Adjust the radius as needed

node_link_dict = {}

for model in genome_structure_model:
    model_index = model.get_id()

    # Spatial index over all atoms of this model, used for the radius queries below
    ns = NeighborSearch(list(model.get_atoms()))

    # Extract atom coordinates and other information
    atoms = []
    for chain in model:
        for residue in chain:
            for atom in residue:
                atoms.append({
                    'atom_name': atom.get_id(),
                    'residue_name': residue.get_resname(),
                    'residue': residue.get_id()[1] * RESIDUE_SCALE,
                    'chain_id': atom.get_full_id()[2],
                    'id': atom.get_serial_number(),
                    'coords': atom.get_coord()
                })

    # Convert the atom information to a pandas DataFrame
    df = pd.DataFrame(atoms)

    # Gene annotation columns, left as None for atoms that fall inside no gene
    df['gene_id'] = None
    df['chrname'] = None

    # Map atoms to genes by genomic position.
    # NOTE(review): only the position is compared; the atom's chain_id and the gene's
    # chromosome are never matched against each other, so a gene on any chromosome can
    # claim an atom, and a later gene overwrites an earlier one. Confirm whether the
    # chain should be matched to the annotation's chromosome.
    residue_pos = df['residue']
    for _, gene in gene_rows.iterrows():
        mask = (residue_pos >= gene['start']) & (residue_pos <= gene['end'])
        df.loc[mask, 'gene_id'] = gene['id']
        df.loc[mask, 'chrname'] = gene['chrName']

    print(df['gene_id'].value_counts())

    nodes = []
    links = []
    for _, atom_row in df.iterrows():
        atom_serial = atom_row['id']
        nodes.append({
            "id": atom_serial,
            "name": atom_row["atom_name"],
            "residue_id": atom_row["residue"],
            "chain": atom_row["chain_id"],
            "coord": atom_row['coords'].tolist(),
            "gene_id": atom_row["gene_id"],
            "chrname": atom_row['chrname']
        })

        # Find neighbors within the distance threshold
        for neighbor in ns.search(atom_row['coords'], NEIGHBOR_RADIUS):
            neighbor_id = neighbor.get_serial_number()
            if neighbor_id == atom_serial:
                # The search is centred on this atom's own coordinates, so the atom
                # itself is always returned; skip it to avoid a self-link.
                continue
            links.append({"source": atom_serial, "target": neighbor_id})

    node_link_dict[model_index] = {"nodes": nodes, "edges": links}

    # Only the first model is processed; remove this break to process every model.
    break

gene_id
1530756    520
1736805    400
1856165    400
1856670    400
1683515    400
          ... 
1758082     10
1859332     10
1596500     10
1799374     10
1596865      4
Name: count, Length: 155, dtype: int64


In [9]:
# Number of distinct genes that received at least one atom
df.loc[:, 'gene_id'].nunique()

155

In [10]:
# Atoms per gene, most populated genes first
df.loc[:, 'gene_id'].value_counts()

gene_id
1530756    520
1736805    400
1856165    400
1856670    400
1683515    400
          ... 
1758082     10
1859332     10
1596500     10
1799374     10
1596865      4
Name: count, Length: 155, dtype: int64

In [83]:
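# Export (currently disabled): write each model's node-link dictionary to its own JSON file.
# NOTE(review): the saved output below lists models 0-9, while the graph-building loop above
# stops after the first model, so that output presumably comes from an earlier run — confirm.
# for nd in node_link_dict.keys():
#     json_file = f"genome_cell7_model{nd}_gene_info.json"
#     filePath = 'data/extracted-node-link/cell7/'+json_file
# #     print(json_file)
#     with open(filePath, "w") as outfile:
#         json.dump(node_link_dict[nd], outfile, indent=4)
#     print(f"Node-link dictionary saved to {json_file}")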

Node-link dictionary saved to genome_cell7_model0_gene_info.json
Node-link dictionary saved to genome_cell7_model1_gene_info.json
Node-link dictionary saved to genome_cell7_model2_gene_info.json
Node-link dictionary saved to genome_cell7_model3_gene_info.json
Node-link dictionary saved to genome_cell7_model4_gene_info.json
Node-link dictionary saved to genome_cell7_model5_gene_info.json
Node-link dictionary saved to genome_cell7_model6_gene_info.json
Node-link dictionary saved to genome_cell7_model7_gene_info.json
Node-link dictionary saved to genome_cell7_model8_gene_info.json
Node-link dictionary saved to genome_cell7_model9_gene_info.json
