In [1]:
import pandas as pd
import numpy as np
import os
import json
from Bio.PDB.PDBParser import PDBParser
from io import StringIO
from Bio.PDB import PDBIO
from Bio.PDB.NeighborSearch import NeighborSearch
import h5py

### Cell exploration - Cell 7

### Gene annotation

In [2]:
# Read the GENCODE vM29 GTF annotation with pandas.
# The file is tab-separated with no header row; lines starting with '#' are skipped.
#
# Fields, in order:
# chromosome name, annotation source, feature type, genomic start loc, end loc,
# score, genomic strand, genomic phase, additional information as key-value pairs
column_names = ['chrName', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'key-value-pairs']

# Pin the dtypes instead of letting pandas infer them chunk by chunk: columns such
# as 'score' and 'phase' hold '.' placeholders next to numeric values, and
# chunk-wise inference on a file this large can yield mixed-type columns.
gencode_dtypes = {name: str for name in column_names}
gencode_dtypes['start'] = 'int64'
gencode_dtypes['end'] = 'int64'

gencode = pd.read_csv(
    'data/GSE80280/gencode.vM29.annotation.gtf',
    sep='\t',
    comment='#',
    header=None,
    names=column_names,   # name the columns at read time rather than afterwards
    dtype=gencode_dtypes,
)

# View the DataFrame
gencode.head()

Unnamed: 0,chrName,source,feature,start,end,score,strand,phase,key-value-pairs
0,chr1,HAVANA,gene,3143476,3144545,.,+,.,"gene_id ""ENSMUSG00000102693.2""; gene_type ""TEC..."
1,chr1,HAVANA,transcript,3143476,3144545,.,+,.,"gene_id ""ENSMUSG00000102693.2""; transcript_id ..."
2,chr1,HAVANA,exon,3143476,3144545,.,+,.,"gene_id ""ENSMUSG00000102693.2""; transcript_id ..."
3,chr1,ENSEMBL,gene,3172239,3172348,.,+,.,"gene_id ""ENSMUSG00000064842.3""; gene_type ""snR..."
4,chr1,ENSEMBL,transcript,3172239,3172348,.,+,.,"gene_id ""ENSMUSG00000064842.3""; transcript_id ..."


In [3]:
# Keep only the gene-level records of the annotation table.
is_gene = gencode['feature'] == 'gene'
gene_rows = gencode.loc[is_gene].copy()

# Preserve each record's row label from the full table in an 'id' column ...
gene_rows['id'] = gene_rows.index

# ... then renumber the index 0..n-1, discarding the old labels.
gene_rows = gene_rows.reset_index(drop=True)

gene_rows

Unnamed: 0,chrName,source,feature,start,end,score,strand,phase,key-value-pairs,id
0,chr1,HAVANA,gene,3143476,3144545,.,+,.,"gene_id ""ENSMUSG00000102693.2""; gene_type ""TEC...",0
1,chr1,ENSEMBL,gene,3172239,3172348,.,+,.,"gene_id ""ENSMUSG00000064842.3""; gene_type ""snR...",3
2,chr1,HAVANA,gene,3276124,3741721,.,-,.,"gene_id ""ENSMUSG00000051951.6""; gene_type ""pro...",6
3,chr1,HAVANA,gene,3322980,3323459,.,+,.,"gene_id ""ENSMUSG00000102851.2""; gene_type ""pro...",24
4,chr1,HAVANA,gene,3435954,3438772,.,-,.,"gene_id ""ENSMUSG00000103377.2""; gene_type ""TEC...",27
...,...,...,...,...,...,...,...,...,...,...
55409,JH584299.1,ENSEMBL,gene,837364,840451,.,+,.,"gene_id ""ENSMUSG00000095523.2""; gene_type ""pro...",1869994
55410,JH584299.1,ENSEMBL,gene,910289,913083,.,-,.,"gene_id ""ENSMUSG00000095475.2""; gene_type ""pro...",1870005
55411,JH584299.1,ENSEMBL,gene,921942,924675,.,+,.,"gene_id ""ENSMUSG00000094855.2""; gene_type ""pro...",1870016
55412,JH584303.1,ENSEMBL,gene,81607,82689,.,+,.,"gene_id ""ENSMUSG00000095019.2""; gene_type ""pro...",1870027


In [6]:
# Load the genome structure model of cell 7 from its PDB file.
# (PDBParser comes from the import cell at the top of the notebook.)
folder_path = 'data/GSE80280/GSE80280-cell/'
cell7_pdb_file = 'cell7/GSM2219503_Cell_7_genome_structure_model.pdb'

parser = PDBParser()
genome_structure_model = parser.get_structure(
    'genome_structure',
    folder_path + cell7_pdb_file,
)

In [8]:
# Adding gene info to atoms and creating bonds.
#
# For each model of the structure this builds a node-link graph:
#   nodes - one per atom, annotated with the gene whose [start, end] interval
#           contains the atom's genomic position (None when no gene matches)
#   edges - one per (atom, neighbour) pair found within NEIGHBOR_RADIUS
# The result is stored as node_link_dict[model_id] = {"nodes": [...], "edges": [...]}.

NEIGHBOR_RADIUS = 1.5  # TODO : Adjust the radius as needed
# Factor applied to the residue number before it is compared with gene start/end.
# NOTE(review): presumably the genomic span represented by one residue - confirm.
RESIDUE_TO_POSITION = 100000

node_link_dict = {}
for model in genome_structure_model:
    nodes = []
    links = []

    model_index = model.get_id()

    # Spatial index over every atom of this model, used for the neighbour queries below.
    ns = NeighborSearch(list(model.get_atoms()))

    # Extract atom coordinates and other information.
    atoms = []
    for chain in model:
        chain_id = chain.get_id()
        for residue in chain:
            residue_name = residue.get_resname()
            residue_position = residue.get_id()[1] * RESIDUE_TO_POSITION
            for atom in residue:
                atoms.append({
                    'atom_name': atom.get_id(),
                    'residue_name': residue_name,
                    'residue': residue_position,
                    'chain_id': chain_id,
                    'id': atom.get_serial_number(),
                    'coords': atom.get_coord()
                })

    # Convert the atom information to a pandas DataFrame.
    # Columns: atom_name, residue_name, residue, chain_id, id, coords (+ gene_id, chrname below)
    df = pd.DataFrame(atoms)

    # Gene annotation columns; atoms that match no gene keep None.
    df['gene_id'] = None
    df['chrname'] = None

    # Map atoms to genes. Genes are visited in table order, so when several gene
    # intervals contain the same position the last one wins.
    # NOTE(review): only the position is compared - every gene is tested against
    # atoms of every chain, regardless of chromosome. If chains correspond to
    # chromosomes, the match should also be restricted by chromosome; the
    # chain -> chromosome mapping is not visible here, so this is left unchanged.
    residue_positions = df['residue']
    gene_intervals = gene_rows[['id', 'chrName', 'start', 'end']]
    # itertuples over just the needed columns avoids building a Series per gene row.
    for gene_id, chrname, start, end in gene_intervals.itertuples(index=False, name=None):
        mask = residue_positions.between(start, end)  # inclusive on both ends
        if not mask.any():
            continue  # nothing to assign for this gene
        df.loc[mask, 'gene_id'] = gene_id
        df.loc[mask, 'chrname'] = chrname

    print(df['gene_id'].value_counts())

    for _, row in df.iterrows():
        atom_id = row['id']
        nodes.append({
            "id": atom_id,
            "name": row["atom_name"],
            "residue_id": row["residue"],
            "chain": row["chain_id"],
            "coord": row['coords'].tolist(),
            "gene_id": row["gene_id"],
            "chrname": row['chrname']
        })

        # Find neighbors within a distance threshold - radius
        for neighbor in ns.search(row['coords'], NEIGHBOR_RADIUS):
            neighbor_id = neighbor.get_serial_number()
            if neighbor_id == atom_id:
                # The query atom lies at distance 0 from itself and is always
                # returned by the search; a self-link is not a bond.
                continue
            links.append({"source": atom_id, "target": neighbor_id})

    node_link_dict[model_index] = {"nodes": nodes, "edges": links}
    # Only the first model is processed; remove this break to build every model.
    break

gene_id
1530756    520
1736805    400
1856165    400
1856670    400
1683515    400
          ... 
1758082     10
1859332     10
1596500     10
1799374     10
1596865      4
Name: count, Length: 155, dtype: int64


In [9]:
# Number of distinct gene ids assigned to atoms (atoms left as None are not counted).
df.loc[:, 'gene_id'].nunique()

155

In [10]:
# Atoms per assigned gene id, most frequent first (atoms left as None are excluded).
df.loc[:, 'gene_id'].value_counts()

gene_id
1530756    520
1736805    400
1856165    400
1856670    400
1683515    400
          ... 
1758082     10
1859332     10
1596500     10
1799374     10
1596865      4
Name: count, Length: 155, dtype: int64

In [83]:
# Export step, currently disabled: every line below is commented out. When
# enabled it writes one indented JSON file per entry of node_link_dict into
# data/extracted-node-link/cell7/.
# NOTE(review): the output kept under this cell lists models 0-9, so it appears
# to come from an earlier run; the graph-building loop now stops after its
# first model (`break`), which leaves a single entry in node_link_dict.
# NOTE(review): the log message prints json_file (bare file name), not filePath.
# for nd in node_link_dict.keys():
#     json_file = f"genome_cell7_model{nd}_gene_info.json"
#     filePath = 'data/extracted-node-link/cell7/'+json_file
# #     print(json_file)
#     with open(filePath, "w") as outfile:
#         json.dump(node_link_dict[nd], outfile, indent=4)
#     print(f"Node-link dictionary saved to {json_file}")

Node-link dictionary saved to genome_cell7_model0_gene_info.json
Node-link dictionary saved to genome_cell7_model1_gene_info.json
Node-link dictionary saved to genome_cell7_model2_gene_info.json
Node-link dictionary saved to genome_cell7_model3_gene_info.json
Node-link dictionary saved to genome_cell7_model4_gene_info.json
Node-link dictionary saved to genome_cell7_model5_gene_info.json
Node-link dictionary saved to genome_cell7_model6_gene_info.json
Node-link dictionary saved to genome_cell7_model7_gene_info.json
Node-link dictionary saved to genome_cell7_model8_gene_info.json
Node-link dictionary saved to genome_cell7_model9_gene_info.json
