In [4]:
from __future__ import print_function
import argparse
import pprint
import numpy as np
import pandas as pd
import anndata

from GeneGloVe import GeneGloVe  # Assuming the GeneGloVe class is saved in this file
from scipy.sparse import coo_matrix

def read_gene_expression(filename):
    """
    Read a gene expression dataset from an .h5ad file and yield one
    expression vector per row of ``adata.X``.

    Parameters
    ----------
    filename : str or path-like
        Path of the .h5ad file, passed straight to ``anndata.read_h5ad``.

    Yields
    ------
    numpy.ndarray
        A dense 1-D array holding one row of ``adata.X``.

    Notes
    -----
    ``adata.X`` may be a SciPy sparse matrix, in which case iterating it
    produces 1 x n sparse rows rather than vectors.  Those rows break
    consumers that call ``np.nonzero(row)[0]`` (it returns the row index,
    always 0, instead of the column indices), so every row is densified and
    flattened before being yielded.
    """
    adata = anndata.read_h5ad(filename)
    for row in adata.X:  # Assuming .X contains the expression data
        if hasattr(row, "toarray"):
            # Sparse row: convert to a dense (1, n) array first.
            row = row.toarray()
        yield np.asarray(row).ravel()


def create_co_occurrence_matrix(data, gene_names):
    """
    Create a co-occurrence matrix from gene expression data.

    Two genes co-occur in an expression vector when both have a non-zero
    value in it.  Every co-occurrence adds 1 to entry ``(i, j)`` with
    ``i < j``, so only the strict upper triangle is populated.

    Parameters
    ----------
    data : iterable
        Expression vectors, one per observation.  Each may be a 1-D array,
        a ``(1, n)`` array, or a SciPy sparse row; it is flattened before
        use, so position ``k`` always refers to gene ``k``.
    gene_names : sequence
        Gene names; only its length is used, to size the matrix.

    Returns
    -------
    scipy.sparse.coo_matrix
        ``(num_genes, num_genes)`` integer matrix in COO format with
        duplicate coordinates already summed, so each stored entry is the
        total co-occurrence count of that gene pair.
    """
    num_genes = len(gene_names)
    row_chunks, col_chunks = [], []

    for expression in data:
        if hasattr(expression, "toarray"):
            # Sparse row: np.nonzero on it would report (row, col) pairs.
            expression = expression.toarray()
        # flatnonzero works on the flattened vector, so a (1, n) row still
        # gives gene (column) indices rather than row indices.
        expressed = np.flatnonzero(np.asarray(expression))
        if expressed.size < 2:
            continue  # no pair can be formed

        # All index pairs (a, b) with a < b, in the same order the nested
        # "for i / for j in range(i + 1, ...)" loops would visit them.
        first, second = np.triu_indices(expressed.size, k=1)
        row_chunks.append(expressed[first])
        col_chunks.append(expressed[second])

    if row_chunks:
        rows = np.concatenate(row_chunks)
        cols = np.concatenate(col_chunks)
    else:
        rows = np.empty(0, dtype=np.intp)
        cols = np.empty(0, dtype=np.intp)

    # Simple increment: each observed pair contributes a count of 1.
    # Adjust based on your co-occurrence logic.
    counts = np.ones(rows.size, dtype=np.int64)

    matrix = coo_matrix((counts, (rows, cols)), shape=(num_genes, num_genes))
    # Collapse repeated coordinates into a single summed entry; otherwise a
    # consumer reading .row/.col/.data directly sees many entries of value 1
    # instead of one entry holding the real count.
    matrix.sum_duplicates()
    return matrix

def main(argv=None):
    """
    Command-line entry point: build the co-occurrence matrix, train the
    GloVe model and/or query a saved model, depending on the flags given.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse.  ``None`` (the default) makes argparse read
        ``sys.argv[1:]``.  Inside a Jupyter kernel ``sys.argv`` carries the
        kernel's own ``-f <connection file>`` option, which argparse rejects
        as an unrecognized argument; pass an explicit list there, e.g.
        ``main(['--create', 'data.h5ad', '--train', '10'])``.

    Side effects
    ------------
    ``--create`` writes ``gene_names.npy``; ``--train`` writes
    ``gene_glove.model``; ``--query`` reads ``gene_glove.model``.
    """
    # Set up command line parameters.
    parser = argparse.ArgumentParser(description='Fit a GloVe model for gene embeddings.')

    parser.add_argument('--create', '-c', action='store',
                        default=None,
                        help='The filename of the gene expression dataset.')
    # type=int so that "--train 0" is falsy instead of the truthy string "0".
    parser.add_argument('--train', '-t', action='store',
                        type=int, default=0,
                        help='Train the GloVe model with this number of epochs.')
    parser.add_argument('--parallelism', '-p', action='store',
                        type=int, default=1,
                        help='Number of parallel threads to use for training.')
    parser.add_argument('--query', '-q', action='store',
                        default='',
                        help='Get closest genes to this gene.')
    args = parser.parse_args(argv)

    # The co-occurrence matrix only exists in memory after --create, so
    # training without it cannot work; fail with a usage error up front
    # rather than with a NameError halfway through.
    if args.train and not args.create:
        parser.error('--train requires --create: the co-occurrence matrix is '
                     'built in memory and is not saved to disk.')

    if args.create:
        # Read the gene expression data
        print('Reading gene expression dataset...')
        data = list(read_gene_expression(args.create))

        # The dataset is an .h5ad file (it is read with anndata above), so
        # the gene names come from the AnnData variable index rather than
        # from a CSV header.  Backed mode avoids loading X a second time.
        # NOTE(review): assumes genes are the variables (columns of X).
        adata = anndata.read_h5ad(args.create, backed='r')
        try:
            gene_names = np.array(adata.var_names)
        finally:
            adata.file.close()

        # Create the co-occurrence matrix
        print('Creating co-occurrence matrix...')
        co_occurrence_matrix = create_co_occurrence_matrix(data, gene_names)

        # Save gene names for later use
        np.save('gene_names.npy', gene_names)

    if args.train:
        # Train the GloVe model and save it to disk.
        print('Training the GloVe model...')
        glove = GeneGloVe(no_components=100, learning_rate=0.05)
        glove.fit(co_occurrence_matrix, epochs=args.train,
                  no_threads=args.parallelism, verbose=True)

        # Load gene names for the dictionary (name -> matrix index).
        # allow_pickle is needed because the names may be stored as an
        # object array; the file is the one written by --create above.
        gene_names = np.load('gene_names.npy', allow_pickle=True).tolist()
        gene_dictionary = {name: idx for idx, name in enumerate(gene_names)}
        glove.add_dictionary(gene_dictionary)

        # Save the model
        glove.save('gene_glove.model')

    if args.query:
        # Query the model for most similar genes.
        print('Loading pre-trained GloVe model...')
        glove = GeneGloVe.load('gene_glove.model')

        print('Querying for %s' % args.query)
        similar_genes = glove.most_similar(args.query, number=10)
        pprint.pprint(similar_genes)


if __name__ == '__main__':
    main()


usage: ipykernel_launcher.py [-h] [--create CREATE] [--train TRAIN]
                             [--parallelism PARALLELISM] [--query QUERY]
ipykernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-c2edb566-3324-4be2-9021-eded6af5c827.json


SystemExit: 2

In [7]:
# Second attempt at an end-to-end run, driving GeneGloVe directly with the
# dataset path instead of going through the argparse script.
# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: 'GeneGloVe' object has no attribute 'build_corpus'", so
# the imported GeneGloVe class does not define build_corpus().  Confirm
# which methods the class actually provides before relying on
# read_data() / fit_glove() / save_model() as well.
gene_glove = GeneGloVe('/mnt/Cortex.h5ad')
# Presumably returns the expression matrix and the matching gene names --
# TODO confirm against the GeneGloVe implementation.
expression_data, gene_names = gene_glove.read_data()
# This is the call that raised the AttributeError in the recorded output.
gene_glove.build_corpus(expression_data, gene_names)
gene_glove.fit_glove()
gene_glove.save_model("gene_glove.model")

AttributeError: 'GeneGloVe' object has no attribute 'build_corpus'