# Gene2Vec

In [1]:
# Notebook setup: enable the autoreload extension (mode 2) and inline matplotlib output
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Imports
import numpy as np
import pandas as pd

from pathlib import Path
from util import *
from tcga_dna import *

# Pre-processing MSigDB

In [3]:
def read_msigdb(path):
    """
    Read a single MSigDB file and return its gene sets.

    Every non-empty line is split on tabs: the first field is the gene-set
    name, the second field is ignored, and each remaining field is taken as
    a gene name.

    Parameters:
        path: Object exposing `read_text()` (e.g. a `pathlib.Path`).

    Returns:
        Dictionary keyed by gene-set name; each value is a dictionary of
        the form `{gene name: 1}`. If a gene-set name appears on more than
        one line, the last occurrence wins.
    """
    msigdb = dict()
    for line in path.read_text().split('\n'):
        fields = line.split('\t')
        geneset_name = fields[0]
        # Lines without a gene-set name (e.g. the empty string after the final newline) are skipped
        if geneset_name:
            # Drop empty fields (trailing or doubled tabs) so '' is never recorded as a gene
            msigdb[geneset_name] = {f: 1 for f in fields[2:] if f}
    return msigdb


def read_msigdb_all(path, regex="*.gmt"):
    """
    Read all MSigDb files under `path` and merge them into a single dictionary.

    Files are obtained from `path.find_files(regex)`; each file name is printed
    and the file is parsed with `read_msigdb`. The result is keyed by gene-set
    name, with the per-gene-set dictionaries produced by `read_msigdb` as values.
    When a gene-set name occurs in several files, the file read last wins.
    """
    # NOTE(review): `find_files` is not a standard pathlib method - presumably
    # added by the project's `util` module; confirm how it interprets `regex`.
    merged = {}
    for gmt_file in path.find_files(regex):
        print(f"File: {gmt_file}")
        for geneset_name, members in read_msigdb(gmt_file).items():
            merged[geneset_name] = members
    return merged


def msigdb2genes(msigdb):
    """ Return every distinct gene name found in any gene set of `msigdb`, as a sorted list """
    unique_genes = set()
    for members in msigdb.values():
        unique_genes.update(members.keys())
    return sorted(unique_genes)


def msigdb2gene_sets(msigdb):
    """ Return the gene-set names (the keys of `msigdb`) as a new sorted list """
    return sorted(msigdb.keys())


def msigdb2df(path):
    """
    Read all MSigDb files in `path` and build a gene-set membership dataframe.

    Parameters:
        path: Directory object passed to `read_msigdb_all`.

    Returns:
        `pandas.DataFrame` of dtype int8 with one row per gene set (sorted)
        and one column per gene (sorted). A cell is 1 when the gene belongs
        to the gene set and 0 otherwise.
    """
    msigdb = read_msigdb_all(path)
    genes = msigdb2genes(msigdb)
    gene_sets = msigdb2gene_sets(msigdb)
    # Fill an int8 matrix directly. Passing dtype='int8' to the DataFrame
    # constructor with dict-of-dict data does not work reliably, because the
    # cells for absent genes start out as NaN, which int8 cannot represent.
    gene_column = {gene: col for col, gene in enumerate(genes)}
    membership = np.zeros((len(gene_sets), len(genes)), dtype='int8')
    for row, gs in enumerate(gene_sets):
        cols = [gene_column[gene] for gene in msigdb[gs]]
        if cols:
            membership[row, cols] = 1
    return pd.DataFrame(membership, index=gene_sets, columns=genes)


def geneset_gene_pairs(msigdb):
    """ Lazily yield one (geneset, gene) tuple per gene of every gene set in the MsigDb dictionary """
    for geneset_name, members in msigdb.items():
        yield from ((geneset_name, gene) for gene in members.keys())

def save_pairs(msigdb, path_save):
    """
    Write all (geneset, gene) pairs of an MsigDb dictionary to a text file.

    The file holds one pair per line, formatted as `geneset,gene`, with lines
    joined by '\\n' and no trailing newline.

    Parameters:
        msigdb: Dictionary keyed by gene-set name, as consumed by `geneset_gene_pairs`.
        path_save: Object exposing `write_text()` (e.g. a `pathlib.Path`).
    """
    pairs_str = '\n'.join(f"{gs},{g}" for gs, g in geneset_gene_pairs(msigdb))
    path_save.write_text(pairs_str)

In [6]:
# Load the MSigDB files under data/msigdb (default '*.gmt' pattern) into one merged dictionary
path = Path('data/msigdb')
msigdb = read_msigdb_all(path)

File: data/msigdb/c6.all.v7.0.symbols.gmt
File: data/msigdb/c2.all.v7.0.symbols.gmt
File: data/msigdb/c7.all.v7.0.symbols.gmt
File: data/msigdb/c5.all.v7.0.symbols.gmt
File: data/msigdb/h.all.v7.0.symbols.gmt


In [None]:
# Write the (geneset, gene) pairs to msigdb_pairs.txt inside the same data directory
save_pairs(msigdb, path/'msigdb_pairs.txt')