In [1]:
import pandas as pd
from pymongo import MongoClient
import plotly.express as px
import itertools
from tqdm import tqdm
from common.database import init_database
from itertools import combinations
import random
from iteration_utilities import random_combination
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import numpy as np
from scipy.stats import kruskal
import itertools
import pandas as pd

In [2]:
# Requires the PyMongo package.
# https://api.mongodb.com/python/current


db = init_database('omics-database')
result = db['SomaticMutation'].aggregate([
    {
        '$group': {
            '_id': '$name',
            'count': {
                '$sum': '$value'
            }
        }
    }
])
df = pd.DataFrame(result)
df = df.sort_values(['count'], axis=0, ascending=False)

In [3]:
df[:10]

Unnamed: 0,_id,count
35069,PIK3CA,273
38,TP53,264
31755,TTN,137
24407,CDH1,102
39576,GATA3,97
39392,MUC16,78
33615,KMT2C,77
3476,MAP3K1,66
32660,SYNE1,50
21828,PTEN,48


In [4]:
px.bar(df[:40], x="_id", y="count")

In [5]:
names = df._id[:10]
names = list(set(list(names) + ['BRCA1']))
names

['TP53',
 'GATA3',
 'MUC16',
 'MAP3K1',
 'KMT2C',
 'SYNE1',
 'PTEN',
 'BRCA1',
 'CDH1',
 'PIK3CA',
 'TTN']

In [6]:
out = []
for item in tqdm(x for l in range(2, 6) for x in itertools.combinations(names, l)):
    try:
        count = next(db['SomaticMutation'].aggregate([
            {
                '$match': {
                    'name': {
                        '$in': item
                    },
                    'value': 1
                }
            }, {
                '$group': {
                    '_id': '$patient',
                    'names': {
                        '$push': '$name'
                    },
                    'patients': {'$push': "$patient"}
                }
            }, {
                '$addFields': {
                    'count': {
                        '$size': '$names'
                    }
                }
            }, {
                '$match': {
                    'count': len(item)
                }
            }, {
                '$count': 'count'
            }
        ]))['count']
        if count >= 2:
            out.append({'genes': str(item), 'count': count})
    except:
        pass

1012it [00:12, 83.01it/s]


In [7]:
out = sorted(out, key=lambda x: (len(x['genes']), x['count']))
df = pd.DataFrame(out)
df

Unnamed: 0,genes,count
0,"('PTEN', 'TTN')",17
1,"('CDH1', 'TTN')",26
2,"('TP53', 'TTN')",55
3,"('TP53', 'CDH1')",6
4,"('BRCA1', 'TTN')",9
...,...,...
178,"('MUC16', 'KMT2C', 'SYNE1', 'CDH1', 'TTN')",2
179,"('MUC16', 'KMT2C', 'CDH1', 'PIK3CA', 'TTN')",2
180,"('MUC16', 'MAP3K1', 'CDH1', 'PIK3CA', 'TTN')",2
181,"('MUC16', 'KMT2C', 'SYNE1', 'PIK3CA', 'TTN')",3


In [8]:
fig = px.bar(df, x='genes', y='count')
fig.show()

In [9]:
def count_combs(db, comb):
    try:
        count = next(db['SomaticMutation'].aggregate([
            {
                '$match': {
                    'name': {
                        '$in': comb
                    },
                    'value': 1
                }
            }, {
                '$group': {
                    '_id': '$patient',
                    'names': {
                        '$push': '$name'
                    },
                    'patients': {'$push': "$patient"}
                }
            }, {
                '$addFields': {
                    'count': {
                        '$size': '$names'
                    }
                }
            }, {
                '$match': {
                    'count': len(comb)
                }
            }, {
                '$count': 'count'
            }
        ]))['count']
        return count
    except:
        return 0

In [10]:
futures = []
db = init_database('omics-database')
names = db['SomaticMutation'].find().distinct('name')
with ThreadPoolExecutor(max_workers=20) as executor:
    for i in tqdm(range(int(1e5))):
        comb = random_combination(names, 1)
        futures.append(executor.submit(count_combs, db, list(comb)))
values = [future.result() for future in tqdm(futures)]

In [25]:
px.histogram(pd.DataFrame(dict(values=values)), nbins=100, histnorm='probability')