In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import random

This is the Python implementation of the MutEx algorithm described in the PhD thesis "Analysis and visualization of multidimensional cancer genomics data": http://www.tdx.cat/handle/10803/301436

The same algorithm is also implemented in Java in the Gitools software: http://www.gitools.org

## Create a random sparse DataFrame representing mutations

In [2]:
# Build a reproducible toy mutation matrix: rows are genes, columns are samples,
# and a value of 1 marks a mutation event.
row, col = 100, 100
np.random.seed(77)

# sparse.random fills ~15% of the cells with values in [0, 1); np.ceil turns every
# non-zero value into 1, giving a binary matrix.
# .toarray() is used instead of the `.A` shorthand, which recent SciPy releases
# removed from sparse containers (it raises AttributeError there).
df = pd.DataFrame(sparse.random(row, col, density=0.15).toarray()).apply(np.ceil)

# Overwrite the first three genes with hand-built, largely non-overlapping patterns:
#   gene0 -> samples 0..19, gene1 -> samples 14..34, gene2 -> samples 81..99.
# Only gene0 and gene1 overlap (samples 14..19).
df.loc[0] = [1 if x < 20 else 0 for x in range(df.shape[1])]
df.loc[1] = [1 if 13 < x < 35 else 0 for x in range(df.shape[1])]
df.loc[2] = [1 if x > 80 else 0 for x in range(df.shape[1])]

# Human-readable labels: s0..s99 for samples, gene0..gene99 for genes.
df.columns = ['s' + str(x) for x in df.columns]
df.index = ['gene' + str(x) for x in df.index]

# Allow all 100 sample columns to be rendered in the preview below.
pd.set_option('display.max_columns', 1000)
df.head()

Unnamed: 0,s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21,s22,s23,s24,s25,s26,s27,s28,s29,s30,s31,s32,s33,s34,s35,s36,s37,s38,s39,s40,s41,s42,s43,s44,s45,s46,s47,s48,s49,s50,s51,s52,s53,s54,s55,s56,s57,s58,s59,s60,s61,s62,s63,s64,s65,s66,s67,s68,s69,s70,s71,s72,s73,s74,s75,s76,s77,s78,s79,s80,s81,s82,s83,s84,s85,s86,s87,s88,s89,s90,s91,s92,s93,s94,s95,s96,s97,s98,s99
gene0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
gene1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
gene2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
gene3,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
gene4,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## We import mutex and create a MutEx instance

The MutEx instance has to be created with the full data matrix, so that it has the correct background event (mutation) rates for both samples and genes.

In [3]:
from mutex.mutex import MutEx

In [4]:
# Create the MutEx instance from the complete gene x sample matrix `df`, so the
# background mutation rates come from all genes and samples.
# `permutations=2000` matches the "Permutations: 2000" line in the printed results below.
m = MutEx(background=df, permutations=2000)

### Example run, single-threaded

In [5]:
# Test one group of three genes with parallelism switched off (parallel=False);
# %time reports how long the call takes.
%time r = m.calculate(['gene4', 'gene5', 'gene6'], parallel=False)
# Printing the result shows its text summary (p-values, permutations, coverage, signal).
print(r)



CPU times: user 31.7 ms, sys: 8.15 ms, total: 39.8 ms
Wall time: 5.94 s
MuTexResult
  Mutual Exclusive p-value:   0.7515
  Co-occurence p-value:       0.4245
  Permutations:               2000
  Sample Coverage:            40.0
  Signal:                     48.0


### Example run, multi-threaded

In [6]:
# Same kind of call, but `parallel` is left at its default.
# NOTE(review): presumably the default is the parallel path, as the section heading says;
# confirm in MutEx.calculate.
# gene0, gene1 and gene2 are the rows that were overwritten by hand when `df` was built.
%time r = m.calculate(['gene0', 'gene1', 'gene2'])
print(r)


CPU times: user 121 ms, sys: 70.5 ms, total: 192 ms
Wall time: 891 ms
MuTexResult
  Mutual Exclusive p-value:   0.0135
  Co-occurence p-value:       1.0
  Permutations:               2000
  Sample Coverage:            54.0
  Signal:                     60.0


### Example with many groups

In [7]:
# Lazily produce ten random gene groups (a plain list would work just as well).
random.seed(18)


def _sample_gene_groups(n_groups):
    """Yield `n_groups` random groups of 2, 3 or 4 gene names drawn from `df.index`."""
    for _ in range(n_groups):
        gene_names = df.index.tolist()
        # Pick the group size first, then draw that many genes without replacement.
        group_size = random.sample([2, 3, 4], 1)[0]
        yield random.sample(gene_names, group_size)


group_generator = _sample_gene_groups(10)

In [8]:
# Run MutEx on every sampled group.
result_list = list(map(m.calculate, group_generator))

# One table row per result, built from each result object's attribute dictionary.
result_df = pd.DataFrame.from_records([vars(res) for res in result_list])

# Show the summary columns, ordered by ascending mutual-exclusivity p-value.
summary_columns = [
    'items',
    'coverage',
    'signal',
    'mutex_pvalue',
    'co_occurence_pvalue',
    'signal_coverage_ratio',
    'mean_sim_coverage',
]
result_df[summary_columns].sort_values(by='mutex_pvalue')


Unnamed: 0,items,coverage,signal,mutex_pvalue,co_occurence_pvalue,signal_coverage_ratio,mean_sim_coverage
4,"[gene32, gene88]",39,39,0.015,1.0,1.0,35.2365
1,"[gene42, gene30, gene25]",48,54,0.1175,0.9325,0.888889,44.7295
6,"[gene97, gene22, gene30, gene21]",55,68,0.161,0.907,0.808824,51.8075
8,"[gene73, gene64, gene25]",46,53,0.2615,0.8695,0.867925,44.1655
7,"[gene25, gene94]",34,36,0.348,0.8325,0.944444,32.885
2,"[gene80, gene63, gene23]",42,48,0.468,0.676,0.875,41.326
3,"[gene37, gene58, gene33]",39,47,0.751,0.466,0.829787,39.78
0,"[gene15, gene84]",28,31,0.843,0.341,0.903226,28.864
9,"[gene96, gene63, gene27, gene91]",43,56,0.854,0.2505,0.767857,44.902
5,"[gene41, gene66]",29,36,0.988,0.012,0.805556,32.67
