-
Notifications
You must be signed in to change notification settings - Fork 0
/
GenerateSyntheticExpressionMatrix.py
150 lines (121 loc) · 5.97 KB
/
GenerateSyntheticExpressionMatrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# coding: utf-8
import os
#os.chdir('/Users/louis.cammarata/Documents/ResearchProjects/2018/COMET/COMET-Simulations')
import hgmd_v1 as hgmd
import pandas as pd
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
from matplotlib import cm
# Generate cells-by-gene matrix
def createExpressionMat(n_cells,
cluster_size,
c1_loc, c1_var,
c0_loc,c0_var,
n_outliers,
cout_loc,cout_var,
normal_dist=True):
# Create cell names
index = ['cell_'+str(i) for i in np.arange(0,n_cells)]
# Create cluster assignment vector
cluster = np.concatenate((np.repeat(1,cluster_size),np.repeat(0,n_cells-cluster_size)),axis=0)
# Create gene expression vector
if (normal_dist==True):
# Create gene expression vector in cluster 0
gene_c0 = np.random.normal(loc=c0_loc,scale=c0_var,size=n_cells-cluster_size)
# Create gene expression vector in cluster 1 for non outliers
gene_c1_std = np.random.normal(loc=c1_loc,scale=c1_var,size=cluster_size-n_outliers)
# Create gene expression vector in cluster 1 for outliers
gene_c1_out = np.random.normal(loc=cout_loc,scale=cout_var,size=n_outliers)
else:
# Create gene expression vector in cluster 0
gene_c0 = ss.cauchy.rvs(loc=c0_loc,scale=c0_var,size=n_cells-cluster_size)
# Create gene expression vector in cluster 1 for non outliers
gene_c1_std = ss.cauchy.rvs(loc=c1_loc,scale=c1_var,size=cluster_size-n_outliers)
# Create gene expression vector in cluster 1 for outliers
gene_c1_out = ss.cauchy.rvs(loc=cout_loc,scale=cout_var,size=n_outliers)
gene_exp = np.concatenate((gene_c1_out,gene_c1_std,gene_c0),axis=0)
# Create dataframe
tmp = {'cluster':cluster,'gene':gene_exp}
cell_data = pd.DataFrame(tmp,index=index)
return(cell_data)
def createNBExpressionMat(n_cells,
cluster_size,
c1_n, c1_p,
c0_n,c0_p,
n_outliers,
cout_n,cout_p):
# Create cell names
index = ['cell_'+str(i) for i in np.arange(0,n_cells)]
# Create cluster assignment vector
cluster = np.concatenate((np.repeat(1,cluster_size),np.repeat(0,n_cells-cluster_size)),axis=0)
# Create gene expression vector
# Create gene expression vector in cluster 0
gene_c0 = np.random.negative_binomial(n=c0_n, p=c0_p, size=n_cells-cluster_size)
# Create gene expression vector in cluster 1 for non outliers
gene_c1_std = np.random.negative_binomial(n=c1_n, p=c1_p, size=cluster_size-n_outliers)
# Create gene expression vector in cluster 1 for outliers
gene_c1_out = np.random.negative_binomial(n=cout_n, p=cout_p, size=n_outliers)
gene_exp = np.concatenate((gene_c1_out,gene_c1_std,gene_c0),axis=0)
# Normalize
gene_exp = gene_exp*np.power(10,6)/np.sum(gene_exp)
# Create dataframe
tmp = {'cluster':cluster,'gene':gene_exp}
cell_data = pd.DataFrame(tmp,index=index)
return(cell_data)
def createNBExpressionMat2(n_cells,
cluster_size,
c1_n, c1_p,
c0_n,c0_p,
offset,
n_outliers,
cout_n,cout_p):
# Create cell names
index = ['cell_'+str(i) for i in np.arange(0,n_cells)]
# Create cluster assignment vector
cluster = np.concatenate((np.repeat(1,cluster_size),np.repeat(0,n_cells-cluster_size)),axis=0)
# Create gene expression vector
# Create gene expression vector in cluster 0
gene_c0 = np.random.negative_binomial(n=c0_n, p=c0_p, size=n_cells-cluster_size)
# Create gene expression vector in cluster 1 for non outliers
gene_c1_std = np.random.negative_binomial(n=c1_n, p=c1_p, size=cluster_size-n_outliers)+offset
# Create gene expression vector in cluster 1 for outliers
gene_c1_out = np.random.negative_binomial(n=cout_n, p=cout_p, size=n_outliers)
gene_exp = np.concatenate((gene_c1_out,gene_c1_std,gene_c0),axis=0)
# Normalize
#gene_exp = gene_exp*np.power(10,6)/np.sum(gene_exp)
# Create dataframe
tmp = {'cluster':cluster,'gene':gene_exp}
cell_data = pd.DataFrame(tmp,index=index)
return(cell_data)
# Generate cells-by-genes matrix
def createMultipleExpressionMat(n_cells,
cluster_size,
params,
c0_loc,c0_var,
normal_dist=True):
# Create cell names
index = ['cell_'+str(i) for i in np.arange(0,n_cells)]
# Create gene names
n_genes = len(params)
genes = ['gene_'+str(i) for i in np.arange(0,n_genes)]
# Create cluster assignment vector
cluster = np.concatenate((np.repeat(1,cluster_size),np.repeat(0,n_cells-cluster_size)),axis=0)
# Create dataframe
tmp = {'cluster':cluster}
cell_data = pd.DataFrame(tmp,index=index)
for g in np.arange(0,n_genes):
# Create gene expression vector
if (normal_dist==True):
# Create gene expression vector in cluster 0
gene_c0 = np.random.normal(loc=c0_loc,scale=c0_var,size=n_cells-cluster_size)
# Create gene expression vector in cluster 1 for non outliers
gene_c1 = np.random.normal(loc=params[g][0],scale=params[g][1],size=cluster_size)
else:
# Create gene expression vector in cluster 0
gene_c0 = ss.cauchy.rvs(loc=c0_loc,scale=c0_var,size=n_cells-cluster_size)
# Create gene expression vector in cluster 1 for non outliers
gene_c1 = ss.cauchy.rvs(loc=params[g][0],scale=params[g][1],size=cluster_size)
gene_exp = np.concatenate((gene_c1,gene_c0),axis=0)
cell_data[genes[g]] = gene_exp
return(cell_data)