coded #99

Merged
merged 2 commits on Sep 14, 2021
15 changes: 7 additions & 8 deletions phippery/modeling.py
@@ -16,6 +16,7 @@
from phippery.negbinom import fit_neg_binom
from phippery.negbinom import mlxp_neg_binom


def gamma_poisson_model(
    ds,
    starting_alpha=0.8,
@@ -71,6 +72,7 @@ def gamma_poisson_model(
    ds_copy[new_table_name] = xr.DataArray(counts)
    return (alpha, beta), ds_copy
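
For context, a hypothetical call sketch (not part of this diff); it assumes ds is a phippery xarray dataset with a "counts" table and uses only the parameters visible above:

# Hypothetical usage sketch -- `ds` is assumed to be a phippery dataset.
from phippery.modeling import gamma_poisson_model

(alpha, beta), fit_ds = gamma_poisson_model(ds, starting_alpha=0.8)
# alpha and beta are the fitted gamma-Poisson parameters; fit_ds is a
# copy of ds carrying the new model table.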


def neg_binom_model(
    ds,
    beads_ds,
@@ -104,22 +106,19 @@ def neg_binom_model(
    upper_bound = st.scoreatpercentile(beads_counts.values, trim_percentile)
    trimmed_data = np.ma.masked_greater(beads_counts.values, upper_bound)

    nb_mu = []
    nb_alpha = []
    nb_var = []
    nb_size = []
    nb_prob = []
    for i in range(beads_counts.shape[0]):
        (mu, alpha, var, size, prob) = fit_neg_binom(trimmed_data[i].compressed(), nb_p)
        nb_mu.append(mu)
        nb_alpha.append(alpha)
        nb_var.append(var)
        nb_size.append(size)
        nb_prob.append(prob)


    counts = copy.deepcopy(ds[f"{data_table}"].to_pandas())
    counts = counts.round(2)
    counts.loc[:, :] = mlxp_neg_binom(counts, nb_size, nb_prob)
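
The percentile trim above masks extreme beads-only counts before the negative-binomial fit. A minimal standalone sketch of that step on synthetic data (nothing here is from the PR itself):

# Sketch of the trimming step: mask counts above a high percentile, then
# fit each row on the remaining values.
import numpy as np
import scipy.stats as st

rng = np.random.default_rng(0)
beads = rng.poisson(lam=20, size=(5, 100)).astype(float)  # peptides x beads-only samples

upper_bound = st.scoreatpercentile(beads, 99.9)
trimmed = np.ma.masked_greater(beads, upper_bound)

row_values = trimmed[0].compressed()  # unmasked values for one peptide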
100 changes: 48 additions & 52 deletions phippery/normalize.py
@@ -23,8 +23,8 @@

def standardized_enrichment(
    ds,
    lib_ds,
    mock_ip_ds,
    data_table="counts",
    inplace=True,
    new_table_name="std_enrichment",
@@ -36,6 +36,8 @@ def standardized_enrichment(
    pseudo counts are added like so:
    https://jbloomlab.github.io/dms_tools2/diffsel.html#id5

    If inplace is True, the new table is added without modifying any other
    values in the dataset.

    :param: ds <xarray.Dataset> - An xarray dataset obtained from three tables
        provided to phippery.collect

@@ -56,19 +58,17 @@
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)

    # TODO and verify(ds) == True
    for control in [lib_ds, mock_ip_ds]:
        if type(control) != xr.Dataset:
            raise ValueError(
                "lib_ds and mock_ip_ds must each be of type xarray.Dataset"
            )

    std_enrichments = _comp_std_enr(
        counts=ds[data_table].to_pandas(),
        lib_counts=lib_ds[data_table].to_pandas(),
        mock_ip_counts=mock_ip_ds[data_table].to_pandas(),
    )

    if inplace:
@@ -80,48 +80,52 @@
    return ds_copy
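
The headline change of this diff: control samples are now passed as their own datasets rather than as column-index lists. A hypothetical call sketch, assuming lib_ds and mock_ip_ds are pre-sliced xarray Datasets holding only the library-control and beads-only samples, each with the same "counts" table as ds:

# Hypothetical usage sketch of the new control-dataset API.
from phippery.normalize import standardized_enrichment

std_ds = standardized_enrichment(
    ds, lib_ds, mock_ip_ds, data_table="counts", inplace=False
)
# with inplace=False, a copy of ds is returned carrying the new
# "std_enrichment" table.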


def _comp_std_enr(counts, lib_counts, mock_ip_counts):

    normalized_ds_counts = copy.deepcopy(counts)

    # average the library and mock-IP control counts
    ds_lib_counts_mean = lib_counts.mean(axis=1)
    ds_bead_counts_mean = mock_ip_counts.mean(axis=1)
    ds_lib_counts_mean_sum = sum(ds_lib_counts_mean)

    # compute beads control std enrichment
    pseudo_sample = ds_bead_counts_mean + max(
        1, sum(ds_bead_counts_mean) / ds_lib_counts_mean_sum
    )
    pseudo_lib_counts = ds_lib_counts_mean + max(
        1, ds_lib_counts_mean_sum / sum(ds_bead_counts_mean)
    )
    pseudo_sample_freq = pseudo_sample / sum(pseudo_sample)
    pseudo_lib_counts_freq = pseudo_lib_counts / sum(pseudo_lib_counts)
    pseudo_bead_enrichment = pseudo_sample_freq / pseudo_lib_counts_freq

    # compute all sample standardized enrichment
    for sample_id, sample in counts.iteritems():
        pseudo_sample = sample + max(1, sum(sample) / ds_lib_counts_mean_sum)
        pseudo_lib_counts = ds_lib_counts_mean + max(
            1, ds_lib_counts_mean_sum / sum(sample)
        )
        pseudo_sample_freq = pseudo_sample / sum(pseudo_sample)
        pseudo_lib_counts_freq = pseudo_lib_counts / sum(pseudo_lib_counts)
        sample_enrichment = pseudo_sample_freq / pseudo_lib_counts_freq
        normalized_ds_counts.loc[:, sample_id] = (
            sample_enrichment - pseudo_bead_enrichment
        )

    return normalized_ds_counts
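
To make the pseudocount arithmetic above concrete, a small worked sketch with made-up numbers (not from the PR):

# Worked numeric sketch of the pseudocount logic in _comp_std_enr.
import numpy as np

sample = np.array([4.0, 6.0])      # one sample's counts, total 10
lib_mean = np.array([30.0, 70.0])  # mean library counts, total 100

pseudo_sample = sample + max(1, sample.sum() / lib_mean.sum())  # adds 1
pseudo_lib = lib_mean + max(1, lib_mean.sum() / sample.sum())   # adds 10

sample_freq = pseudo_sample / pseudo_sample.sum()  # [5/12, 7/12]
lib_freq = pseudo_lib / pseudo_lib.sum()           # [1/3, 2/3]
enr = sample_freq / lib_freq                       # [1.25, 0.875]
# _comp_std_enr then subtracts the analogous beads-only enrichment.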


def enrichment(
    ds, lib_ds, data_table="counts", inplace=True, new_table_name="enrichment",
):
"""
return a new xarray dataset same as the input
@@ -144,14 +148,14 @@ def enrichment(
f"{data_table} is not included in dataset. \n available datasets: {avail}"
)

    if type(lib_ds) != xr.Dataset:
        raise ValueError(
            "lib_ds must be of type xarray.Dataset"
        )

    enrichments = _comp_enr(
        counts=ds[data_table].to_pandas(), lib_counts=lib_ds[data_table].to_pandas()
    )

    if inplace:
        ds[new_table_name] = xr.DataArray(enrichments)
@@ -162,22 +166,19 @@
    return ds_copy
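
As with standardized_enrichment above, a hypothetical call sketch (lib_ds again assumed to be a pre-sliced library-control dataset):

# Hypothetical usage sketch, mirroring the standardized_enrichment call.
from phippery.normalize import enrichment

enr_ds = enrichment(ds, lib_ds, data_table="counts", inplace=False)
# the returned copy carries the new "enrichment" table.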


def _comp_enr(counts, lib_counts):
    # we are going to add an augmented counts matrix
    enrichments = copy.deepcopy(counts)

    # average the library control counts
    lib_counts_mean = lib_counts.mean(axis=1)
    lib_counts_mean_sum = sum(lib_counts_mean)

    # compute each sample's enrichment
    for sample_id, sample in enrichments.iteritems():
        pseudo_sample = sample + max(1, sum(sample) / lib_counts_mean_sum)
        pseudo_lib_control = lib_counts_mean + max(1, lib_counts_mean_sum / sum(sample))
        pseudo_sample_freq = pseudo_sample / sum(pseudo_sample)
        pseudo_lib_control_freq = pseudo_lib_control / sum(pseudo_lib_control)
        sample_enrichment = pseudo_sample_freq / pseudo_lib_control_freq
@@ -186,11 +187,6 @@ def _comp_enr(counts, lib_counts):
    return enrichments




def svd_rank_reduction(
    ds,
    rank=1,
21 changes: 12 additions & 9 deletions test/test_normalize.py
@@ -42,22 +42,25 @@ def test_comp_enr():
"""

    for i in range(3, 7):
        enr = _comp_enr(
            counts=pd.DataFrame(np.ones([i, 2])).astype(float),
            lib_counts=pd.DataFrame(np.ones([i, 1])).astype(float),
        )
        assert np.all(enr == np.ones([i, 2]))


def test_comp_std_enr():
    """
    test the standard enrichment of all ones which should result in zeros in all places
    """

    for i in range(3, 7):
        std_enr = _comp_std_enr(
            counts=pd.DataFrame(np.ones([i, 2])).astype(float),
            lib_counts=pd.DataFrame(np.ones([i, 1])).astype(float),
            mock_ip_counts=pd.DataFrame(np.ones([i, 1])).astype(float),
        )
        assert np.all(std_enr == np.zeros([i, 2]))


def test_comp_diff_sel():