Commit 38992e4: code cleanup

maxibor committed Jun 30, 2021
1 parent 52c8dc4
Showing 4 changed files with 9 additions and 120 deletions.
2 changes: 2 additions & 0 deletions pydamage/accuracy_model.py
@@ -35,6 +35,8 @@ def fit_model(df, model):
df (pandas DataFrame): prepared pydamage results
model (pypmml model): GLM accuracy model
"""
+print(df)
+print(model.predict(df))
prediction = list(model.predict(df)["Predicted_sig"])
df["predicted_accuracy"] = prediction
return df["predicted_accuracy"].to_frame()
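For orientation, fit_model takes the prepared PyDamage results and a pypmml GLM, copies the model's "Predicted_sig" output into a "predicted_accuracy" column, and returns that column as a one-column DataFrame. A minimal usage sketch; the PMML file and CSV paths here are placeholders, not files shipped in this commit:

```python
import pandas as pd
from pypmml import Model

from pydamage.accuracy_model import fit_model

# Placeholder paths: swap in the real GLM PMML file and prepared results.
glm = Model.load("accuracy_model.pmml")
prepared = pd.read_csv("prepared_pydamage_results.csv")

# Returns a single-column DataFrame named "predicted_accuracy".
accuracy = fit_model(prepared, glm)
print(accuracy.head())
```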
3 changes: 1 addition & 2 deletions pydamage/cli.py
@@ -3,7 +3,7 @@
import click
from pydamage.main import pydamage_analyze
from pydamage.citation import get_citation
-from pydamage.kneedle import apply_filter
+from pydamage.filter import apply_filter
from pydamage import __version__
from collections import OrderedDict

@@ -107,7 +107,6 @@ def filter(ctx, no_args_is_help=True, **kwargs):
CSV: path to PyDamage result file
"""
-print(kwargs, ctx.obj)

apply_filter(**kwargs, **ctx.obj)

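With the import now pointing at pydamage.filter, the filter subcommand simply forwards its options to apply_filter. A hedged sketch of calling that helper directly, mirroring the signature visible in filter.py below; the CSV path and output directory are placeholders:

```python
from pydamage.filter import apply_filter

# threshold=0 asks apply_filter to pick the accuracy cutoff itself via the
# kneedle method (see define_threshold below); paths are placeholders.
apply_filter(
    csv="pydamage_results/pydamage_results.csv",
    threshold=0,
    outdir="pydamage_filtered",
    alpha=0.05,
)
```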
12 changes: 6 additions & 6 deletions pydamage/kneedle.py → pydamage/filter.py
@@ -4,7 +4,7 @@
from pandas import read_csv


-def find_knee(pydam_df, min_knee=0.5, alpha=0.05):
+def define_threshold(pydam_df, min_knee=0.5, alpha=0.05):
"""Find kneedle point in PyDamage results
Finding the kneedle point to get the optimal
@@ -13,15 +13,15 @@ def find_knee(pydam_df, min_knee=0.5, alpha=0.05):
Args:
pydam_df (pandas df): pydamage results
min_knee (float, optional): Min pred_accuracy threshold. Defaults to 0.5
alpha(float, optional): Alpha q-value threshold
"""
thresholds = [i.round(2) for i in arange(min_knee, 1, 0.01)]
-nb_contigs = list()
+nb_contigs = []
for i in thresholds:
nb_contigs.append(
pydam_df.query(f"pred_accuracy >= {i} & qvalue <= {alpha}").shape[0]
pydam_df.query(f"predicted_accuracy >= {i} & qvalue <= {alpha}").shape[0]
)
kneedle = KneeLocator(
thresholds,
@@ -45,7 +45,7 @@ def filter_pydamage_results(pydam_df, acc_thresh, alpha=0.05):
alpha (float, optional): Alpha q-value threshold. Defaults to 0.05.
"""

return pydam_df.query(f"pred_accuracy >= {acc_thresh} & qvalue <= {alpha}")
return pydam_df.query(f"predicted_accuracy >= {acc_thresh} & qvalue <= {alpha}")


def apply_filter(csv, threshold, outdir, alpha=0.05):
@@ -61,11 +61,11 @@ def apply_filter(csv, threshold, outdir, alpha=0.05):
df = read_csv(csv)
outfile = "pydamage_filtered_results.csv"
if threshold == 0:
-threshold = find_knee(df)
+threshold = define_threshold(df)
print(f"Optimal prediction accuracy threshold found to be: {threshold}")
filt_df = filter_pydamage_results(df, acc_thresh=threshold)
print(
f"Filtering PyDamage results with qvalue <={alpha} and pred_accuracy >= {threshold}"
f"Filtering PyDamage results with qvalue <= {alpha} and pred_accuracy >= {threshold}"
)
df_to_csv(filt_df, outdir, outfile)
print(f"Filtered PyDamage results written to {outdir}/{outfile}")
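For readers unfamiliar with the kneedle step wrapped by define_threshold: it counts how many contigs survive each candidate accuracy cutoff and lets kneed's KneeLocator pick the elbow of that curve. A standalone sketch of the same idea; the curve and direction arguments are assumptions, since this hunk truncates the KneeLocator call:

```python
import pandas as pd
from kneed import KneeLocator
from numpy import arange


def sketch_threshold(pydam_df: pd.DataFrame, min_knee: float = 0.5, alpha: float = 0.05):
    """Illustrative re-implementation of the thresholding idea in pydamage/filter.py."""
    thresholds = [round(float(t), 2) for t in arange(min_knee, 1, 0.01)]
    # Contigs kept at each candidate predicted_accuracy cutoff.
    nb_contigs = [
        pydam_df.query(f"predicted_accuracy >= {t} & qvalue <= {alpha}").shape[0]
        for t in thresholds
    ]
    # Assumed settings: kept contigs decrease as the cutoff rises (convex, decreasing).
    kneedle = KneeLocator(thresholds, nb_contigs, curve="convex", direction="decreasing")
    return kneedle.knee  # may be None if no clear knee is found


# df = pd.read_csv("pydamage_results.csv")  # placeholder
# print(sketch_threshold(df))
```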
112 changes: 0 additions & 112 deletions pydamage/main.py
@@ -156,115 +156,3 @@ def pydamage_analyze(

utils.df_to_csv(df, outdir)
return df


# def pydamage_analyze_group(
# bam,
# wlen=30,
# show_al=False,
# process=1,
# outdir="",
# plot=False,
# verbose=False,
# force=False,
# ):
# """Runs the pydamage analysis with all references grouped as one

# Args:
# bam(str): Path to alignment (sam/bam/cram) file
# wlen(int): window length
# show_al(bool): print alignments representations
# process(int): Number of processes for parellel computing
# outdir(str): Path to output directory
# verbose(bool): verbose mode
# force(bool): force overwriting of results directory
# Returns:
# pd.DataFrame: pandas DataFrame containg pydamage results
# """

# if verbose:
# print(f"Pydamage version {__version__}\n")
# utils.makedir(outdir, force=force)

# if not verbose:
# warnings.filterwarnings("ignore")

# mode = utils.check_extension(bam)
# alf = pysam.AlignmentFile(bam, mode)

# if not alf.has_index():
# print(
# f"BAM file {bam} has no index. Sort BAM file and provide index "
# "before running pydamage."
# )
# sys.exit(1)

# refs = list(alf.references)

# if len(refs) == 0:
# print(f"No aligned sequences in {bam}")
# return []

# proc = min(len(refs), process)

# get_damage_group_partial = partial(
# damage.get_damage,
# bam=bam,
# wlen=wlen,
# show_al=show_al,
# mode=mode,
# process=process,
# )
# print("Estimating and testing Damage")
# with multiprocessing.Pool(proc) as p:
# res = list(tqdm(p.imap(get_damage_group_partial, refs), total=len(refs)))
# ct_data = []
# ga_data = []
# cc_data = []
# all_bases = []
# cov = 0
# nb_ref = 0
# nb_reads_aligned = 0
# reflen = 0
# for i in res:
# ct_data += i[0]
# ga_data += i[1]
# cc_data += i[2]
# all_bases += i[5]
# cov += i[6]
# nb_ref += 1
# nb_reads_aligned += i[7]
# reflen += i[8]
# cov = cov / nb_ref

# if nb_reads_aligned == 0:
# raise AlignmentFileError("No Alignments were found\nCheck your alignment file")

# damage_dict = damage.test_damage_group(
# ct_data,
# ga_data,
# cc_data,
# all_bases,
# nb_reads_aligned,
# cov,
# reflen,
# wlen,
# verbose,
# )

# if plot:
# print("\nGenerating Pydamage plot")
# plotdir = f"{outdir}/plots"
# utils.makedir(plotdir, confirm=False)
# damageplot(damage_dict, outdir=plotdir)

# df_pydamage = utils.pandas_group_processing(res_dict=damage_dict)

# acc_model = load_model()
# prep_df_glm = prepare_data(df_pydamage)
# df_glm = fit_model(prep_df_glm, acc_model)

# df = df_glm.merge(df_pydamage, left_index=True, right_index=True)

# utils.df_to_csv(df, outdir)
# return df
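The commented-out pydamage_analyze_group removed here parallelised per-reference damage estimation with a functools.partial + multiprocessing.Pool.imap + tqdm pattern. A self-contained sketch of that pattern with a dummy worker standing in for damage.get_damage; names and values are illustrative only:

```python
import multiprocessing
from functools import partial

from tqdm import tqdm


def get_damage(ref, bam, wlen):
    """Dummy stand-in for damage.get_damage: process one reference."""
    return ref, len(ref) + wlen


if __name__ == "__main__":
    refs = ["contig_1", "contig_2", "contig_3"]  # stand-in for alf.references
    # Bind the arguments shared by every reference, as the deleted code did.
    worker = partial(get_damage, bam="aligned.bam", wlen=30)
    proc = min(len(refs), 2)
    with multiprocessing.Pool(proc) as p:
        # imap yields results lazily, so tqdm can report per-reference progress.
        res = list(tqdm(p.imap(worker, refs), total=len(refs)))
    print(res)
```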
