Commit 38992e4: code cleanup

maxibor committed Jun 30, 2021
1 parent 52c8dc4
Showing 4 changed files with 9 additions and 120 deletions.
2 changes: 2 additions & 0 deletions pydamage/accuracy_model.py
@@ -35,6 +35,8 @@ def fit_model(df, model):
df (pandas DataFrame): prepared pydamage results
model (pypmml model): GLM accuracy model
"""
+print(df)
+print(model.predict(df))
prediction = list(model.predict(df)["Predicted_sig"])
df["predicted_accuracy"] = prediction
return df["predicted_accuracy"].to_frame()
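For orientation, fit_model takes the prepared PyDamage results and a pypmml GLM, copies the model's "Predicted_sig" output into a "predicted_accuracy" column, and returns that column as a one-column DataFrame. A minimal usage sketch; the PMML file and CSV paths here are placeholders, not files shipped in this commit:

```python
import pandas as pd
from pypmml import Model

from pydamage.accuracy_model import fit_model

# Placeholder paths: swap in the real GLM PMML file and prepared results.
glm = Model.load("accuracy_model.pmml")
prepared = pd.read_csv("prepared_pydamage_results.csv")

# Returns a single-column DataFrame named "predicted_accuracy".
accuracy = fit_model(prepared, glm)
print(accuracy.head())
```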
3 changes: 1 addition & 2 deletions pydamage/cli.py
@@ -3,7 +3,7 @@
import click
from pydamage.main import pydamage_analyze
from pydamage.citation import get_citation
-from pydamage.kneedle import apply_filter
+from pydamage.filter import apply_filter
from pydamage import __version__
from collections import OrderedDict

@@ -107,7 +107,6 @@ def filter(ctx, no_args_is_help=True, **kwargs):
CSV: path to PyDamage result file
"""
-print(kwargs, ctx.obj)

apply_filter(**kwargs, **ctx.obj)

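With the import now pointing at pydamage.filter, the filter subcommand simply forwards its options to apply_filter. A hedged sketch of calling that helper directly, mirroring the signature visible in filter.py below; the CSV path and output directory are placeholders:

```python
from pydamage.filter import apply_filter

# threshold=0 asks apply_filter to pick the accuracy cutoff itself via the
# kneedle method (see define_threshold below); paths are placeholders.
apply_filter(
    csv="pydamage_results/pydamage_results.csv",
    threshold=0,
    outdir="pydamage_filtered",
    alpha=0.05,
)
```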
12 changes: 6 additions & 6 deletions pydamage/kneedle.py → pydamage/filter.py
@@ -4,7 +4,7 @@
from pandas import read_csv


-def find_knee(pydam_df, min_knee=0.5, alpha=0.05):
+def define_threshold(pydam_df, min_knee=0.5, alpha=0.05):
"""Find kneedle point in PyDamage results
Finding the kneedle point to get the optimal
@@ -13,15 +13,15 @@ def find_knee(pydam_df, min_knee=0.5, alpha=0.05):
Args:
pydam_df (pandas df): pydamage results
min_knee (float, optional): Min pred_accuracy threshold. Defaults to 0.5
alpha(float, optional): Alpha q-value threshold
"""
thresholds = [i.round(2) for i in arange(min_knee, 1, 0.01)]
-nb_contigs = list()
+nb_contigs = []
for i in thresholds:
nb_contigs.append(
pydam_df.query(f"pred_accuracy >= {i} & qvalue <= {alpha}").shape[0]
pydam_df.query(f"predicted_accuracy >= {i} & qvalue <= {alpha}").shape[0]
)
kneedle = KneeLocator(
thresholds,
@@ -45,7 +45,7 @@ def filter_pydamage_results(pydam_df, acc_thresh, alpha=0.05):
alpha (float, optional): Alpha q-value threshold. Defaults to 0.05.
"""

return pydam_df.query(f"pred_accuracy >= {acc_thresh} & qvalue <= {alpha}")
return pydam_df.query(f"predicted_accuracy >= {acc_thresh} & qvalue <= {alpha}")


def apply_filter(csv, threshold, outdir, alpha=0.05):
@@ -61,11 +61,11 @@ def apply_filter(csv, threshold, outdir, alpha=0.05):
df = read_csv(csv)
outfile = "pydamage_filtered_results.csv"
if threshold == 0:
-threshold = find_knee(df)
+threshold = define_threshold(df)
print(f"Optimal prediction accuracy threshold found to be: {threshold}")
filt_df = filter_pydamage_results(df, acc_thresh=threshold)
print(
f"Filtering PyDamage results with qvalue <={alpha} and pred_accuracy >= {threshold}"
f"Filtering PyDamage results with qvalue <= {alpha} and pred_accuracy >= {threshold}"
)
df_to_csv(filt_df, outdir, outfile)
print(f"Filtered PyDamage results written to {outdir}/{outfile}")
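For readers unfamiliar with the kneedle step wrapped by define_threshold: it counts how many contigs survive each candidate accuracy cutoff and lets kneed's KneeLocator pick the elbow of that curve. A standalone sketch of the same idea; the curve and direction arguments are assumptions, since this hunk truncates the KneeLocator call:

```python
import pandas as pd
from kneed import KneeLocator
from numpy import arange


def sketch_threshold(pydam_df: pd.DataFrame, min_knee: float = 0.5, alpha: float = 0.05):
    """Illustrative re-implementation of the thresholding idea in pydamage/filter.py."""
    thresholds = [round(float(t), 2) for t in arange(min_knee, 1, 0.01)]
    # Contigs kept at each candidate predicted_accuracy cutoff.
    nb_contigs = [
        pydam_df.query(f"predicted_accuracy >= {t} & qvalue <= {alpha}").shape[0]
        for t in thresholds
    ]
    # Assumed settings: kept contigs decrease as the cutoff rises (convex, decreasing).
    kneedle = KneeLocator(thresholds, nb_contigs, curve="convex", direction="decreasing")
    return kneedle.knee  # may be None if no clear knee is found


# df = pd.read_csv("pydamage_results.csv")  # placeholder
# print(sketch_threshold(df))
```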
112 changes: 0 additions & 112 deletions pydamage/main.py
@@ -156,115 +156,3 @@ def pydamage_analyze(

utils.df_to_csv(df, outdir)
return df


# def pydamage_analyze_group(
# bam,
# wlen=30,
# show_al=False,
# process=1,
# outdir="",
# plot=False,
# verbose=False,
# force=False,
# ):
# """Runs the pydamage analysis with all references grouped as one

# Args:
# bam(str): Path to alignment (sam/bam/cram) file
# wlen(int): window length
# show_al(bool): print alignments representations
# process(int): Number of processes for parellel computing
# outdir(str): Path to output directory
# verbose(bool): verbose mode
# force(bool): force overwriting of results directory
# Returns:
# pd.DataFrame: pandas DataFrame containg pydamage results
# """

# if verbose:
# print(f"Pydamage version {__version__}\n")
# utils.makedir(outdir, force=force)

# if not verbose:
# warnings.filterwarnings("ignore")

# mode = utils.check_extension(bam)
# alf = pysam.AlignmentFile(bam, mode)

# if not alf.has_index():
# print(
# f"BAM file {bam} has no index. Sort BAM file and provide index "
# "before running pydamage."
# )
# sys.exit(1)

# refs = list(alf.references)

# if len(refs) == 0:
# print(f"No aligned sequences in {bam}")
# return []

# proc = min(len(refs), process)

# get_damage_group_partial = partial(
# damage.get_damage,
# bam=bam,
# wlen=wlen,
# show_al=show_al,
# mode=mode,
# process=process,
# )
# print("Estimating and testing Damage")
# with multiprocessing.Pool(proc) as p:
# res = list(tqdm(p.imap(get_damage_group_partial, refs), total=len(refs)))
# ct_data = []
# ga_data = []
# cc_data = []
# all_bases = []
# cov = 0
# nb_ref = 0
# nb_reads_aligned = 0
# reflen = 0
# for i in res:
# ct_data += i[0]
# ga_data += i[1]
# cc_data += i[2]
# all_bases += i[5]
# cov += i[6]
# nb_ref += 1
# nb_reads_aligned += i[7]
# reflen += i[8]
# cov = cov / nb_ref

# if nb_reads_aligned == 0:
# raise AlignmentFileError("No Alignments were found\nCheck your alignment file")

# damage_dict = damage.test_damage_group(
# ct_data,
# ga_data,
# cc_data,
# all_bases,
# nb_reads_aligned,
# cov,
# reflen,
# wlen,
# verbose,
# )

# if plot:
# print("\nGenerating Pydamage plot")
# plotdir = f"{outdir}/plots"
# utils.makedir(plotdir, confirm=False)
# damageplot(damage_dict, outdir=plotdir)

# df_pydamage = utils.pandas_group_processing(res_dict=damage_dict)

# acc_model = load_model()
# prep_df_glm = prepare_data(df_pydamage)
# df_glm = fit_model(prep_df_glm, acc_model)

# df = df_glm.merge(df_pydamage, left_index=True, right_index=True)

# utils.df_to_csv(df, outdir)
# return df
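The commented-out pydamage_analyze_group removed here parallelised per-reference damage estimation with a functools.partial + multiprocessing.Pool.imap + tqdm pattern. A self-contained sketch of that pattern with a dummy worker standing in for damage.get_damage; names and values are illustrative only:

```python
import multiprocessing
from functools import partial

from tqdm import tqdm


def get_damage(ref, bam, wlen):
    """Dummy stand-in for damage.get_damage: process one reference."""
    return ref, len(ref) + wlen


if __name__ == "__main__":
    refs = ["contig_1", "contig_2", "contig_3"]  # stand-in for alf.references
    # Bind the arguments shared by every reference, as the deleted code did.
    worker = partial(get_damage, bam="aligned.bam", wlen=30)
    proc = min(len(refs), 2)
    with multiprocessing.Pool(proc) as p:
        # imap yields results lazily, so tqdm can report per-reference progress.
        res = list(tqdm(p.imap(worker, refs), total=len(refs)))
    print(res)
```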
