Skip to content

Commit

Permalink
adding accuracy modelling
Browse files Browse the repository at this point in the history
  • Loading branch information
maxibor committed Sep 18, 2020
1 parent fda940e commit 04e9727
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 142 deletions.
21 changes: 8 additions & 13 deletions pydamage/accuracy_model.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from pypmml import Model
import pkg_resources
import pandas as pd
import numpy as np
import pickle


def load_model():
"""Returns the GLM accuracy model"""
# This is a stream-like object. If you want the actual info, call
# stream.read()
stream = pkg_resources.resource_stream(__name__, "models/accuracy_model.xml")
return Model.load(stream)
stream = pkg_resources.resource_stream(__name__, "models/glm_accuracy_model.pickle")
return pickle.load(stream)


def prepare_data(pd_df):
Expand Down Expand Up @@ -70,20 +70,15 @@ def prepare_data(pd_df):
]
simu_cov = pd.cut(pd_df["coverage"], coverage_bins)
simu_cov.cat.rename_categories(coverage_bins_labels, inplace=True)
pd_df["simu_cov"] = simu_cov
pd_df["simuCov"] = simu_cov

simu_contig_length = pd.cut(pd_df["reflen"], reflen_bins)
simu_contig_length.cat.rename_categories(reflen_bins_labels, inplace=True)

pd_df["simu_contig_length"] = simu_contig_length
pd_df["simuContigLength"] = simu_contig_length
pd_df = pd_df[
[
"simu_cov",
"simu_contig_length",
"damage_model_pmax",
"gc_content",
]
].rename(columns={"damage_model_pmax": "damage"})
["simuCov", "simuContigLength", "damage_model_pmax", "gc_content"]
].rename(columns={"damage_model_pmax": "damage", "gc_content": "GCcontent"})

return pd_df

Expand All @@ -95,4 +90,4 @@ def fit_model(df, model):
df (pandas DataFrame): prepared pydamage results
model (object): GLM accuracy model unpickled by load_model()
"""
return model.predict(df)
return model.predict(df).to_frame(name="pred_accuracy")
11 changes: 6 additions & 5 deletions pydamage/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,13 @@ def analyze(
with multiprocessing.Pool(proc) as p:
list(tqdm(p.imap(plot_partial, filt_res), total=len(filt_res)))

df = utils.pandas_processing(res_dict=filt_res, outdir=outdir)
df_pydamage = utils.pandas_processing(res_dict=filt_res)

acc_model = load_model()
print(acc_model)
prep_df = prepare_data(df)
print(prep_df)
print(fit_model(prep_df, acc_model))
prep_df_glm = prepare_data(df_pydamage)
df_glm = fit_model(prep_df_glm, acc_model)

df = df_pydamage.merge(df_glm, left_index=True, right_index=True)

utils.df_to_csv(df, outdir)
return df
119 changes: 0 additions & 119 deletions pydamage/models/accuracy_model.xml

This file was deleted.

Binary file added pydamage/models/glm_accuracy_model.pickle
Binary file not shown.
14 changes: 11 additions & 3 deletions pydamage/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,11 @@ def makedir(dirpath, confirm=True, force=False):
os.makedirs(dirpath)


def pandas_processing(res_dict, outdir):
def pandas_processing(res_dict):
"""Performs Pandas processing of Pydamage results
Args:
res_dict (dict): Result dictionary of Vuong's closeness test
outdir (str): Path to output directory
"""
df = pd.DataFrame(res_dict)
if len(res_dict) == 0:
Expand Down Expand Up @@ -101,10 +100,19 @@ def pandas_processing(res_dict, outdir):
df.set_index("reference", inplace=True)
df.dropna(axis=1, how="all", inplace=True)
df = df.round(3)
df.to_csv(f"{outdir}/pydamage_results.csv")
return df


def df_to_csv(df, outdir):
    """Write Pydamage results to disk.

    The table is saved as ``pydamage_results.csv`` inside ``outdir``.

    Args:
        df (pandas DataFrame): Pydamage results DataFrame
        outdir (str): Path to output directory
    """
    # os.path.join avoids a doubled separator when outdir already ends with
    # one, and keeps an empty outdir relative instead of resolving to
    # "/pydamage_results.csv" at the filesystem root.
    df.to_csv(os.path.join(outdir, "pydamage_results.csv"))


def sort_dict_by_keys(adict):
"""Sort dictionary by keys
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,10 @@ def get_version(rel_path):
"statsmodels",
"matplotlib",
"tqdm",
"pypmml",
"biopython",
],
packages=find_packages(include=["pydamage"]),
entry_points={"console_scripts": ["pydamage = pydamage.cli:cli"]},
include_package_data=True,
package_data={"": ["models/accuracy_model.xml"]},
package_data={"": ["models/glm_accuracy_model.pickle"]},
)

0 comments on commit 04e9727

Please sign in to comment.