From 5b9149a61a490488ec19ae99dd40c660fe23282e Mon Sep 17 00:00:00 2001
From: Ross Reedstrom
Date: Mon, 28 Oct 2019 15:47:52 -0500
Subject: [PATCH] move qid lookup to app object, so it is updated by ecosystem
 import; make question/domain fetch robust to incomplete data

---
 validator/app.py          |  4 ++++
 validator/validate_api.py | 35 ++++++++++++++++++-----------------
 validator/write_api.py    |  4 ++++
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/validator/app.py b/validator/app.py
index ab76f04..2bf4092 100755
--- a/validator/app.py
+++ b/validator/app.py
@@ -42,6 +42,10 @@ def create_app(**kwargs):
 
     app.df = df
 
+    app.qids = {}
+    for idcol in ("uid", "qid"):
+        app.qids[idcol] = set(df["questions"][idcol].values.tolist())
+
     app.register_blueprint(read_api.bp)
     app.register_blueprint(write_api.bp)
     app.register_blueprint(validate_api.bp)
diff --git a/validator/validate_api.py b/validator/validate_api.py
index 4a3ffb3..e366398 100755
--- a/validator/validate_api.py
+++ b/validator/validate_api.py
@@ -26,7 +26,6 @@
 
 VALIDITY_FEATURE_DICT = {}
 PARSER_DEFAULTS = {}
-qids = {}
 parser = None
 common_vocab = set()
 
@@ -35,7 +34,7 @@
 
 @bp.record_once
 def setup_parse_and_data(setup_state):
-    global VALIDITY_FEATURE_DICT, PARSER_DEFAULTS, qids, parser, common_vocab
+    global VALIDITY_FEATURE_DICT, PARSER_DEFAULTS, parser, common_vocab
 
     PARSER_DEFAULTS = setup_state.app.config["PARSER_DEFAULTS"]
     SPELLING_CORRECTION_DEFAULTS = setup_state.app.config[
@@ -43,11 +42,6 @@ def setup_parse_and_data(setup_state):
     ]
     VALIDITY_FEATURE_DICT = setup_state.app.config["VALIDITY_FEATURE_DICT"]
 
-    df = setup_state.app.df
-    qids = {}
-    for idcol in ("uid", "qid"):
-        qids[idcol] = set(df["questions"][idcol].values.tolist())
-
     # Create the parser, initially assign default values
     # (these can be overwritten during calls to process_string)
    parser = StaxStringProc(
@@ -72,21 +66,28 @@ def setup_parse_and_data(setup_state):
 
 def get_question_data_by_key(key, val):
     df = current_app.df
-    first_q = df["questions"][df["questions"][key] == val].iloc[0]
-    module_id = first_q.cvuid
-    uid = first_q.uid
-    has_numeric = df["questions"][df["questions"][key] == val].iloc[0].contains_number
+    # FIXME - should use all the questions and combine associated pages
+    # FIXME - last_q works better because of some dirty data getting through
+    # that has innovation pages but not the exact book those pages are from
+    last_q = df["questions"][df["questions"][key] == val].iloc[-1]
+    module_id = last_q.cvuid
+    uid = last_q.uid
+    has_numeric = last_q.contains_number
     innovation_vocab = (
         df["innovation"][df["innovation"]["cvuid"] == module_id]
         .iloc[0]
         .innovation_words
     )
     vuid = module_id.split(":")[0]
-    domain_vocab = df["domain"][df["domain"]["vuid"] == vuid].iloc[0].domain_words
+    domain_vocab_df = df["domain"][df["domain"]["vuid"] == vuid]
+    if domain_vocab_df.empty:
+        domain_vocab = set()
+    else:
+        domain_vocab = domain_vocab_df.iloc[-1].domain_words
 
     # A better way . . . pre-process and then just to a lookup
-    question_vocab = first_q["stem_words"]
-    mc_vocab = first_q["mc_words"]
+    question_vocab = last_q["stem_words"]
+    mc_vocab = last_q["mc_words"]
     vocab_dict = OrderedDict(
         {
             "stem_word_count": question_vocab,
@@ -105,9 +106,9 @@ def get_question_data_by_key(key, val):
 def get_question_data(uid):
     if uid is not None:
         qid = uid.split("@")[0]
-        if uid in qids["uid"]:
+        if uid in current_app.qids["uid"]:
             return get_question_data_by_key("uid", uid)
-        elif qid in qids["qid"]:
+        elif qid in current_app.qids["qid"]:
             return get_question_data_by_key("qid", qid)
     # no uid, or not in data sets
     default_vocab_dict = OrderedDict(
@@ -254,7 +255,7 @@ def validate_response(
     return_dictionary["tag_numeric_input"] = tag_numeric_input
     return_dictionary["spelling_correction"] = spelling_correction
     return_dictionary["uid_used"] = uid_used
-    return_dictionary["uid_found"] = uid_used in qids["uid"]
+    return_dictionary["uid_found"] = uid_used in current_app.qids["uid"]
     return_dictionary["lazy_math_evaluation"] = lazy_math_mode
 
     # If lazy_math_mode, do a lazy math check and update valid accordingly
diff --git a/validator/write_api.py b/validator/write_api.py
index f7e908a..797c942 100755
--- a/validator/write_api.py
+++ b/validator/write_api.py
@@ -52,6 +52,10 @@ def update_fixed_data(df_domain_, df_innovation_, df_questions_):
     df["innovation"] = df["innovation"].append(df_innovation_, sort=False)
     df["questions"] = df["questions"].append(df_questions_, sort=False)
 
+    # Update qid sets - for shortcutting question lookup
+    for idcol in ("uid", "qid"):
+        current_app.qids[idcol] = set(df["questions"][idcol].values.tolist())
+
     # Finally, write the updated dataframes to disk and declare victory
     data_dir = current_app.config["DATA_DIR"]
     write_fixed_data(df["domain"], df["innovation"], df["questions"], data_dir)
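
Note (not part of the patch): the domain-vocabulary change above replaces a bare
.iloc[0] with an empty-frame check, so a vuid with no matching domain row yields an
empty set instead of raising IndexError. The following is a minimal standalone sketch
of that fallback pattern, assuming only pandas; the sample DataFrame contents and the
helper name lookup_domain_words are invented for illustration.

# Standalone sketch of the "last matching row, else empty set" fallback.
# Sample data and helper name are invented; only pandas is assumed.
import pandas as pd

domain = pd.DataFrame(
    {
        "vuid": ["col12345@1.2", "col12345@1.3"],
        "domain_words": [{"cell", "enzyme"}, {"cell", "enzyme", "mitosis"}],
    }
)


def lookup_domain_words(domain_df, vuid):
    """Return domain_words from the newest matching row, or an empty set."""
    matches = domain_df[domain_df["vuid"] == vuid]
    if matches.empty:
        return set()
    return matches.iloc[-1].domain_words


print(lookup_domain_words(domain, "col12345@1.3"))  # {'cell', 'enzyme', 'mitosis'}
print(lookup_domain_words(domain, "col99999@1.1"))  # set(), rather than an IndexError from .iloc[0]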