diff --git a/tests/test_read_api.py b/tests/test_read_api.py
index 536a706..2c8ecbc 100644
--- a/tests/test_read_api.py
+++ b/tests/test_read_api.py
@@ -386,4 +386,4 @@ def test_dataset_feature_weights(client):
 def test_dataset_default_feature_weights(client):
     resp = client.get("/datasets/feature_weights/default")
     assert resp.status_code == 200
-    assert resp.json == client.application.df["feature_weights"]["default_id"]
+    assert resp.json == client.application.datasets["feature_weights"]["default_id"]
diff --git a/tests/test_train.py b/tests/test_train.py
index f7b53df..8933fe9 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -46,9 +46,9 @@ def myapp():
 def data(myapp):
     np.random.seed(1000)
     from validator.validate_api import bad_vocab, common_vocab, get_question_data
-    df = myapp.df
+    datasets = myapp.datasets
     with myapp.app_context():
-        question_data = df["questions"][df["questions"]["uid"] == "9@7"].iloc[0]
+        question_data = datasets["questions"][datasets["questions"]["uid"] == "9@7"].iloc[0]
         stem_vocab = question_data["stem_words"]
         mc_vocab = question_data["mc_words"]
         vocab_set = get_question_data(question_data.uid)[0]
diff --git a/validator/app.py b/validator/app.py
index 20f6417..10bc676 100755
--- a/validator/app.py
+++ b/validator/app.py
@@ -44,16 +44,16 @@ def create_app(**kwargs):
     )
     feature_weights["default_id"] = feature_weights_key
 
-    df = {}
-    df["innovation"] = df_innovation_
-    df["domain"] = df_domain_
-    df["questions"] = df_questions_
-    df["feature_weights"] = feature_weights
-    app.df = df
+    datasets = {}
+    datasets["innovation"] = df_innovation_
+    datasets["domain"] = df_domain_
+    datasets["questions"] = df_questions_
+    datasets["feature_weights"] = feature_weights
+    app.datasets = datasets
 
     app.qids = {}
     for idcol in ("uid", "qid"):
-        app.qids[idcol] = set(df["questions"][idcol].values.tolist())
+        app.qids[idcol] = set(datasets["questions"][idcol].values.tolist())
 
     app.register_blueprint(read_api.bp)
     app.register_blueprint(write_api.bp)
diff --git a/validator/read_api.py b/validator/read_api.py
index 9899785..3220848 100755
--- a/validator/read_api.py
+++ b/validator/read_api.py
@@ -44,7 +44,7 @@ def datasets_index():
 
 
 def _books_json(include_vocabs=True):
-    data = current_app.df["domain"][["book_name", "vuid"]].rename(
+    data = current_app.datasets["domain"][["book_name", "vuid"]].rename(
         {"book_name": "name"}, axis=1
     )
     if include_vocabs:
@@ -84,7 +84,7 @@ def books_index():
 
 @bp.route("/datasets/books/<vuid>")
 def fetch_book(vuid):
-    df = current_app.df
+    df = current_app.datasets
     data = df["domain"][df["domain"]["vuid"] == vuid][["book_name", "vuid"]].rename(
         {"book_name": "name"}, axis=1
     )
@@ -105,7 +105,7 @@ def fetch_book(vuid):
 
 @bp.route("/datasets/books/<vuid>/pages")
 def fetch_page_list(vuid):
-    df = current_app.df
+    df = current_app.datasets
     book = df["innovation"][df["innovation"]["cvuid"].str.startswith(vuid)]
     if book.empty:
         _validate_vuid(vuid)
@@ -117,7 +117,7 @@ def fetch_page_list(vuid):
 
 @bp.route("/datasets/books/<vuid>/pages/<pvuid>")
 def fetch_page(vuid, pvuid):
-    df = current_app.df
+    df = current_app.datasets
     innovation = df["innovation"][df["innovation"]["cvuid"] == ":".join((vuid, pvuid))][
         "innovation_words"
     ]
@@ -151,7 +151,7 @@ def fetch_vocabs(vuid):
 
 @bp.route("/datasets/books/<vuid>/vocabularies/domain")
 def fetch_domain(vuid):
-    df = current_app.df
+    df = current_app.datasets
     data = df["domain"][df["domain"]["vuid"] == vuid]["domain_words"]
     if data.empty:
         _validate_vuid(vuid)
@@ -162,7 +162,7 @@ def fetch_domain(vuid):
@bp.route("/datasets/books//vocabularies/innovation") def fetch_innovation(vuid): - df = current_app.df + df = current_app.datasets data = df["innovation"][df["innovation"]["cvuid"].str.startswith(vuid)][ ["cvuid", "innovation_words"] ] @@ -177,7 +177,7 @@ def fetch_innovation(vuid): @bp.route("/datasets/books//vocabularies/innovation/") def fetch_page_innovation(vuid, pvuid): - df = current_app.df + df = current_app.datasets data = df["innovation"][df["innovation"]["cvuid"] == ":".join((vuid, pvuid))][ "innovation_words" ] @@ -191,7 +191,7 @@ def fetch_page_innovation(vuid, pvuid): @bp.route("/datasets/books//vocabularies/questions") def fetch_questions(vuid): - df = current_app.df + df = current_app.datasets data = df["questions"][df["questions"]["cvuid"].str.startswith(vuid)].rename( {"uid": "exercise_uid", "mc_words": "option_words"}, axis=1 ) @@ -217,7 +217,7 @@ def fetch_questions(vuid): @bp.route("/datasets/books//vocabularies/questions/") def fetch_page_questions(vuid, pvuid): - df = current_app.df + df = current_app.datasets data = df["questions"][df["questions"]["cvuid"] == ":".join((vuid, pvuid))].rename( {"uid": "exercise_uid", "mc_words": "option_words"}, axis=1 ) @@ -241,12 +241,12 @@ def fetch_page_questions(vuid, pvuid): @bp.route("/datasets/questions") def questions_index(): - return jsonify(current_app.df["questions"].uid.tolist()) + return jsonify(current_app.datasets["questions"].uid.tolist()) @bp.route("/datasets/questions/") def fetch_question(uid): - df = current_app.df + df = current_app.datasets data = df["questions"][df["questions"]["uid"] == uid].rename( {"uid": "exercise_uid", "mc_words": "option_words"}, axis=1 ) @@ -259,7 +259,7 @@ def fetch_question(uid): @bp.route("/datasets/feature_weights") def feature_weights_index(): - fw_ids = list(current_app.df["feature_weights"].keys()) + fw_ids = list(current_app.datasets["feature_weights"].keys()) fw_ids.remove("default_id") return jsonify(fw_ids) @@ -267,7 +267,7 @@ def feature_weights_index(): @bp.route("/datasets/feature_weights/") def fetch_feature_weights(fw_id): _validate_uuid(fw_id) - df = current_app.df + df = current_app.datasets try: data = df["feature_weights"][fw_id] except KeyError: @@ -278,7 +278,7 @@ def fetch_feature_weights(fw_id): @bp.route("/datasets/feature_weights/default") def fetch_default_feature_weights(): - return jsonify(current_app.df["feature_weights"]["default_id"]) + return jsonify(current_app.datasets["feature_weights"]["default_id"]) @bp.route("/ping") @@ -290,9 +290,8 @@ def ping(): def status(): global start_time data = {"version": _version.get_versions(), "started": start_time} - - if "vuid" in current_app.df["domain"].columns: - fw_ids = list(current_app.df["feature_weights"].keys()) + if "vuid" in current_app.datasets["domain"].columns: + fw_ids = list(current_app.datasets["feature_weights"].keys()) fw_ids.remove("default_id") data["datasets"] = { "books": _books_json(include_vocabs=False), @@ -304,13 +303,13 @@ def status(): @bp.route("/status/defaults/feature_weights_id") def fetch_default_feature_weights_id(): - return jsonify(current_app.df["feature_weights"]["default_id"]) + return jsonify(current_app.datasets["feature_weights"]["default_id"]) @bp.route("/status/defaults") def fetch_default_feature_weights_set(): - default_id = current_app.df["feature_weights"]["default_id"] - return jsonify(current_app.df["feature_weights"][default_id]) + default_id = current_app.datasets["feature_weights"]["default_id"] + return 
 
 
 @bp.route("/version")
diff --git a/validator/training_api.py b/validator/training_api.py
index c57b4df..b0da023 100755
--- a/validator/training_api.py
+++ b/validator/training_api.py
@@ -64,7 +64,7 @@ def validation_train():
 
     # Temp install of weights
    temp_fw_id = f"training-{uuid4()}"
-    current_app.df["feature_weights"][temp_fw_id] = train_feature_dict
+    current_app.datasets["feature_weights"][temp_fw_id] = train_feature_dict
 
     output_df = response_df.apply(
         lambda x: validate_response(
@@ -76,7 +76,7 @@ def validation_train():
     output_df["valid_label"] = response_df["valid_label"]
 
     # remove temp weights
-    current_app.df["feature_weights"].pop(temp_fw_id)
+    current_app.datasets["feature_weights"].pop(temp_fw_id)
 
     # Do an N-fold cross validation if cv > 1.
     # Then get coefficients/intercept for the entire dataset
diff --git a/validator/validate_api.py b/validator/validate_api.py
index a69b4e6..ab0324b 100755
--- a/validator/validate_api.py
+++ b/validator/validate_api.py
@@ -70,21 +70,21 @@ def setup_parse_and_data(setup_state):
 
 
 def get_question_data_by_key(key, val):
-    df = current_app.df
+    datasets = current_app.datasets
     # FIXME - should use all the questions and combine associated pages
     # FIXME - last_q works better because of some dirty data getting through
     # that has innovation pages but not the exact book those pages are from
-    last_q = df["questions"][df["questions"][key] == val].iloc[-1]
+    last_q = datasets["questions"][datasets["questions"][key] == val].iloc[-1]
     module_id = last_q.cvuid
     uid = last_q.uid
     has_numeric = bool(last_q.contains_number)
     innovation_vocab = (
-        df["innovation"][df["innovation"]["cvuid"] == module_id]
+        datasets["innovation"][datasets["innovation"]["cvuid"] == module_id]
         .iloc[0]
         .innovation_words
     )
     vuid = module_id.split(":")[0]
-    domain_vocab_df = df["domain"][df["domain"]["vuid"] == vuid]
+    domain_vocab_df = datasets["domain"][datasets["domain"]["vuid"] == vuid]
     if domain_vocab_df.empty:
         domain_vocab = set()
     else:
@@ -153,7 +153,7 @@ def parse_and_classify(
     )
 
     # Fetch feature weights by ID
-    feature_weight_dict = current_app.df["feature_weights"][feature_weights_id]
+    feature_weight_dict = current_app.datasets["feature_weights"][feature_weights_id]
 
     # Initialize all feature counts to 0
     # ORDER OF KEYS in feature_weight_dict is preserved, and matters!
@@ -215,7 +215,7 @@
     if lazy_math_mode is None:
         lazy_math_mode = PARSER_DEFAULTS["lazy_math_mode"]
     if feature_weights_id is None:
-        feature_weights_id = current_app.df["feature_weights"]["default_id"]
+        feature_weights_id = current_app.datasets["feature_weights"]["default_id"]
 
     # Try to get questions-specific vocab via uid (if not found, vocab will be empty)
     # domain_vocab, innovation_vocab, has_numeric, uid_used, question_vocab,
@@ -305,10 +305,10 @@ def validation_api_entry():
     response = args.get("response", None)
     uid = args.get("uid", None)
     feature_weights_set_id = args.get(
-        "feature_weights_set_id", current_app.df["feature_weights"]["default_id"]
+        "feature_weights_set_id", current_app.datasets["feature_weights"]["default_id"]
     )
 
-    if feature_weights_set_id not in current_app.df["feature_weights"]:
+    if feature_weights_set_id not in current_app.datasets["feature_weights"]:
         raise InvalidUsage("feature_weights_set_id not found", status_code=404)
 
     parser_params = {
@@ -321,7 +321,7 @@ def validation_api_entry():
         response, uid, feature_weights_id=feature_weights_set_id, **parser_params
     )
 
-    return_dictionary["feature_weights"] = current_app.df["feature_weights"][
+    return_dictionary["feature_weights"] = current_app.datasets["feature_weights"][
         feature_weights_set_id
     ]
 
diff --git a/validator/write_api.py b/validator/write_api.py
index 0166fd8..ebd309f 100755
--- a/validator/write_api.py
+++ b/validator/write_api.py
@@ -32,55 +32,55 @@ def update_fixed_data(df_domain_, df_innovation_, df_questions_):
     # AEW: I feel like I am sinning against nature here . . .
     # Do we need to store these in a Redis cache or db???
     # This was all well and good before we ever tried to modify things
-    df = current_app.df
+    datasets = current_app.datasets
 
     # Remove any entries from the domain, innovation, and question dataframes
     # that are duplicated by the new data
     book_id = df_domain_.iloc[0]["vuid"]
-    if "vuid" in df["domain"].columns:
-        df["domain"] = df["domain"][df["domain"]["vuid"] != book_id]
-    if "cvuid" in df["domain"].columns:
-        df["innovation"] = df["innovation"][
-            ~(df["innovation"]["cvuid"].star.startswith(book_id))
+    if "vuid" in datasets["domain"].columns:
+        datasets["domain"] = datasets["domain"][datasets["domain"]["vuid"] != book_id]
+    if "cvuid" in datasets["domain"].columns:
+        datasets["innovation"] = datasets["innovation"][
+            ~(datasets["innovation"]["cvuid"].star.startswith(book_id))
         ]
 
     uids = df_questions_["uid"].unique()
-    if "uid" in df["questions"].columns:
-        df["questions"] = df["questions"][
+    if "uid" in datasets["questions"].columns:
+        datasets["questions"] = datasets["questions"][
             ~(
-                df["questions"]["uid"].isin(uids)
-                & df["questions"]["cvuid"].str.startswith(book_id)
+                datasets["questions"]["uid"].isin(uids)
+                & datasets["questions"]["cvuid"].str.startswith(book_id)
             )
         ]
 
     # Now append the new dataframes to the in-memory ones
-    df["domain"] = df["domain"].append(df_domain_, sort=False)
-    df["innovation"] = df["innovation"].append(df_innovation_, sort=False)
-    df["questions"] = df["questions"].append(df_questions_, sort=False)
+    datasets["domain"] = datasets["domain"].append(df_domain_, sort=False)
+    datasets["innovation"] = datasets["innovation"].append(df_innovation_, sort=False)
+    datasets["questions"] = datasets["questions"].append(df_questions_, sort=False)
 
     # Update qid sets - for shortcutting question lookup
     for idcol in ("uid", "qid"):
-        current_app.qids[idcol] = set(df["questions"][idcol].values.tolist())
+        current_app.qids[idcol] = set(datasets["questions"][idcol].values.tolist())
set(datasets["questions"][idcol].values.tolist()) # Finally, write the updated dataframes to disk and declare victory data_dir = current_app.config["DATA_DIR"] - write_fixed_data(df["domain"], df["innovation"], df["questions"], data_dir) + write_fixed_data(datasets["domain"], datasets["innovation"], datasets["questions"], data_dir) def store_feature_weights(new_feature_weights): # Allows removing duplicate sets in feature weights # Sees if the incoming set matches with fw set - df = current_app.df - for fw_id, existing_feature_weights in df["feature_weights"].items(): + datasets = current_app.datasets + for fw_id, existing_feature_weights in datasets["feature_weights"].items(): if existing_feature_weights == new_feature_weights: result_id = fw_id break else: result_id = uuid.uuid4() - df["feature_weights"][str(result_id)] = new_feature_weights + datasets["feature_weights"][str(result_id)] = new_feature_weights data_dir = current_app.config["DATA_DIR"] - write_feature_weights(df["feature_weights"], data_dir) + write_feature_weights(datasets["feature_weights"], data_dir) return result_id @@ -89,15 +89,15 @@ def write_default_feature_weights_id(new_default_id): # Allows removing duplicate sets in feature weights # Sees if the incoming set matches with fw set - df = current_app.df + datasets = current_app.datasets - if new_default_id == df["feature_weights"]["default_id"]: + if new_default_id == datasets["feature_weights"]["default_id"]: return new_default_id else: - df["feature_weights"]["default_id"] = new_default_id + datasets["feature_weights"]["default_id"] = new_default_id data_dir = current_app.config["DATA_DIR"] - write_feature_weights(df["feature_weights"], data_dir) + write_feature_weights(datasets["feature_weights"], data_dir) return new_default_id @@ -185,14 +185,14 @@ def new_feature_weights_set(): @bp.route("/datasets/feature_weights/default", methods=["PUT"]) @cross_origin(supports_credentials=True) def set_default_feature_weights_id(): - df = current_app.df + datasets = current_app.datasets if not request.is_json: raise InvalidUsage( "Unable to load new default id as json file.", status_code=404 ) else: new_default_id = request.json - if new_default_id not in df["feature_weights"].keys(): + if new_default_id not in datasets["feature_weights"].keys(): raise InvalidUsage("Feature weight id not found.", status_code=400) default_id = write_default_feature_weights_id(new_default_id) return jsonify(