Merge pull request #30 from openstax/fw_per_book
7.29 Renaming df to datasets, ready for review.
reedstrm committed Jul 29, 2020
2 parents a4e7451 + 75aae11 commit 4bf73c7
Showing 7 changed files with 65 additions and 66 deletions.
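The rename replaces the single app.df attribute with app.datasets, a plain dict holding the loaded dataframes plus the feature-weight sets, and the request handlers now reach it through current_app.datasets. A minimal sketch of the resulting container, with empty placeholder frames standing in for the real loaded data (the columns listed are only the ones the handlers in this diff touch, and the default feature-weights id is a placeholder):

# Minimal sketch of the renamed container; placeholder data, not the real loaders.
import pandas as pd
from flask import Flask

app = Flask(__name__)

datasets = {}
datasets["innovation"] = pd.DataFrame(columns=["cvuid", "innovation_words"])
datasets["domain"] = pd.DataFrame(columns=["vuid", "book_name", "domain_words"])
datasets["questions"] = pd.DataFrame(
    columns=["uid", "qid", "cvuid", "stem_words", "mc_words", "contains_number"]
)
datasets["feature_weights"] = {"default_id": "00000000-0000-0000-0000-000000000000"}
app.datasets = datasets  # formerly app.df

# Handlers then read it via current_app, e.g. current_app.datasets["questions"]
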
2 changes: 1 addition & 1 deletion tests/test_read_api.py
@@ -386,4 +386,4 @@ def test_dataset_feature_weights(client):
def test_dataset_default_feature_weights(client):
resp = client.get("/datasets/feature_weights/default")
assert resp.status_code == 200
assert resp.json == client.application.df["feature_weights"]["default_id"]
assert resp.json == client.application.datasets["feature_weights"]["default_id"]
4 changes: 2 additions & 2 deletions tests/test_train.py
@@ -46,9 +46,9 @@ def myapp():
def data(myapp):
np.random.seed(1000)
from validator.validate_api import bad_vocab, common_vocab, get_question_data
df = myapp.df
datasets = myapp.datasets
with myapp.app_context():
question_data = df["questions"][df["questions"]["uid"] == "9@7"].iloc[0]
question_data = datasets["questions"][datasets["questions"]["uid"] == "9@7"].iloc[0]
stem_vocab = question_data["stem_words"]
mc_vocab = question_data["mc_words"]
vocab_set = get_question_data(question_data.uid)[0]
14 changes: 7 additions & 7 deletions validator/app.py
@@ -44,16 +44,16 @@ def create_app(**kwargs):
)
feature_weights["default_id"] = feature_weights_key

df = {}
df["innovation"] = df_innovation_
df["domain"] = df_domain_
df["questions"] = df_questions_
df["feature_weights"] = feature_weights
app.df = df
datasets = {}
datasets["innovation"] = df_innovation_
datasets["domain"] = df_domain_
datasets["questions"] = df_questions_
datasets["feature_weights"] = feature_weights
app.datasets = datasets

app.qids = {}
for idcol in ("uid", "qid"):
app.qids[idcol] = set(df["questions"][idcol].values.tolist())
app.qids[idcol] = set(datasets["questions"][idcol].values.tolist())

app.register_blueprint(read_api.bp)
app.register_blueprint(write_api.bp)
39 changes: 19 additions & 20 deletions validator/read_api.py
@@ -44,7 +44,7 @@ def datasets_index():


def _books_json(include_vocabs=True):
data = current_app.df["domain"][["book_name", "vuid"]].rename(
data = current_app.datasets["domain"][["book_name", "vuid"]].rename(
{"book_name": "name"}, axis=1
)
if include_vocabs:
@@ -84,7 +84,7 @@ def books_index():

@bp.route("/datasets/books/<vuid>")
def fetch_book(vuid):
df = current_app.df
df = current_app.datasets
data = df["domain"][df["domain"]["vuid"] == vuid][["book_name", "vuid"]].rename(
{"book_name": "name"}, axis=1
)
@@ -105,7 +105,7 @@ def fetch_book(vuid):

@bp.route("/datasets/books/<vuid>/pages")
def fetch_page_list(vuid):
df = current_app.df
df = current_app.datasets
book = df["innovation"][df["innovation"]["cvuid"].str.startswith(vuid)]
if book.empty:
_validate_vuid(vuid)
@@ -117,7 +117,7 @@ def fetch_page_list(vuid):

@bp.route("/datasets/books/<vuid>/pages/<pvuid>")
def fetch_page(vuid, pvuid):
df = current_app.df
df = current_app.datasets
innovation = df["innovation"][df["innovation"]["cvuid"] == ":".join((vuid, pvuid))][
"innovation_words"
]
@@ -151,7 +151,7 @@ def fetch_vocabs(vuid):

@bp.route("/datasets/books/<vuid>/vocabularies/domain")
def fetch_domain(vuid):
df = current_app.df
df = current_app.datasets
data = df["domain"][df["domain"]["vuid"] == vuid]["domain_words"]
if data.empty:
_validate_vuid(vuid)
@@ -162,7 +162,7 @@ def fetch_domain(vuid):

@bp.route("/datasets/books/<vuid>/vocabularies/innovation")
def fetch_innovation(vuid):
df = current_app.df
df = current_app.datasets
data = df["innovation"][df["innovation"]["cvuid"].str.startswith(vuid)][
["cvuid", "innovation_words"]
]
@@ -177,7 +177,7 @@ def fetch_innovation(vuid):

@bp.route("/datasets/books/<vuid>/vocabularies/innovation/<pvuid>")
def fetch_page_innovation(vuid, pvuid):
df = current_app.df
df = current_app.datasets
data = df["innovation"][df["innovation"]["cvuid"] == ":".join((vuid, pvuid))][
"innovation_words"
]
@@ -191,7 +191,7 @@ def fetch_page_innovation(vuid, pvuid):

@bp.route("/datasets/books/<vuid>/vocabularies/questions")
def fetch_questions(vuid):
df = current_app.df
df = current_app.datasets
data = df["questions"][df["questions"]["cvuid"].str.startswith(vuid)].rename(
{"uid": "exercise_uid", "mc_words": "option_words"}, axis=1
)
@@ -217,7 +217,7 @@ def fetch_questions(vuid):

@bp.route("/datasets/books/<vuid>/vocabularies/questions/<pvuid>")
def fetch_page_questions(vuid, pvuid):
df = current_app.df
df = current_app.datasets
data = df["questions"][df["questions"]["cvuid"] == ":".join((vuid, pvuid))].rename(
{"uid": "exercise_uid", "mc_words": "option_words"}, axis=1
)
@@ -241,12 +241,12 @@ def fetch_page_questions(vuid, pvuid):

@bp.route("/datasets/questions")
def questions_index():
return jsonify(current_app.df["questions"].uid.tolist())
return jsonify(current_app.datasets["questions"].uid.tolist())


@bp.route("/datasets/questions/<uid>")
def fetch_question(uid):
df = current_app.df
df = current_app.datasets
data = df["questions"][df["questions"]["uid"] == uid].rename(
{"uid": "exercise_uid", "mc_words": "option_words"}, axis=1
)
@@ -259,15 +259,15 @@ def fetch_question(uid):

@bp.route("/datasets/feature_weights")
def feature_weights_index():
fw_ids = list(current_app.df["feature_weights"].keys())
fw_ids = list(current_app.datasets["feature_weights"].keys())
fw_ids.remove("default_id")
return jsonify(fw_ids)


@bp.route("/datasets/feature_weights/<fw_id>")
def fetch_feature_weights(fw_id):
_validate_uuid(fw_id)
df = current_app.df
df = current_app.datasets
try:
data = df["feature_weights"][fw_id]
except KeyError:
@@ -278,7 +278,7 @@ def fetch_feature_weights(fw_id):

@bp.route("/datasets/feature_weights/default")
def fetch_default_feature_weights():
return jsonify(current_app.df["feature_weights"]["default_id"])
return jsonify(current_app.datasets["feature_weights"]["default_id"])


@bp.route("/ping")
@@ -290,9 +290,8 @@ def ping():
def status():
global start_time
data = {"version": _version.get_versions(), "started": start_time}

if "vuid" in current_app.df["domain"].columns:
fw_ids = list(current_app.df["feature_weights"].keys())
if "vuid" in current_app.datasets["domain"].columns:
fw_ids = list(current_app.datasets["feature_weights"].keys())
fw_ids.remove("default_id")
data["datasets"] = {
"books": _books_json(include_vocabs=False),
@@ -304,13 +304,13 @@ def status():

@bp.route("/status/defaults/feature_weights_id")
def fetch_default_feature_weights_id():
return jsonify(current_app.df["feature_weights"]["default_id"])
return jsonify(current_app.datasets["feature_weights"]["default_id"])


@bp.route("/status/defaults")
def fetch_default_feature_weights_set():
default_id = current_app.df["feature_weights"]["default_id"]
return jsonify(current_app.df["feature_weights"][default_id])
default_id = current_app.datasets["feature_weights"]["default_id"]
return jsonify(current_app.datasets["feature_weights"][default_id])


@bp.route("/version")
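None of the read routes above change shape; only the application attribute they read from is renamed. As a rough usage sketch, assuming create_app() can be called with whatever data-directory configuration the project normally uses (not shown in this diff):

# Sketch only: create_app() may need the project's data-dir configuration.
from validator.app import create_app

app = create_app()  # assumption: defaults are enough to load the bundled datasets
client = app.test_client()

resp = client.get("/datasets/feature_weights/default")
assert resp.status_code == 200
default_id = resp.json  # the id stored at app.datasets["feature_weights"]["default_id"]

resp = client.get(f"/datasets/feature_weights/{default_id}")
print(resp.json)  # the default feature-weight set itself
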
4 changes: 2 additions & 2 deletions validator/training_api.py
@@ -64,7 +64,7 @@ def validation_train():
# Temp install of weights

temp_fw_id = f"training-{uuid4()}"
current_app.df["feature_weights"][temp_fw_id] = train_feature_dict
current_app.datasets["feature_weights"][temp_fw_id] = train_feature_dict

output_df = response_df.apply(
lambda x: validate_response(
@@ -76,7 +76,7 @@ def validation_train():
output_df["valid_label"] = response_df["valid_label"]

# remove temp weights
current_app.df["feature_weights"].pop(temp_fw_id)
current_app.datasets["feature_weights"].pop(temp_fw_id)

# Do an N-fold cross validation if cv > 1.
# Then get coefficients/intercept for the entire dataset
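The training path installs the candidate weights under a throwaway id, scores the responses against that id, and then pops the entry again. A stripped-down sketch of that lifecycle, with a try/finally added so the temporary entry is removed even if scoring raises, and a generic score_all callback standing in for the response_df.apply(validate_response, ...) call:

from uuid import uuid4

def with_temporary_weights(datasets, train_feature_dict, score_all):
    # Install the candidate weights under a unique temporary id.
    temp_fw_id = f"training-{uuid4()}"
    datasets["feature_weights"][temp_fw_id] = train_feature_dict
    try:
        # Score everything against the temporary set.
        return score_all(temp_fw_id)
    finally:
        # Remove the temporary weights no matter what.
        datasets["feature_weights"].pop(temp_fw_id)
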
18 changes: 9 additions & 9 deletions validator/validate_api.py
@@ -70,21 +70,21 @@ def setup_parse_and_data(setup_state):


def get_question_data_by_key(key, val):
df = current_app.df
datasets = current_app.datasets
# FIXME - should use all the questions and combine associated pages
# FIXME - last_q works better because of some dirty data getting through
# that has innovation pages but not the exact book those pages are from
last_q = df["questions"][df["questions"][key] == val].iloc[-1]
last_q = datasets["questions"][datasets["questions"][key] == val].iloc[-1]
module_id = last_q.cvuid
uid = last_q.uid
has_numeric = bool(last_q.contains_number)
innovation_vocab = (
df["innovation"][df["innovation"]["cvuid"] == module_id]
datasets["innovation"][datasets["innovation"]["cvuid"] == module_id]
.iloc[0]
.innovation_words
)
vuid = module_id.split(":")[0]
domain_vocab_df = df["domain"][df["domain"]["vuid"] == vuid]
domain_vocab_df = datasets["domain"][datasets["domain"]["vuid"] == vuid]
if domain_vocab_df.empty:
domain_vocab = set()
else:
@@ -153,7 +153,7 @@ def parse_and_classify(
)

# Fetch feature weights by ID
feature_weight_dict = current_app.df["feature_weights"][feature_weights_id]
feature_weight_dict = current_app.datasets["feature_weights"][feature_weights_id]

# Initialize all feature counts to 0
# ORDER OF KEYS in feature_weight_dict is preserved, and matters!
@@ -215,7 +215,7 @@ def validate_response(
if lazy_math_mode is None:
lazy_math_mode = PARSER_DEFAULTS["lazy_math_mode"]
if feature_weights_id is None:
feature_weights_id = current_app.df["feature_weights"]["default_id"]
feature_weights_id = current_app.datasets["feature_weights"]["default_id"]

# Try to get questions-specific vocab via uid (if not found, vocab will be empty)
# domain_vocab, innovation_vocab, has_numeric, uid_used, question_vocab,
@@ -305,10 +305,10 @@ def validation_api_entry():
response = args.get("response", None)
uid = args.get("uid", None)
feature_weights_set_id = args.get(
"feature_weights_set_id", current_app.df["feature_weights"]["default_id"]
"feature_weights_set_id", current_app.datasets["feature_weights"]["default_id"]
)

if feature_weights_set_id not in current_app.df["feature_weights"]:
if feature_weights_set_id not in current_app.datasets["feature_weights"]:
raise InvalidUsage("feature_weights_set_id not found", status_code=404)

parser_params = {
@@ -321,7 +321,7 @@ def validation_api_entry():
response, uid, feature_weights_id=feature_weights_set_id, **parser_params
)

return_dictionary["feature_weights"] = current_app.df["feature_weights"][
return_dictionary["feature_weights"] = current_app.datasets["feature_weights"][
feature_weights_set_id
]

50 changes: 25 additions & 25 deletions validator/write_api.py
@@ -32,55 +32,55 @@ def update_fixed_data(df_domain_, df_innovation_, df_questions_):
# AEW: I feel like I am sinning against nature here . . .
# Do we need to store these in a Redis cache or db???
# This was all well and good before we ever tried to modify things
df = current_app.df
datasets = current_app.datasets

# Remove any entries from the domain, innovation, and question dataframes
# that are duplicated by the new data
book_id = df_domain_.iloc[0]["vuid"]
if "vuid" in df["domain"].columns:
df["domain"] = df["domain"][df["domain"]["vuid"] != book_id]
if "cvuid" in df["domain"].columns:
df["innovation"] = df["innovation"][
~(df["innovation"]["cvuid"].star.startswith(book_id))
if "vuid" in datasets["domain"].columns:
datasets["domain"] = datasets["domain"][datasets["domain"]["vuid"] != book_id]
if "cvuid" in datasets["domain"].columns:
datasets["innovation"] = datasets["innovation"][
~(datasets["innovation"]["cvuid"].star.startswith(book_id))
]
uids = df_questions_["uid"].unique()
if "uid" in df["questions"].columns:
df["questions"] = df["questions"][
if "uid" in datasets["questions"].columns:
datasets["questions"] = datasets["questions"][
~(
df["questions"]["uid"].isin(uids)
& df["questions"]["cvuid"].str.startswith(book_id)
datasets["questions"]["uid"].isin(uids)
& datasets["questions"]["cvuid"].str.startswith(book_id)
)
]

# Now append the new dataframes to the in-memory ones
df["domain"] = df["domain"].append(df_domain_, sort=False)
df["innovation"] = df["innovation"].append(df_innovation_, sort=False)
df["questions"] = df["questions"].append(df_questions_, sort=False)
datasets["domain"] = datasets["domain"].append(df_domain_, sort=False)
datasets["innovation"] = datasets["innovation"].append(df_innovation_, sort=False)
datasets["questions"] = datasets["questions"].append(df_questions_, sort=False)

# Update qid sets - for shortcutting question lookup
for idcol in ("uid", "qid"):
current_app.qids[idcol] = set(df["questions"][idcol].values.tolist())
current_app.qids[idcol] = set(datasets["questions"][idcol].values.tolist())

# Finally, write the updated dataframes to disk and declare victory
data_dir = current_app.config["DATA_DIR"]
write_fixed_data(df["domain"], df["innovation"], df["questions"], data_dir)
write_fixed_data(datasets["domain"], datasets["innovation"], datasets["questions"], data_dir)


def store_feature_weights(new_feature_weights):
# Allows removing duplicate sets in feature weights
# Sees if the incoming set matches with fw set

df = current_app.df
for fw_id, existing_feature_weights in df["feature_weights"].items():
datasets = current_app.datasets
for fw_id, existing_feature_weights in datasets["feature_weights"].items():

if existing_feature_weights == new_feature_weights:
result_id = fw_id
break
else:
result_id = uuid.uuid4()
df["feature_weights"][str(result_id)] = new_feature_weights
datasets["feature_weights"][str(result_id)] = new_feature_weights
data_dir = current_app.config["DATA_DIR"]
write_feature_weights(df["feature_weights"], data_dir)
write_feature_weights(datasets["feature_weights"], data_dir)

return result_id

Expand All @@ -89,15 +89,15 @@ def write_default_feature_weights_id(new_default_id):
# Allows removing duplicate sets in feature weights
# Sees if the incoming set matches with fw set

df = current_app.df
datasets = current_app.datasets

if new_default_id == df["feature_weights"]["default_id"]:
if new_default_id == datasets["feature_weights"]["default_id"]:
return new_default_id

else:
df["feature_weights"]["default_id"] = new_default_id
datasets["feature_weights"]["default_id"] = new_default_id
data_dir = current_app.config["DATA_DIR"]
write_feature_weights(df["feature_weights"], data_dir)
write_feature_weights(datasets["feature_weights"], data_dir)

return new_default_id

@@ -185,14 +185,14 @@ def new_feature_weights_set():
@bp.route("/datasets/feature_weights/default", methods=["PUT"])
@cross_origin(supports_credentials=True)
def set_default_feature_weights_id():
df = current_app.df
datasets = current_app.datasets
if not request.is_json:
raise InvalidUsage(
"Unable to load new default id as json file.", status_code=404
)
else:
new_default_id = request.json
if new_default_id not in df["feature_weights"].keys():
if new_default_id not in datasets["feature_weights"].keys():
raise InvalidUsage("Feature weight id not found.", status_code=400)
default_id = write_default_feature_weights_id(new_default_id)
return jsonify(
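For reference, a rough client-side sketch of promoting an existing feature-weight set to the default via the PUT route above; the base URL is a placeholder and the sketch assumes at least one set is already stored:

# Sketch only: base URL is a placeholder for wherever the validator service runs.
import requests

BASE = "http://localhost:5000"

# List stored feature-weight set ids, then promote the first one to default.
fw_ids = requests.get(f"{BASE}/datasets/feature_weights").json()
resp = requests.put(f"{BASE}/datasets/feature_weights/default", json=fw_ids[0])
print(resp.status_code, resp.json())
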
