Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions mapswipe_workers/mapswipe_workers/generate_stats/project_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from mapswipe_workers.generate_stats import (
project_stats_by_date,
tasking_manager_geometries,
user_stats,
)


Expand Down Expand Up @@ -321,6 +322,7 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
tasks_filename = f"{DATA_PATH}/api/tasks/tasks_{project_id}.csv"
groups_filename = f"{DATA_PATH}/api/groups/groups_{project_id}.csv"
agg_results_filename = f"{DATA_PATH}/api/agg_results/agg_results_{project_id}.csv"
agg_results_by_user_id_filename = f"{DATA_PATH}/api/users/users_{project_id}.csv"
project_stats_by_date_filename = f"{DATA_PATH}/api/history/history_{project_id}.csv"

# load data from postgres or local storage if already downloaded
Expand All @@ -339,6 +341,17 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
logger.info(f"saved agg results for {project_id}: {agg_results_filename}")
geojson_functions.csv_to_geojson(agg_results_filename, "geom")

# aggregate results by user id
agg_results_by_user_id_df = user_stats.get_agg_results_by_user_id(
results_df, agg_results_df
)
agg_results_by_user_id_df.to_csv(
agg_results_by_user_id_filename, index_label="idx"
)
logger.info(
f"saved agg results for {project_id}: {agg_results_by_user_id_filename}"
)

if any("maxar" in s for s in project_info["tile_server_names"]):
add_metadata_to_csv(agg_results_filename)
geojson_functions.add_metadata_to_geojson(agg_results_filename)
Expand Down
73 changes: 73 additions & 0 deletions mapswipe_workers/mapswipe_workers/generate_stats/user_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pandas as pd


def calc_agreement_counts(row, labels=(0, 1, 2, 3)):
    """Calc number of agreeing and disagreeing results from other users.

    Args:
        row: Mapping-like object (e.g. a pandas Series) that provides the
            label chosen by this user under ``"result"`` and, for every
            label, the number of answers under ``"<label>_count"``.
        labels: Labels for which a ``"<label>_count"`` entry is read.

    Returns:
        Tuple ``(agreeing_contributions, disagreeing_contributions)`` of
        plain ints.
    """
    this_user_label = row["result"]
    agreeing_contributions = 0
    disagreeing_contributions = 0

    for label in labels:
        # int() accepts numpy integers as well as whole-number floats, which
        # show up when a row-wise apply upcasts an all-numeric row.
        label_count = int(row[f"{label}_count"])
        if label == this_user_label:
            # Take one answer off the matching label so that only answers
            # of *other* users are counted; never go below zero.
            agreeing_contributions += max(label_count - 1, 0)
        else:
            disagreeing_contributions += max(label_count, 0)

    return agreeing_contributions, disagreeing_contributions


def calc_agreement_score(row):
    """Calc simple agreement score as share of agreeing contributions.

    Args:
        row: Mapping-like object (e.g. a pandas Series) with the entries
            ``"agreeing_contributions"`` and ``"disagreeing_contributions"``.

    Returns:
        ``agreeing / (agreeing + disagreeing)``, or NaN when both counts are
        zero and the share is therefore undefined.
    """
    agreeing = row["agreeing_contributions"]
    compared = agreeing + row["disagreeing_contributions"]

    if compared == 0:
        # No answers from other users to compare with: return NaN explicitly
        # instead of raising ZeroDivisionError (plain ints) or emitting a
        # RuntimeWarning (numpy ints).
        return float("nan")

    return agreeing / compared


def get_agg_results_by_user_id(
    results_df: pd.DataFrame, agg_results_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Aggregate contributions and agreement with other users per user.

    For each user we calculate the number of total contributions (tasks)
    and completed groups.
    Then we compute agreeing and disagreeing contributions from other users.
    This is the basis for a simple agreement score.
    The agreement score tells you how often the results from other users
    coincide with the results of this user. E.g. 0.8 means that 80% of the
    results from other users are the same as the results for that user.

    Both inputs are joined on "task_id". Returns a pandas dataframe with one
    row per ("project_id", "user_id") pair and the columns groups_completed,
    total_contributions, agreeing_contributions, disagreeing_contributions
    and simple_agreement_score. If the join yields no rows, an empty
    dataframe with these columns is returned.
    """
    raw_contributions_df = results_df.merge(agg_results_df, on="task_id")

    if raw_contributions_df.empty:
        # A row-wise apply on an empty frame does not produce the two
        # expected columns, so the assignment below would fail. Return the
        # expected (empty) layout instead.
        return pd.DataFrame(
            columns=[
                "project_id",
                "user_id",
                "groups_completed",
                "total_contributions",
                "agreeing_contributions",
                "disagreeing_contributions",
                "simple_agreement_score",
            ]
        )

    # compare to classifications of other users
    raw_contributions_df[
        ["agreeing_contributions", "disagreeing_contributions"]
    ] = raw_contributions_df.apply(
        calc_agreement_counts, axis=1, result_type="expand"
    )

    agg_results_by_user_id_df = raw_contributions_df.groupby(
        ["project_id", "user_id"]
    ).agg(
        groups_completed=pd.NamedAgg(column="group_id", aggfunc=pd.Series.nunique),
        total_contributions=pd.NamedAgg(column="user_id", aggfunc="count"),
        agreeing_contributions=pd.NamedAgg(
            column="agreeing_contributions", aggfunc="sum"
        ),
        disagreeing_contributions=pd.NamedAgg(
            column="disagreeing_contributions", aggfunc="sum"
        ),
    )

    agg_results_by_user_id_df[
        "simple_agreement_score"
    ] = agg_results_by_user_id_df.apply(calc_agreement_score, axis=1)

    # turn the group keys (project_id, user_id) back into regular columns
    agg_results_by_user_id_df.reset_index(inplace=True)

    return agg_results_by_user_id_df
2 changes: 2 additions & 0 deletions mapswipe_workers/mapswipe_workers/utils/create_directories.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ def create_directories() -> None:
DATA_PATH + "/api/results",
DATA_PATH + "/api/tasks",
DATA_PATH + "/api/yes_maybe",
DATA_PATH + "/api/users",
DATA_PATH + "/api/project_geometries",
)

for path in dirs:
Expand Down