-
Notifications
You must be signed in to change notification settings - Fork 84
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #80 from nasaharvest/compare-ceo-labels
Version 1 notebook for comparing CEO labels
- Loading branch information
Showing
1 changed file
with
156 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "b0a5e2f7", | ||
"metadata": {}, | ||
"source": [ | ||
"# Comparing CEO labels 🏷️\n", | ||
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nasaharvest/openmapflow/blob/main/openmapflow/notebooks/compare_ceo_labels.ipynb)\n", | ||
"\n", | ||
"**Description:** This notebook provides code to compare labels of the same data points from two Collect Earth Online (CEO) projects." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "aaa920bf", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "0b8dc8ba", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Load the CSV label files\n", | ||
"ceo_set1_path = # Path to your first csv label file, \n", | ||
" # e.g., ceo-Hawaii-Jan-Dec-2020-(Set-1)-sample-data-2022-08-16.csv\n", | ||
"ceo_set2_path = # Path to your second csv label file, \n", | ||
" # e.g., ceo-Hawaii-Jan-Dec-2020-(Set-2)-sample-data-2022-08-16.csv\n", | ||
"\n", | ||
"ceo_set1 = pd.read_csv(ceo_set1_path)\n", | ||
"ceo_set2 = pd.read_csv(ceo_set2_path)\n", | ||
"\n", | ||
"if ceo_set1.shape != ceo_set2.shape:\n", | ||
" print('''ERROR: The size of the two dataframes does not match. \n", | ||
" Most likely, there is a duplicate in the plotid column \n", | ||
" resulting from an error in CEO. You need to delete the \n", | ||
" duplicate manually before continuing.''')\n", | ||
" print(ceo_set1[ceo_set1.duplicated(subset=['plotid'])])\n", | ||
" print(ceo_set2[ceo_set2.duplicated(subset=['plotid'])])\n", | ||
"else:\n", | ||
" print('Loaded two dataframes with equal size: {}'.format(ceo_set1.shape))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2fc058b0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Sometimes there are slight variations in the labeling question used, \n", | ||
"# so we get this from the question column\n", | ||
"label_question = ceo_set1.columns[-1]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "8905f8e4", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ceo_agree = ceo_set1[ceo_set1[label_question] == ceo_set2[label_question]]\n", | ||
"\n", | ||
"print('Number of samples that are in agreement: %d out of %d (%.2f%%)' % \n", | ||
" (ceo_agree.shape[0], \n", | ||
" ceo_set1.shape[0], \n", | ||
" ceo_agree.shape[0]/ceo_set1.shape[0]*100))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "720c549b", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ceo_disagree_set1 = ceo_set1[ceo_set1[label_question] != ceo_set2[label_question]]\n", | ||
"ceo_disagree_set2 = ceo_set2[ceo_set1[label_question] != ceo_set2[label_question]]\n", | ||
"\n", | ||
"print('Number of samples that are NOT in agreement: %d out of %d (%.2f%%)' % \n", | ||
" (ceo_disagree_set1.shape[0], \n", | ||
" ceo_set1.shape[0], \n", | ||
" ceo_disagree_set1.shape[0]/ceo_set1.shape[0]*100))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "513542da", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pd.set_option('display.max_rows', None)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "1275ed44", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ceo_disagree_set1[['sampleid', 'email', 'flagged', 'collection_time', \n", | ||
" 'analysis_duration', 'imagery_title', label_question]]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "6523cce5", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ceo_disagree_set2[['sampleid', 'email', 'flagged', 'collection_time', \n", | ||
" 'analysis_duration', 'imagery_title', label_question]]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "97be64a4", | ||
"metadata": {}, | ||
"source": [ | ||
"The above tables show the points from each of the two sets for which labelers disagreed on the assigned label. Review these as a group and determine which label should be assigned by consensus. " | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.15" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |