Merge pull request #80 from nasaharvest/compare-ceo-labels

Version 1 notebook for comparing CEO labels
nasaharvest · Aug 20, 2022 · 324aba9 · 324aba9
2 parents 30f7642 + ec08cc9
commit 324aba9
Showing 1 changed file with 156 additions and 0 deletions.
diff --git a/openmapflow/notebooks/compare_ceo_labels.ipynb b/openmapflow/notebooks/compare_ceo_labels.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b0a5e2f7",
+   "metadata": {},
+   "source": [
+    "# Comparing CEO labels 🏷️\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nasaharvest/openmapflow/blob/main/openmapflow/notebooks/compare_ceo_labels.ipynb)\n",
+    "\n",
+    "**Description:** This notebook provides code to compare labels of the same data points from two Collect Earth Online (CEO) projects."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aaa920bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b8dc8ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the CSV label files\n",
+    "ceo_set1_path = # Path to your first csv label file, \n",
+    "                # e.g., ceo-Hawaii-Jan-Dec-2020-(Set-1)-sample-data-2022-08-16.csv\n",
+    "ceo_set2_path = # Path to your second csv label file, \n",
+    "                # e.g., ceo-Hawaii-Jan-Dec-2020-(Set-2)-sample-data-2022-08-16.csv\n",
+    "\n",
+    "ceo_set1 = pd.read_csv(ceo_set1_path)\n",
+    "ceo_set2 = pd.read_csv(ceo_set2_path)\n",
+    "\n",
+    "if ceo_set1.shape != ceo_set2.shape:\n",
+    "    print('''ERROR: The size of the two dataframes does not match. \n",
+    "          Most likely, there is a duplicate in the plotid column \n",
+    "          resulting from an error in CEO. You need to delete the \n",
+    "          duplicate manually before continuing.''')\n",
+    "    print(ceo_set1[ceo_set1.duplicated(subset=['plotid'])])\n",
+    "    print(ceo_set2[ceo_set2.duplicated(subset=['plotid'])])\n",
+    "else:\n",
+    "    print('Loaded two dataframes with equal size: {}'.format(ceo_set1.shape))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fc058b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sometimes there are slight variations in the labeling question used, \n",
+    "# so we get this from the question column\n",
+    "label_question = ceo_set1.columns[-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8905f8e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ceo_agree = ceo_set1[ceo_set1[label_question] == ceo_set2[label_question]]\n",
+    "\n",
+    "print('Number of samples that are in agreement: %d out of %d (%.2f%%)' % \n",
+    "          (ceo_agree.shape[0], \n",
+    "           ceo_set1.shape[0], \n",
+    "           ceo_agree.shape[0]/ceo_set1.shape[0]*100))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "720c549b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ceo_disagree_set1 = ceo_set1[ceo_set1[label_question] != ceo_set2[label_question]]\n",
+    "ceo_disagree_set2 = ceo_set2[ceo_set1[label_question] != ceo_set2[label_question]]\n",
+    "\n",
+    "print('Number of samples that are NOT in agreement: %d out of %d (%.2f%%)' % \n",
+    "          (ceo_disagree_set1.shape[0], \n",
+    "           ceo_set1.shape[0], \n",
+    "           ceo_disagree_set1.shape[0]/ceo_set1.shape[0]*100))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "513542da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.set_option('display.max_rows', None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1275ed44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ceo_disagree_set1[['sampleid', 'email', 'flagged', 'collection_time', \n",
+    "                   'analysis_duration', 'imagery_title', label_question]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6523cce5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ceo_disagree_set2[['sampleid', 'email', 'flagged', 'collection_time', \n",
+    "                   'analysis_duration', 'imagery_title', label_question]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "97be64a4",
+   "metadata": {},
+   "source": [
+    "The above tables show the points from each of the two sets for which labelers disagreed on the assigned label. Review these as a group and determine which label should be assigned by consensus. "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}