opensupplyhub · TaiWilkin · May 6, 2022 · Apr 29, 2022 · May 2, 2022 · jwalgran
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 
 - Add new taxonomy processing types [#1827](https://github.com/open-apparel-registry/open-apparel-registry/pull/1827)
+- Add extended field reports [#1826](https://github.com/open-apparel-registry/open-apparel-registry/pull/1826)
 
 ### Changed
 

diff --git a/src/django/api/migrations/0090_remove_duplicate_field_name_choices.py b/src/django/api/migrations/0090_remove_duplicate_field_name_choices.py
@@ -0,0 +1,23 @@
+# Generated by Django 2.2.24 on 2022-05-05 11:52
+
+from django.db import migrations, models
+
+
+def _field_name_charfield():
+    """Return a new CharField carrying the allowed ``field_name`` choices.
+
+    A separate instance is built on every call so that each AlterField
+    operation receives its own field object.
+    """
+    allowed_names = (
+        'name',
+        'address',
+        'number_of_workers',
+        'native_language_name',
+        'facility_type',
+        'processing_type',
+        'product_type',
+        'parent_company',
+    )
+    return models.CharField(
+        choices=[(name, name) for name in allowed_names],
+        help_text='The name of the field, chosen from a strict list.',
+        max_length=200,
+    )
+
+
+class Migration(migrations.Migration):
+    """Re-declare the ``field_name`` choices on both extended field models."""
+
+    dependencies = [
+        ('api', '0089_reprocess_parent_company'),
+    ]
+
+    # The live model and its historical counterpart get the same definition.
+    operations = [
+        migrations.AlterField(
+            model_name=model_name,
+            name='field_name',
+            field=_field_name_charfield(),
+        )
+        for model_name in ('extendedfield', 'historicalextendedfield')
+    ]
diff --git a/src/django/api/models.py b/src/django/api/models.py
@@ -2372,7 +2372,6 @@ class ExtendedField(models.Model):
     Fields will be related to either a claim or list item; they must reference
     one, but not both.
     """
-    COUNTRY = 'country'
     NAME = 'name'
     ADDRESS = 'address'
     NUMBER_OF_WORKERS = 'number_of_workers'
@@ -2385,17 +2384,14 @@ class ExtendedField(models.Model):
     PROCESSING_TYPE = 'processing_type'
 
     FIELD_CHOICES = (
-        (COUNTRY, COUNTRY),
         (NAME, NAME),
         (ADDRESS, ADDRESS),
         (NUMBER_OF_WORKERS, NUMBER_OF_WORKERS),
         (NATIVE_LANGUAGE_NAME, NATIVE_LANGUAGE_NAME),
         (FACILITY_TYPE, FACILITY_TYPE),
         (PROCESSING_TYPE, PROCESSING_TYPE),
         (PRODUCT_TYPE, PRODUCT_TYPE),
-        (PARENT_COMPANY, PARENT_COMPANY),
-        (FACILITY_TYPE, FACILITY_TYPE),
-        (PROCESSING_TYPE, PROCESSING_TYPE))
+        (PARENT_COMPANY, PARENT_COMPANY))
 
     contributor = models.ForeignKey(
         'Contributor',

diff --git a/src/django/api/reports.py b/src/django/api/reports.py
@@ -1,22 +1,32 @@
+import json
 import os
 import pytz
+from collections import defaultdict
 
 from datetime import datetime
 from dateutil.relativedelta import relativedelta
+from django.conf import settings
 from django.utils import timezone
 from glob import glob
 from urllib.parse import quote
 
 from django.db import connection
-from django.db.models import Func, F
-from api.models import HistoricalFacility, FacilityListItem
+from django.db.models import Func, F, Q
+from api.models import HistoricalFacility, FacilityListItem, ExtendedField
 from api.constants import ProcessingAction, DateFormats
 
 _root = os.path.abspath(os.path.dirname(__file__))
 
 
-def sort_by_first_column(array):
-    return sorted(array, key=lambda x: x[0])
+def _report_error_to_rollbar(message, extra_data=None):
+    """Send ``message`` to Rollbar at error level when Rollbar is configured.
+
+    Nothing is reported when the Django settings have no ``ROLLBAR`` entry
+    or the entry is falsy.
+    """
+    if not getattr(settings, 'ROLLBAR', {}):
+        return
+
+    # Imported here so rollbar is only imported when a ROLLBAR setting exists.
+    import rollbar
+    rollbar.report_message(message, level='error', extra_data=extra_data)
+
+
+def sort_by_first_column(array, reverse=False):
+    """Return a new list of the rows in ``array`` sorted by first element.
+
+    Pass ``reverse=True`` for descending order.
+    """
+    def first_column(row):
+        return row[0]
+
+    ordered = list(array)
+    ordered.sort(key=first_column, reverse=reverse)
+    return ordered
 
 
 def try_parsing_date(text):
@@ -120,6 +130,153 @@ def weekly_geocoding_time_with_queue():
     return [['week', 'average_geocoding_time_in_seconds'], rows]
 
 
+def submitted_product_type_values():
+    """Count how many times each raw product type value was submitted.
+
+    Returns a two-item list: the column headers followed by
+    ``(product_type, count)`` rows sorted by product type. Values are
+    stripped of surrounding whitespace before being counted.
+    """
+    data = defaultdict(int)
+
+    product_type_lists = ExtendedField.objects.filter(
+        field_name='product_type'
+    ).values_list('value__raw_values', flat=True)
+    for product_types in product_type_lists:
+        # NOTE(review): presumably a stored value without a raw_values key
+        # comes back as None here; skip it instead of failing the report.
+        if not product_types:
+            continue
+        for p in product_types:
+            data[p.strip()] += 1
+    rows = sort_by_first_column(data.items())
+    return [['product_type_values', 'times_submitted'], rows]
+
+
+def value_is_valid(value):
+    """Tell whether a raw value should be kept for reporting.
+
+    ``None`` and blank strings are rejected, as are the values "other",
+    "denim services" and "boarding" compared case-insensitively.
+    """
+    if value is None or value.strip() == "":
+        return False
+
+    excluded = ("other", "denim services", "boarding")
+    return value.lower() not in excluded
+
+
+def process_raw_values(raw_values):
+    """Normalize submitted raw values into a de-duplicated, filtered list.
+
+    A string is split on ``|``; any other iterable is used as-is.
+    Duplicates are dropped keeping first-seen order, then values rejected
+    by ``value_is_valid`` are removed. ``None`` yields an empty list.
+    """
+    if raw_values is None:
+        return []
+
+    # str.split already returns a one-item list when there is no delimiter.
+    values = (raw_values.split('|') if isinstance(raw_values, str)
+              else raw_values)
+
+    # dict.fromkeys de-duplicates while preserving insertion order.
+    deduped_values = dict.fromkeys(values)
+
+    return [value for value in deduped_values if value_is_valid(value)]
+
+
+def processing_type_facility_type_matched():
+    """Report which raw values were matched to each taxonomy value.
+
+    Only distinct combinations of raw and matched values are examined.
+    Rows whose filtered raw value count differs from the matched value
+    count are reported to Rollbar and left out of the report.
+
+    Returns a two-item list: the column headers followed by
+    ``(taxonomy_value, raw_values)`` rows sorted by taxonomy value, where
+    ``raw_values`` is a ``' | '``-joined string of the stripped raw values.
+    """
+    raw_values_by_taxonomy = defaultdict(set)
+
+    for (raw_values, matched_values) in ExtendedField.objects.filter(
+        Q(field_name='processing_type') | Q(field_name='facility_type')
+    ).distinct(
+        'value__raw_values', 'value__matched_values'
+    ).values_list('value__raw_values', 'value__matched_values').iterator():
+        values = process_raw_values(raw_values)
+
+        if len(values) == len(matched_values):
+            for raw_value, match in zip(values, matched_values):
+                # NOTE(review): indexes 2 and 3 look like the two taxonomy
+                # levels of a match; confirm against the matching code.
+                if match[3] is not None:
+                    stripped = raw_value.strip()
+                    raw_values_by_taxonomy[match[3]].add(stripped)
+                    raw_values_by_taxonomy[match[2]].add(stripped)
+        else:
+            _report_error_to_rollbar((
+                    'processing_type_facility_type_matched encountered '
+                    'mismatched processing type value count'
+                ), extra_data={
+                    'deduped_values': json.dumps(values),
+                    'matched_values': json.dumps(matched_values),
+                })
+
+    # Sort each set before joining: set iteration order is not stable
+    # between runs, which would make the report output nondeterministic.
+    data = {
+        taxonomy_value: ' | '.join(sorted(raws))
+        for taxonomy_value, raws in raw_values_by_taxonomy.items()
+    }
+
+    rows = sort_by_first_column(data.items())
+    return [['taxonomy_value', 'raw_values'], rows]
+
+
+def processing_type_facility_type_unmatched():
+    """Count raw processing/facility type values that had no taxonomy match.
+
+    Only distinct combinations of raw and matched values are examined, so
+    each count is the number of distinct combinations containing the value.
+    Rows whose filtered raw value count differs from the matched value
+    count are reported to Rollbar and left out of the report.
+
+    Returns a two-item list: the column headers followed by
+    ``(raw_value, count)`` rows sorted by raw value.
+    """
+    data = defaultdict(int)
+
+    for (raw_values, matched_values) in ExtendedField.objects.filter(
+        Q(field_name='processing_type') | Q(field_name='facility_type')
+    ).distinct(
+        'value__raw_values', 'value__matched_values'
+    ).values_list('value__raw_values', 'value__matched_values').iterator():
+        values = process_raw_values(raw_values)
+
+        if len(values) == len(matched_values):
+            for raw_value, match in zip(values, matched_values):
+                if match[3] is None:
+                    data[raw_value.strip()] += 1
+        else:
+            # Belongs to the length check, not the for loop: a for/else
+            # would fire after every completed loop and never on a mismatch.
+            _report_error_to_rollbar((
+                    'processing_type_facility_type_unmatched encountered '
+                    'mismatched processing type value count'
+                ), extra_data={
+                    'deduped_values': json.dumps(values),
+                    'matched_values': json.dumps(matched_values),
+                })
+
+    rows = sort_by_first_column(data.items())
+    return [['processing_type_facility_type', 'times_submitted'], rows]
+
+
+def get_extended_field_columns():
+    """Build the header row for the extended field reports.
+
+    The first column is ``month``; one column follows per entry in
+    ``ExtendedField.FIELD_CHOICES``, with ``name`` and ``address`` given a
+    `` Claimed`` suffix.
+    """
+    claimed_fields = ('name', 'address')
+    headers = [
+        '{} Claimed'.format(choice) if choice in claimed_fields else choice
+        for choice, _ in ExtendedField.FIELD_CHOICES
+    ]
+    return ['month'] + headers
+
+
+def contributors_with_extended_fields_cumulative():
+    """Count, month by month, contributors having each extended field so far.
+
+    For every month in which an extended field was created, counts the
+    distinct contributors with at least one field of each type created in
+    or before that month.
+
+    Returns a two-item list: the column headers followed by one row per
+    month (month label first), sorted by month label in descending order.
+    """
+    data = {}
+    months = ExtendedField.objects.dates('created_at', 'month')
+    for month in months:
+        m = month.strftime(DateFormats.MONTH)
+        row = [m]
+        # Cumulative cutoff: all of the earlier years, plus this year up to
+        # and including this month. Comparing year and month independently
+        # would drop e.g. December of a previous year from a March total.
+        fields = ExtendedField.objects.filter(
+            Q(created_at__year__lt=month.year)
+            | Q(created_at__year=month.year,
+                created_at__month__lte=month.month))
+
+        for field_name, _ in ExtendedField.FIELD_CHOICES:
+            field_count = fields.filter(field_name=field_name).distinct(
+                'contributor_id').count()
+            row.append(field_count)
+        data[m] = row
+
+    columns = get_extended_field_columns()
+    rows = sort_by_first_column(data.values(), reverse=True)
+    return [columns, rows]
+
+
+def facilities_with_extended_fields_cumulative():
+    """Count, month by month, facilities having each extended field so far.
+
+    For every month in which an extended field was created, counts the
+    distinct facilities with at least one field of each type created in or
+    before that month.
+
+    Returns a two-item list: the column headers followed by one row per
+    month (month label first), sorted by month label in descending order.
+    """
+    data = {}
+    months = ExtendedField.objects.dates('created_at', 'month')
+    for month in months:
+        m = month.strftime(DateFormats.MONTH)
+        row = [m]
+        # Cumulative cutoff: all of the earlier years, plus this year up to
+        # and including this month. Comparing year and month independently
+        # would drop e.g. December of a previous year from a March total.
+        fields = ExtendedField.objects.filter(
+            Q(created_at__year__lt=month.year)
+            | Q(created_at__year=month.year,
+                created_at__month__lte=month.month))
+
+        for field_name, _ in ExtendedField.FIELD_CHOICES:
+            field_count = fields.filter(field_name=field_name).distinct(
+                'facility_id').count()
+            row.append(field_count)
+        data[m] = row
+
+    columns = get_extended_field_columns()
+    rows = sort_by_first_column(data.values(), reverse=True)
+    return [columns, rows]
+
+
 NON_SQL_REPORTS = {
     'monthly_promoted_name_and_address': monthly_promoted_name_and_address,
     'recent_monthly_geocoding_time_without_queue':
@@ -128,7 +285,17 @@ def weekly_geocoding_time_with_queue():
     weekly_geocoding_time_without_queue,
     'recent_monthly_geocoding_time_with_queue':
     monthly_geocoding_time_with_queue,
-    'recent_weekly_geocoding_time_with_queue': weekly_geocoding_time_with_queue
+    'recent_weekly_geocoding_time_with_queue':
+    weekly_geocoding_time_with_queue,
+    'submitted_product_type_values': submitted_product_type_values,
+    'processing_type_facility_type_matched_values':
+    processing_type_facility_type_matched,
+    'processing_type_facility_type_unmatched_values':
+    processing_type_facility_type_unmatched,
+    'contributors_with_extended_fields_cumulative':
+    contributors_with_extended_fields_cumulative,
+    'facilities_with_extended_fields_cumulative':
+    facilities_with_extended_fields_cumulative
 }