In [13]:
import json
import pandas as pd
import re

discovered via

```sqlSELECT
  attr.k AS attribute_key,
  COUNT(*) AS count
FROM
  `nih-sra-datastore.sra.metadata`,
  UNNEST(attributes) AS attr
GROUP BY
  attr.k
ORDER BY
  count DESC;
```

and

```js
db.getCollection("your_collection").aggregate(
  [
    { "$match": { "Attributes.Attribute.harmonized_name": "env_broad_scale" } },
    { "$unwind": "$Attributes.Attribute" },
    { "$match": { "Attributes.Attribute.harmonized_name": "env_broad_scale" } },
    { "$group": { "_id": "$Attributes.Attribute.attribute_name", "count": { "$sum": 1 } } },
    { "$project": { "_id": 0, "attribute_name": "$_id", "count": 1 } }
  ],
  { allowDiskUse: true }
);
```

In [14]:
json_files = [
    'env_broad_scale_mapees.json',
    'env_local_scale_mapees.json',
    'env_medium_mapees.json',
]

In [15]:
def normalize_name_for_sra_metadata_attribute_key(name):
    """Convert to lowercase and replace all punctuation and whitespace with underscores while preserving character count."""
    normalized = re.sub(r'[\s\W]', '_', name.lower())  # Replace each non-word character with a single underscore
    return f"{normalized}_sam"

In [19]:
def list_to_sql_predicate(string_list):
    # Join the strings with commas, wrap each in quotes, and surround with parentheses
    predicate = "(" + ", ".join(f"'{item}'" for item in string_list) + ")"
    return predicate


In [20]:
def json_to_tsv(json_file, tsv_file):
    # Load JSON data from file
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Normalize names
    for item in data:
        item["normalized_name"] = normalize_name_for_sra_metadata_attribute_key(item["attribute_name"])

    # Convert JSON to DataFrame
    df = pd.DataFrame(data)

    count_sum = df['count'].sum()
    one_pct = count_sum * 0.01
    one_pct_frame = df[df['count'] > one_pct]
    one_pct_strings = list(one_pct_frame['normalized_name'])
    print(list_to_sql_predicate(one_pct_strings))

    # Save as TSV
    df.to_csv(tsv_file, sep="\t", index=False)

In [21]:
for json_file in json_files:
    tsv_file = json_file.replace(".json", ".tsv")
    print(f"{json_file} -> {tsv_file}")
    json_to_tsv(json_file, tsv_file)

env_broad_scale_mapees.json -> env_broad_scale_mapees.tsv
('env_broad_scale_sam', 'broad_scale_environmental_context_sam', 'env_biome_sam', 'env_biome_sam', 'biome_sam', 'environment__biome__sam')
env_local_scale_mapees.json -> env_local_scale_mapees.tsv
('feature_sam', 'env_feature_sam', 'environment__feature__sam', 'env_local_scale_sam', 'env_feature_sam')
env_medium_mapees.json -> env_medium_mapees.tsv
('env_material_sam', 'environment__material__sam', 'env_medium_sam', 'environmental_medium_sam', 'material_sam')


for building this kind of query

```sql
WITH filtered_data AS (
  SELECT
    bioproject
  FROM
    `nih-sra-datastore.sra.metadata`
  WHERE
    avgspotlen >= 150
    AND mbases >= 10
    AND platform = 'ILLUMINA'
  GROUP BY
    bioproject
  HAVING
    COUNT(*) >= 50
)
SELECT *
FROM `nih-sra-datastore.sra.metadata` m
WHERE
  m.bioproject IN (SELECT bioproject FROM filtered_data)
  -- Must contain at least one key from the broad/biome category
  AND EXISTS (
    SELECT 1
    FROM UNNEST(m.attributes) AS attr
    WHERE attr.k IN (
      'env_broad_scale_sam',
      'broad_scale_environmental_context_sam',
      'env_biome_sam',
      'biome_sam',
      'environment__biome__sam'
    )
  )
  -- Must contain at least one key from the feature/local scale category
  AND EXISTS (
    SELECT 1
    FROM UNNEST(m.attributes) AS attr
    WHERE attr.k IN (
      'feature_sam',
      'env_feature_sam',
      'environment__feature__sam',
      'env_local_scale_sam'
    )
  )
  -- Must contain at least one key from the material/medium category
  AND EXISTS (
    SELECT 1
    FROM UNNEST(m.attributes) AS attr
    WHERE attr.k IN (
      'env_material_sam',
      'environment__material__sam',
      'env_medium_sam',
      'environmental_medium_sam',
      'material_sam'
    )
  );

```

that gets 2 724 015 rows out of ~ 35 000 000 in 1311 seconds