In [1]:
from pymongo import MongoClient
import pandas as pd

In [2]:
# URI of the MongoDB server to query (localhost, port 27017)
mongo_url = "mongodb://localhost:27017/"

In [3]:
# Name of the MongoDB database to open on that server
database_name = "nmdc"

In [4]:
# Name of the collection to read from the database
collection_name = "flattened_biosample"

In [5]:
# Create the client for the MongoDB server at `mongo_url`
client = MongoClient(host=mongo_url)

In [6]:
# Get a handle on the database named by `database_name`
db = client.get_database(database_name)

In [7]:
# Get a handle on the collection named by `collection_name`
collection = db.get_collection(collection_name)

In [8]:
# Read every document of the collection into an in-memory list
documents = [document for document in collection.find()]

In [9]:
# Show the first retrieved document to inspect the available fields
documents[0]

{'_id': ObjectId('67e46c83379c802141a01463'),
 'id': 'nmdc:bsm-11-002vgm56',
 'abs_air_humidity_has_numeric_value': None,
 'abs_air_humidity_has_raw_value': None,
 'abs_air_humidity_has_unit': None,
 'add_date': None,
 'agrochem_addition': None,
 'alternative_identifiers': None,
 'ammonium_has_numeric_value': None,
 'ammonium_has_unit': None,
 'ammonium_nitrogen_has_numeric_value': None,
 'ammonium_nitrogen_has_raw_value': None,
 'ammonium_nitrogen_has_unit': None,
 'analysis_type': 'metagenomics',
 'ances_data_has_raw_value': None,
 'associated_studies': 'nmdc:sty-11-34xj1150',
 'avg_temp_has_numeric_value': None,
 'avg_temp_has_raw_value': None,
 'avg_temp_has_unit': None,
 'biosample_categories': 'NEON',
 'biotic_regm_has_raw_value': None,
 'calcium_has_numeric_value': None,
 'calcium_has_raw_value': None,
 'calcium_has_unit': None,
 'carb_nitro_ratio_has_numeric_value': 25.4,
 'carb_nitro_ratio_has_raw_value': None,
 'chloride_has_numeric_value': None,
 'chloride_has_unit': None,
 

In [10]:
# Build a DataFrame from the list of retrieved documents
df = pd.DataFrame(data=documents)

In [11]:
# Column labels of the frame as a plain Python list
all_col_names = df.columns.tolist()

In [12]:
# Keep only the columns whose name ends with the "_has_unit" suffix
unit_col_names = [
    col_name
    for col_name in all_col_names
    if col_name.endswith("_has_unit")
]

In [13]:
# The 'id' column first, followed by every unit column
desired_col_names = ['id', *unit_col_names]

In [14]:
# Restrict the frame to the desired columns (all rows are kept)
df = df.loc[:, desired_col_names]

In [15]:
# Display the frame, now restricted to 'id' plus the *_has_unit columns
df

Unnamed: 0,id,abs_air_humidity_has_unit,ammonium_has_unit,ammonium_nitrogen_has_unit,avg_temp_has_unit,calcium_has_unit,chloride_has_unit,chlorophyll_has_unit,conduc_has_unit,depth_has_unit,...,subsurface_depth_has_unit,sulfate_has_unit,temp_has_unit,tot_carb_has_unit,tot_nitro_content_has_unit,tot_nitro_has_unit,tot_org_carb_has_unit,tot_phosp_has_unit,wind_speed_has_unit,zinc_has_unit
0,nmdc:bsm-11-002vgm56,,,,,,,,,m,...,,,Celsius,,,,,,,
1,nmdc:bsm-11-006pnx90,kPa,,,Cel,,,,,,...,,,,%,%,,,,m/s,
2,nmdc:bsm-11-00dkyf35,,,mg/L,,,,,,m,...,,,Celsius,,mg/L,,,,,
3,nmdc:bsm-11-00f5nh68,,,,,,,,,,...,,,,,,,,,,
4,nmdc:bsm-11-00hrxp98,,,,,,,,,m,...,,,Celsius,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13001,nmdc:bsm-13-zms1jq91,,,,,,,,,m,...,,,,,,,,,,
13002,nmdc:bsm-13-zq681s85,,,,,,,,,,...,,,,,,,,,,
13003,nmdc:bsm-13-zr9dcy94,,,,,,,,,m,...,,,,,,,,,,
13004,nmdc:bsm-13-zvz59s14,,,,,,,,,m,...,,,,,,,,,,


In [18]:
# Reshape to long format: 'id' stays as the identifier, each unit column
# becomes a (source_column, value) pair on its own row
melted_df = df.melt(
    id_vars=["id"],
    var_name="source_column",
    value_name="value",
)


In [19]:
# Keep only the rows whose 'value' is not missing
melted_df = melted_df[melted_df["value"].notna()]

In [20]:
# Display the long-format frame after rows with a missing 'value' were dropped
melted_df

Unnamed: 0,id,source_column,value
1,nmdc:bsm-11-006pnx90,abs_air_humidity_has_unit,kPa
39,nmdc:bsm-11-0435yq18,abs_air_humidity_has_unit,kPa
70,nmdc:bsm-11-06mvg980,abs_air_humidity_has_unit,kPa
92,nmdc:bsm-11-07nqms41,abs_air_humidity_has_unit,kPa
94,nmdc:bsm-11-07qp3e36,abs_air_humidity_has_unit,kPa
...,...,...,...
570790,nmdc:bsm-11-ynkz2756,zinc_has_unit,mg/kg
570820,nmdc:bsm-11-yqjjes90,zinc_has_unit,mg/kg
570822,nmdc:bsm-11-yqrv3r15,zinc_has_unit,mg/kg
570956,nmdc:bsm-11-z423gx54,zinc_has_unit,mg/kg


In [21]:
# For each source column, tally how often every distinct 'value' occurs
value_counts = (
    melted_df
    .groupby(by='source_column')['value']
    .value_counts()
)


In [22]:
# Flatten the grouped counts into a DataFrame with the columns
# source_column, value and counts
value_counts_df = value_counts.rename('counts').reset_index()

In [23]:
# Display the per-column unit counts
value_counts_df

Unnamed: 0,source_column,value,counts
0,abs_air_humidity_has_unit,kPa,192
1,ammonium_has_unit,mg/L,45
2,ammonium_nitrogen_has_unit,mg/L,1239
3,ammonium_nitrogen_has_unit,mg/kg,103
4,avg_temp_has_unit,Cel,192
5,calcium_has_unit,mg/kg,103
6,calcium_has_unit,mg/L,6
7,chloride_has_unit,mg/L,6
8,chlorophyll_has_unit,μg/L,47
9,conduc_has_unit,uS/cm,180


In [25]:
# Write the unit counts as a tab-separated file, without the row index
value_counts_df.to_csv(
    path_or_buf="nmdc_mongo_biosample_unit_counts.tsv",
    sep="\t",
    index=False,
)