In [1]:
from pymongo import MongoClient
import csv

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np


In [2]:
# Output path of the TSV file that the flattened biosample records are
# written to at the end of the notebook (relative to the working directory).
tsv_file_path = "macrae_crerar_attributes.tsv"

In [3]:
# Number of clusters requested from KMeans further down; tune as needed.
num_clusters = 3

In [4]:
# Create a client for the MongoDB server on localhost, default port 27017.
# The URI carries no credentials, so the server must allow unauthenticated access.
client = MongoClient("mongodb://localhost:27017/")

In [5]:
# Select the database and the collection that the query below runs against.
db = client["ncbi_metadata"]  # database name
collection = db["biosamples"]  # collection name

In [6]:
# Filter: keep documents in which a single element of the
# "Attributes.Attribute" array has BOTH the expected attribute name and
# the expected content ($elemMatch applies both conditions to one element).
principal_investigator_match = {
    "attribute_name": "emp500_principal_investigator",
    "content": "MacRae-Crerar",
}
query = {"Attributes.Attribute": {"$elemMatch": principal_investigator_match}}

In [7]:
# Execute the query and materialize the matches into a list.
# find() returns a one-shot cursor: once iterated it is exhausted, so
# re-running the cell that consumes `results` would silently produce an
# empty list. Holding a list keeps downstream cells idempotent.
results = list(collection.find(query))

In [8]:
# Flatten every matching document into one flat dict:
#   * 'accession' is inserted first, taken from the document's top level;
#   * each entry under Attributes -> Attribute then contributes one key
#     (its 'attribute_name') mapped to its 'content'.
# A later attribute with the same name overwrites the earlier value,
# including 'accession' itself.
documents_list = [
    {
        'accession': document.get('accession'),
        **{
            attribute.get('attribute_name'): attribute.get('content')
            for attribute in document.get('Attributes', {}).get('Attribute', [])
        },
    }
    for document in results
]

In [9]:
def dms_to_decimal(degrees):
    """Convert a dot-separated "degrees.minutes.seconds" string to decimal degrees.

    The input must consist of exactly three numeric fields separated by
    '.', e.g. "10.30.0" -> 10.5. A leading minus sign marks a southern or
    western coordinate and applies to the whole value, so "-10.30.0"
    yields -10.5 (minutes and seconds move the result further from zero).

    Args:
        degrees: Coordinate text in "D.M.S" form.

    Returns:
        The coordinate as a float in decimal degrees, or None when the
        input is not a string, does not have exactly three fields, or a
        field is not numeric.
    """
    # Missing attributes arrive as None; treat any non-string as unconvertible
    # instead of raising AttributeError on .split().
    if not isinstance(degrees, str):
        return None

    try:
        # Unpacking raises ValueError for the wrong number of fields, and
        # float() raises ValueError for non-numeric text.
        degree, minute, second = map(float, degrees.split('.'))
    except ValueError:
        return None

    magnitude = abs(degree) + (minute / 60) + (second / 3600)

    # Take the sign from the text rather than from `degree`, so that
    # "-0.30.0" is still recognised as negative.
    if degrees.lstrip().startswith('-'):
        return -magnitude
    return magnitude

In [10]:
# Give every record a '<axis>_decimal' entry derived from its raw
# 'latitude' and 'longitude' values (latitude first, then longitude).
for sample in documents_list:
    for axis in ('latitude', 'longitude'):
        sample[f'{axis}_decimal'] = dms_to_decimal(sample[axis])

In [11]:
# Build the feature matrix for clustering: one row per record, with the
# columns (latitude_decimal, longitude_decimal, soil_moisture_ave).
#
# dtype=float is forced so the matrix is always numeric: NumPy parses
# numeric strings and maps None to NaN. Without it, a text-valued
# soil_moisture_ave or a failed coordinate conversion would yield a
# string/object array instead of floats.
# NOTE(review): the stored type of 'soil_moisture_ave' is not visible here;
# it is copied verbatim from the attribute's 'content' — confirm.
coordinates = np.array(
    [
        [
            record['latitude_decimal'],
            record['longitude_decimal'],
            record['soil_moisture_ave'],
        ]
        for record in documents_list
    ],
    dtype=float,
)

In [12]:
# Standardize every feature column with StandardScaler: learn the
# per-column statistics first, then apply them to the same matrix.
scaler = StandardScaler().fit(coordinates)
normalized_coordinates = scaler.transform(coordinates)

In [13]:
# Perform KMeans clustering on the standardized features.
# KMeans relies on Euclidean distance, so clustering the raw matrix would let
# whichever column has the largest numeric range dominate the result; the
# scaled matrix computed above is the intended input.
# random_state is fixed so the labels are reproducible across
# Restart-and-Run-All executions.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(normalized_coordinates)

In [14]:
# Attach each record's cluster label; labels are in the same order as
# documents_list because the feature matrix was built from it row by row.
for sample, label in zip(documents_list, clusters):
    sample['cluster'] = label

In [15]:
# Collect the union of keys across all records; records may carry
# different attribute sets, and the TSV header needs every one of them.
unique_keys = set()
for entry in documents_list:
    unique_keys.update(entry)  # iterating a dict yields its keys
all_keys = list(unique_keys)

In [16]:
# Column order: 'accession' first, every other key after it in
# alphabetical order.
remaining_keys = [key for key in all_keys if key != 'accession']
remaining_keys.sort()
all_keys = ['accession', *remaining_keys]

In [17]:
# Write a header row followed by one tab-separated row per record.
# newline='' lets the csv module control line endings itself.
with open(tsv_file_path, mode="w", newline='', encoding="utf-8") as out_handle:
    tsv_writer = csv.DictWriter(out_handle, delimiter='\t', fieldnames=all_keys)
    tsv_writer.writeheader()
    for row in documents_list:
        tsv_writer.writerow(row)

print(f"TSV file saved to {tsv_file_path}")

TSV file saved to macrae_crerar_attributes.tsv


In [18]:
# Close the MongoDB client now that all data has been read and exported.
client.close()