In [1]:
import polars as pl  # for data manipulation
import numpy as np  # specialy for NaN
import sys  # for command line arguments
import getopt  # for checking command line arguments
import datetime  # for naming the output file

import zenodo_downloader as zd
from InquirerPy.validator import PathValidator

def read_LOTUS_dataset(file_to_sample):
    df = pl.read_csv(
        file_to_sample,
        dtypes={
            "structure_xlogp": pl.Float32,
            "structure_cid": pl.UInt32,
            "organism_taxonomy_ncbiid": pl.UInt32,
            "organism_taxonomy_ottid": pl.UInt32,
            "structure_stereocenters_total": pl.UInt32,
            "structure_stereocenters_unspecified": pl.UInt32,
        },
        separator=",",
        infer_schema_length=50000,
        null_values=["", "NA"],
    )

    # print("Before type: ", df["organism_taxonomy_gbifid"].dtype)

    if not df["organism_taxonomy_gbifid"].dtype.is_numeric():
        df = df.with_columns(
            pl.col("organism_taxonomy_gbifid")
            .map_elements(lambda x: np.nan if x.startswith("c(") else x, return_dtype=pl.Float64)
            .cast(pl.Int32, strict=False)  # Cast with strict=False to allow NaN to be retained
            .alias("organism_taxonomy_gbifid")
        )
    else:
        df = df.with_columns(
            pl.col("organism_taxonomy_gbifid")
            .cast(pl.Int32, strict=False)  # Cast with strict=False to allow NaN to be retained
            .alias("organism_taxonomy_gbifid")
        )

    # print("After type: ", df["organism_taxonomy_gbifid"].dtype)
    return df

pl.Config(fmt_str_lengths=550)


<polars.config.Config at 0x7effc5d99cc0>

In [2]:
df = read_LOTUS_dataset("../data/230106_frozen_metadata.csv.gz")

df = df.select(
                [
                "structure_wikidata",
                "structure_smiles",
                ]
            ).rename(
                {
                "structure_wikidata": "id",
                "structure_smiles": "smiles"
                }
            ).unique()

In [3]:
# Group by the 'id' column and create a new DataFrame with numbered IDs only for duplicates
grouped_df = df.group_by('id').agg(pl.col('smiles'))

# Function to add an index number if there are multiple SMILES for the same ID
def apply_index(row):
    smiles_list = row['smiles']
    id = row['id']
    if len(smiles_list) > 1:
        return [(id + f"_id{i+1}", smiles) for i, smiles in enumerate(smiles_list)]
    else:
        return [(id, smiles_list[0])]

# Apply the function to each group and flatten the list of lists into a single list of tuples
indexed_smiles = [entry for row in grouped_df.to_dicts() for entry in apply_index(row)]

# Convert the list of tuples back to a Polars DataFrame
indexed_df = pl.DataFrame(indexed_smiles, schema=["id", "smiles"])

indexed_df

id,smiles
str,str
"""http://www.wikidata.org/entity/Q104913721""","""COC1=CC[C@]23CCN(C)[C@@]24C[C@H](O[C@@]14O)c1ccc(OC)c(OC)c13"""
"""http://www.wikidata.org/entity/Q77380861""","""COC(=O)[C@H](C)CNC(=O)[C@H](Cc1ccc(OC)c(Cl)c1)NC(=O)/C=C/C[C@H](OC(=O)[C@@H](O)CC(C)C)[C@H](C)[C@H]1O[C@@H]1c1ccccc1"""
"""http://www.wikidata.org/entity/Q105251498""","""CC(/C=C/[C@]1(C)[C@H](C)CCC(=O)[C@H]1C)=C\Cc1c(O)c(Cl)c(C)c(C=O)c1O"""
"""http://www.wikidata.org/entity/Q105361919""","""Cc1ccc2c(c1)O[C@H](C[C@H](C)O)C[C@@H]2C"""
"""http://www.wikidata.org/entity/Q104995341""","""CC(=O)OC/C=C/C#CC#C/C=C/C=C/[C@@H](CCOC(C)=O)OC(C)=O"""
…,…
"""http://www.wikidata.org/entity/Q105273276""","""COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O[C@@H]2O[C@@H](CO[C@@H]3O[C@@H](C)[C@H](O)[C@@H](O)[C@H]3O)[C@@H](O)[C@H](O)[C@H]2O)ccc1O"""
"""http://www.wikidata.org/entity/Q105028059""","""COc1cc([C@H]2OC[C@H]3[C@H]2CO[C@H]3c2ccc(O)c(OC)c2)ccc1O"""
"""http://www.wikidata.org/entity/Q105143876""","""CC1(C)CC(=O)C=C(CO)[C@H]1CO[C@@H]1O[C@H](CO)[C@@H](O)[C@H](O)[C@H]1O"""
"""http://www.wikidata.org/entity/Q110077718""","""[C-]#[N+]C12C3OC3C(C)(C)c3[nH]c4cccc5c4c3C1(O)C(CC(Cl)C2(C)C=C)C5(C)C"""


In [4]:
indexed_df

# Filter rows where the 'id' column contains the substring 'saying'
filtered_df = indexed_df.filter(pl.col('id').str.contains('_id'))

# Print the filtered DataFrame
print(filtered_df)

shape: (102, 2)
┌──────────────────────────────────────────────┬───────────────────────────────────────────────────┐
│ id                                           ┆ smiles                                            │
│ ---                                          ┆ ---                                               │
│ str                                          ┆ str                                               │
╞══════════════════════════════════════════════╪═══════════════════════════════════════════════════╡
│ http://www.wikidata.org/entity/Q5172181_id1  ┆ CC[C@H]1C[C@@H]2CN3CCc4c([nH]c5ccccc45)[C@](C(=O) │
│                                              ┆ OC)(C2)[C@H]13                                    │
│ http://www.wikidata.org/entity/Q5172181_id2  ┆ CC[C@H]1C[C@H]2CN3CCc4c([nH]c5ccccc45)[C@](C(=O)O │
│                                              ┆ C)(C2)[C@H]13                                     │
│ http://www.wikidata.org/entity/Q988591_id1   ┆ CC/C=C\[C@@H]1C=CCC=CC1   

In [5]:
id_shape = indexed_df.unique(subset=["id"]).shape

smiles_shape = indexed_df.unique(subset=["smiles"]).shape

print(f'id: {id_shape}\nsmiles: {smiles_shape}')

id: (220834, 2)
smiles: (220820, 2)
