In [1]:
import os
from getpass import getpass

import numpy as np
import pandas as pd

from compoundDB import inputtools as it
from UpdateDB_ref import Checkpoint as cp
from UpdateDB_ref import CR
from UpdateDB_ref import Update_CII
from UpdateDB_ref import Update_CII as updater


*** CompoundDB module found. Will check the synonyms table to resolve CAS. ***



#### Load list into pandas

In [2]:
# Load the raw substance list; read_csv already uses a comma as its default delimiter.
example_df = pd.read_csv('example.csv')

#### Processing example dataframe
First we get the CAS numbers and then we generate a list with unique CAS. Finally we'll get the annotations for each CAS from the CR database.
It is important to remember that each input should be curated manually so that it ends up in the same format as the final example_annotations dataframe.

In [3]:
def _list_repr_to_csv(list_repr):
    """Turn the text form of a list (e.g. "['a', 'b']") into "a, b".

    Also removes the trailing note ' and several other' that appears in some entries.
    """
    without_brackets = list_repr.strip('[').strip(']')
    without_quotes = without_brackets.replace("'", "")
    return without_quotes.strip().replace(' and several other', '')


# Entries separated by '/' are split into lists, rendered as text and then
# flattened into a comma-separated string, so the column always holds strings.
example_df.loc[:, 'N.o CAS'] = (
    example_df['N.o CAS']
    .str.split('/')
    .astype(str)
    .apply(_list_repr_to_csv)
)

In [4]:
# Independent (deep) copy: rows are dropped from it later without touching example_df.
example_copy = example_df.copy(deep=True)

In [5]:
# Split every row whose CAS field lists several comma-separated numbers into one
# record per CAS. Those rows are removed from example_copy and their expanded
# records are collected in new_df.
new_data = {'Sustancia': [], 'Listado individual': [], 'N.o CAS': []}
multi_cas_labels = []
for label, record in example_df.iterrows():
    substance = record['Sustancia']
    listed_name = record['Listado individual']
    cas_field = record['N.o CAS']
    if ',' not in cas_field:
        continue  # single CAS: the row stays as it is in example_copy
    single_cas = [piece.strip() for piece in cas_field.split(',')]
    new_data['Sustancia'].extend([substance] * len(single_cas))
    new_data['Listado individual'].extend([listed_name] * len(single_cas))
    new_data['N.o CAS'].extend(single_cas)
    multi_cas_labels.append(label)
# Remove all expanded rows in one call once the loop has finished.
example_copy.drop(index=multi_cas_labels, inplace=True)
new_df = pd.DataFrame(new_data)

In [6]:
# Stack the single-CAS rows and the expanded rows, renumber the index from 0
# and give the CAS column the name used by the annotations table.
example_conc = (
    pd.concat([example_copy, new_df])
    .reset_index(drop=True)
    .rename(columns={'N.o CAS': 'CAS'})
)

#### Connection to CR
I connect to CR and extract annotations for each CAS

In [7]:
# Instantiate the CR helper and retrieve its per-CAS annotations.
# NOTE(review): presumably a DataFrame with a 'CAS' column, since it is merged
# on 'CAS' further down — confirm against CR.get_annotations_per_CAS.
ann_df = CR.CR().get_annotations_per_CAS()

#### Preparing dataframes with annotations
Here I create a new dataframe with the CR annotations for each CAS in the LSR list. Using pd.merge with how='inner' keeps only the CAS that are present in both dataframes, which avoids substances with no annotations.

In [8]:
# Inner join on CAS: only substances that have at least one CR annotation are kept.
example_annotations = pd.merge(example_conc, ann_df, how='inner', on='CAS')

In [9]:
# Preview up to 10 random rows. The sample size is capped at the frame length
# (a plain sample(10) raises ValueError on shorter frames) and the seed is fixed
# so that re-running the notebook shows the same rows.
example_annotations.sample(n=min(10, len(example_annotations)), random_state=0)

Unnamed: 0,Sustancia,Listado individual,CAS,source_name,original_annotation,annotation,general,category
515,,"2,4-Toluilendiamine",95-80-7,CLP Notification,STOT RE 2,STOT RE 2,Specific Target Organ Toxicity,Clinical
67,,4-Chloro-o-toluidine,95-69-2,CLP Notification,Eye Irrit. 2,Eye Irrit. 2,Eye,Clinical
437,,"4,4’-Oxydianiline",101-80-4,REACH Registration,Acute Tox. 3,Acute Tox. 3,Acute Toxicity,Other
72,,4-Chloro-o-toluidine,95-69-2,CLP Notification,Skin Irrit. 2,Skin Irrit. 2,Skin,Clinical
175,,2-Amino-4-nitrotoluene,99-55-8,REACH Registration,Carc. 2,Carc. 2,Carcinogen,CMR
375,,"3,3’-Dimethyl-4,4’-diaminodiphenilmethane",838-88-0,REACH Registration,Skin Sens. 1,Skin Sens. 1,,Sensitiser
432,,"4,4’-Oxydianiline",101-80-4,REACH Annex VI,Carc. 1B,Carc. 1B,Carcinogen,CMR
361,,"3,3’-Dimethyl-4,4’-diaminodiphenilmethane",838-88-0,CLP Notification,Aquatic Chronic 2,Aquatic Chronic 2,Aquatic Chronic,Aquatic
217,,"2,4-Diaminoanisole",615-05-4,CLP Notification,Muta. 2,Muta. 2,Mutagen,CMR
51,,Benzidine,92-87-5,REACH Annex III,Suspected mutagen,Suspected mutagen,Mutagen,CMR


#### Add dataframe to devel CII
Add new substances to CII or update the ones that are already there. I use the original_annotation column because it includes EUH annotations, whereas the annotation column doesn't.

In [10]:
# Open the connection to the development CII database.
# - UpdateDB is reached through the Update_CII module name, so rebinding
#   `updater` to the instance below does not break a re-run of this cell.
# - The password is read from the CII_DB_PASSWORD environment variable, with an
#   interactive prompt as fallback, instead of being stored in the notebook.
db_password = os.environ.get('CII_DB_PASSWORD') or getpass('Password for postgres@gea: ')
updater = Update_CII.UpdateDB(host='gea', dbname='cii_v3_test_refactored', user='postgres', password=db_password)

Add all the information from the dataframe: substance, chemical identifiers, structure, sources, annotations and how they're related (regulations table)

In [11]:
# Full load: every keyword names the dataframe column (or identifier type)
# that the updater should read for that piece of information.
updater.add_all_information_from_dataframe(
    dataframe=example_annotations,
    class_name_field='Sustancia',
    preferred_name_field='Listado individual',
    chem_id_field='CAS',
    chem_id_type='casr_number',
    sourceName_field='source_name',
    regulation_field='original_annotation',
)

Add only substances by name

In [12]:
# Substances only, identified by their class name and preferred name columns.
updater.add_substances_from_dataframe(
    dataframe=example_annotations,
    class_name_field='Sustancia',
    preferred_name_field='Listado individual',
)

Add chemical identifiers and substances by name

In [13]:
# Substances plus their chemical identifier, taken from the CAS column.
updater.add_chemical_identifier_from_dataframe(
    dataframe=example_annotations,
    class_name_field='Sustancia',
    preferred_name_field='Listado individual',
    chem_id_field='CAS',
    chem_id_type='casr_number',
)

Add only structures (SMILES) if present in the dataframe. If not, it generates the structure from the CAS and stores it

In [14]:
# Fetch the substances that already have a structure and keep the first ten
# entries as a small working example.
substances_with_structure = updater.get_substances_with_structure()
small_strucs = substances_with_structure[:10]

In [15]:
# Show the subset taken from get_substances_with_structure() (last expression is displayed).
small_strucs

Unnamed: 0,class_name_curated,preferred_name_curated,name,structure,structure_curated,substance_type_id
0,Formaldehyde,Methanediol,463-57-0,OCO,OCO,1.0
1,Formaldehyde,Formaldehyde,50-00-0,C=O,C=O,1.0
2,"N,N-Dimethylformamide","N,N-Dimethylformamide",68-12-2,CN(C)C=O,CN(C)C=O,1.0
3,,Quinoline,91-22-5,c1ccc2ncccc2c1,c1ccc2ncccc2c1,1.0
4,N-nitrosamines,N-Nitrosodimethylamine,62-75-9,CN(C)N=O,CN(C)N=O,1.0
5,N-nitrosamines,N-Nitrosodiethylamine,55-18-5,CCN(CC)N=O,CCN(CC)N=O,1.0
6,N-nitrosamines,N-Nitrosodipropylamine,621-64-7,CCCN(CCC)N=O,CCCN(CCC)N=O,1.0
7,N-nitrosamines,N-Nitrosodibutylamine,924-16-3,CCCCN(CCCC)N=O,CCCCN(CCCC)N=O,1.0
8,N-nitrosamines,N-Nitrosopiperidine,100-75-4,O=NN1CCCCC1,O=NN1CCCCC1,1.0
9,N-nitrosamines,N-Nitrosopyrrolidine,930-55-2,O=NN1CCCC1,O=NN1CCCC1,1.0


In [17]:
# Structures only: the 'name' column is passed as the CAS identifier and the
# 'structure' column as the SMILES field.
updater.add_structure_from_dataframe(
    dataframe=small_strucs,
    class_name_field='class_name_curated',
    preferred_name_field='preferred_name_curated',
    chem_id_field='name',
    chem_id_type='casr_number',
    smiles_field='structure',
)

Add only the sources present in the dataframe

In [18]:
# Sources only, read from the source_name column.
updater.add_source_from_dataframe(
    dataframe=example_annotations,
    sourceName_field='source_name',
)

Add only the annotations (hazards etc...) present in the dataframe

In [19]:
# Annotations only, read from the original_annotation column.
updater.add_annotation_from_dataframe(
    dataframe=example_annotations,
    annotation_field='original_annotation',
)