In [1]:
import os
from getpass import getpass

import numpy as np
import pandas as pd

from compoundDB import inputtools as it
from UpdateDB import Checkpoint as cp
from UpdateDB import CR
from UpdateDB import Update_CII
from UpdateDB import Update_CII as updater


*** CompoundDB module found. Will check the synonyms table to resolve CAS. ***



#### Load list into pandas

In [2]:
# Read the raw substance list; a comma is already the default field separator.
example_df = pd.read_csv('example.csv')

#### Checkpoint from CII release version
I make a checkpoint from the stable version just in case I need to roll back the new additions in my database

In [3]:
# Open a checkpoint handle on the CII release database.
# The password is no longer stored in the notebook: it is read from the
# CII_RELEASE_DB_PASSWORD environment variable, with an interactive prompt as fallback.
# NOTE(review): assumes Checkpoint takes (host, dbname, user, password) positionally,
# mirroring the keyword arguments passed to UpdateDB later in this notebook - confirm.
release_db_password = os.environ.get('CII_RELEASE_DB_PASSWORD') or getpass('CII release DB password: ')
release_checkpoint = cp.Checkpoint('gea', 'CII', 'postgres', release_db_password)

In [4]:
# Store the result of get_max_id_for_each_table() as the rollback reference.
# NOTE(review): presumably the highest id per table in the release DB - confirm against Checkpoint.
tables_checkpoint = release_checkpoint.get_max_id_for_each_table()

#### Processing example dataframe
First we get CAS and then we generate a list with unique CAS. Finally we'll get the annotations for each CAS from CR database.
It is important to remember that each input list must be curated manually so that it ends up in the same format as the example_annotations dataframe obtained at the end.

In [5]:
# Normalise the CAS column to a plain string in which multiple CAS numbers are
# separated by ", " (the next step splits on ',').
# The '/' separator is rewritten directly instead of splitting into lists and
# cleaning up the list's repr, which depended on repr formatting and mangled
# values containing quotes or backslashes.
# astype(str) comes first so that missing values become the string 'nan', as before,
# and later membership tests on the column keep working.
example_df['N.o CAS'] = (
    example_df['N.o CAS']
    .astype(str)
    .str.replace('/', ', ', regex=False)
    .str.strip()
    .str.replace(' and several other', '', regex=False)
)

In [6]:
# Independent copy of the cleaned frame, so later row removals do not alter example_df.
example_copy = example_df.copy()

In [7]:
# Expand rows that list several comma-separated CAS numbers into one row per CAS.
# Both outputs (example_copy and new_df) are rebuilt from example_df on every run,
# so the cell can be re-executed safely; dropping rows from example_copy in place
# raised KeyError on a second run because the labels had already been removed.
new_data = {'Sustancia': [], 'Listado individual': [], 'N.o CAS': []}
multi_cas_index = []  # index labels of the rows that are replaced by their expanded form
for i, row in example_df.iterrows():
    cas_row = row['N.o CAS']
    if ',' not in cas_row:
        continue
    multi_cas_index.append(i)
    for cas in cas_row.split(','):
        new_data['Sustancia'].append(row['Sustancia'])
        new_data['Listado individual'].append(row['Listado individual'])
        new_data['N.o CAS'].append(cas.strip())

# Single-CAS rows are kept unchanged; the expanded rows are collected in new_df.
example_copy = example_df.drop(multi_cas_index, axis=0)
new_df = pd.DataFrame(new_data)

In [8]:
# Stack the single-CAS rows and the expanded multi-CAS rows into one frame.
# reset_index(drop=True) discards the old labels directly; the previous
# reset_index() + drop('index') pair would have removed a genuine data column
# named 'index' if the input ever had one.
example_conc = (
    pd.concat([example_copy, new_df])
    .reset_index(drop=True)
    .rename(columns={'N.o CAS': 'CAS'})
)

#### Connection to CR
I connect to CR and extract annotations for each CAS

In [9]:
# Instantiate the CR helper and fetch its annotations-per-CAS table.
# NOTE(review): the merge below expects ann_df to carry a 'CAS' column.
ann_df = CR.CR().get_annotations_per_CAS()

#### Preparing dataframes with annotations
Here I create a new dataframe with the CR annotations for each CAS in the LSR dataframe (example_conc here). Using pd.merge with how='inner' keeps only the CAS numbers that have at least one annotation in CR, which avoids ending up with substances that have no annotations.

In [10]:
# Inner join on CAS: only CAS numbers present in both frames survive.
example_annotations = pd.merge(example_conc, ann_df, how='inner', on='CAS')

In [11]:
# Preview up to 10 annotated rows. The fixed seed makes the preview reproducible
# across runs, and capping n avoids a ValueError when fewer than 10 rows exist.
example_annotations.sample(n=min(10, len(example_annotations)), random_state=0)

Unnamed: 0,Sustancia,Listado individual,CAS,source_name,original_annotation,annotation,general,category
349,,"3,3’-Dimethylbenzidine",119-93-7,REACH Annex III,Suspected persistent in the environment,Persistent in the environment,Persistent,PBT
370,,"3,3’-Dimethyl-4,4’-diaminodiphenilmethane",838-88-0,REACH Annex VI,Skin Sens. 1,Skin Sens. 1,,Sensitiser
248,,"4,4’-Diaminodiphenylmethane",101-77-9,CLP Notification,Carc. 1B,Carc. 1B,Carcinogen,CMR
237,,"2,4-Diaminoanisole",615-05-4,REACH Annex VI,Acute Tox. 4 *,Acute Tox. 4,Acute Toxicity,Other
314,,"3,3’-Dimetoxybenzidine",119-90-4,REACH Annex III,Harmonised classification for carcinogenicity,Harmonised classification for carcinogenicity,Carcinogen,CMR
505,,"2,4-Toluilendiamine",95-80-7,CLP Notification,Acute Tox. 4,Acute Tox. 4,Acute Toxicity,Other
353,,"3,3’-Dimethylbenzidine",119-93-7,REACH Annex VI,Acute Tox. 4 *,Acute Tox. 4,Acute Toxicity,Other
274,,"4,4’-Diaminodiphenylmethane",101-77-9,SVHC,PBT,PBT,PBT,PBT
271,,"4,4’-Diaminodiphenylmethane",101-77-9,REACH Registration,Skin Sens. 1,Skin Sens. 1,,Sensitiser
426,,"4,4’-Oxydianiline",101-80-4,CLP Notification,STOT RE 2,STOT RE 2,Specific Target Organ Toxicity,Clinical


#### Add dataframe to devel CII
Add new substances to CII or update the ones that are already there. I use original_annotation because it includes the EUH annotations, whereas the annotation column does not.

In [12]:
# Create the updater for the development CII database.
# The instance is built from the module's unaliased name (Update_CII): the previous
# `updater = updater.UpdateDB(...)` replaced the imported module with the instance,
# so re-running the cell looked up UpdateDB on the instance instead of the module.
# The result is still bound to `updater`, which the following cells use.
# The password comes from CII_DEVEL_DB_PASSWORD (or a prompt) rather than the notebook.
devel_db_password = os.environ.get('CII_DEVEL_DB_PASSWORD') or getpass('cii_v0 DB password: ')
updater = Update_CII.UpdateDB(host='localhost', dbname='cii_v0', user='postgres',
                              password=devel_db_password)

In [13]:
# updater.add_all_information_from_dataframe(dataframe=example_annotations,
#                                     class_name_field = 'Sustancia',
#                                     preferred_name_field = 'Listado individual',
#                                     chem_id_field='CAS',
#                                     chem_id_type='casr_number',
#                                     sourceName_field='source_name',
#                                     regulation_field='original_annotation')

In [14]:
# updater.add_substances_from_dataframe(dataframe=example_annotations,
#                                     class_name_field = 'Sustancia',
#                                     preferred_name_field = 'Listado individual')

In [15]:
# updater.add_chemical_identifier_from_dataframe(dataframe=example_annotations,
#                                             class_name_field = 'Sustancia',
#                                             preferred_name_field = 'Listado individual',
#                                             chem_id_field='CAS',
#                                             chem_id_type='casr_number')

In [16]:
# small_strucs = updater.get_substances_with_structure()[:10]

In [17]:
# updater.add_structure_from_dataframe(dataframe=small_strucs,
#                                     class_name_field = 'class_name',
#                                     preferred_name_field = 'preferred_name',
#                                     chem_id_field='name',
#                                     chem_id_type='casr_number',
#                                     smiles_field='structure')

In [18]:
# updater.add_source_from_dataframe(dataframe=example_annotations,
#                                  sourceName_field='source_name')

In [19]:
# Push the annotations to the devel CII database, reading them from the
# 'original_annotation' column (per the notes above, it includes the EUH annotations).
# NOTE(review): the call prints one "string <n>" line per processed item - looks like
# leftover debug output from the library; consider silencing it before sharing.
updater.add_annotation_from_dataframe(dataframe=example_annotations,
                                     annotation_field='original_annotation')

string 1201
string 160
string 332
string 165
string 386
string 212
string 1117
string 890
string 1344
string 333
string 333
string 49
string 49
string 1350
string 1350
string 1351
string 1351
string 1358
string 1358
string 1359
string 1359
string 1360
string 1360
string 147
string 160
string 480
string 1361
string 428
string 480
string 1361
string 1201
string 1129
string 499
string 160
string 1362
string 1344
string 333
string 333
string 338
string 338
string 49
string 49
string 1363
string 1363
string 1350
string 1350
string 1351
string 1351
string 1358
string 1358
string 1359
string 1359
string 1353
string 1353
string 147
string 1129
string 499
string 160
string 480
string 1361
string 381
string 1201
string 1129
string 499
string 332
string 165
string 212
string 1288
string 789
string 1117
string 890
string 333
string 333
string 338
string 338
string 49
string 49
string 1364
string 1364
string 1363
string 1363
string 1350
string 1350
string 1351
string 1351
string 1358
string 1358
st