In [1]:
import os
from getpass import getpass

import numpy as np
import pandas as pd

from compoundDB import inputtools as it
from UpdateDB import Update_CII
from UpdateDB import Update_CII as updater


*** CompoundDB module found. Will check the synonyms table to resolve CAS. ***



#### Load list into pandas

In [2]:
# Read the comma-separated example list into a dataframe.
example_df = pd.read_csv(filepath_or_buffer='example.csv', delimiter=',')

#### Processing example dataframe
First we get CAS and then we generate a list with unique CAS. Finally we'll get the annotations for each CAS from CR database.
It is important to remember that each input must be curated manually so that it ends up in the same format as the final example_annotations dataframe.

In [3]:
# Normalise the raw 'N.o CAS' field into a plain comma-separated string:
#   * '/' separators become ', ' so every CAS in a cell is comma-delimited,
#   * surrounding whitespace is removed,
#   * the trailing phrase ' and several other' is dropped.
# The values are edited directly as text instead of being split into a list,
# converted to the list's repr and then scraped of '[', ']' and quote characters;
# that round trip altered any value that itself contained quotes or brackets.
# astype(str) is kept so missing entries still become the string 'nan', which the
# later "',' in cas_row" check relies on (it needs every value to be a str).
cas_text = example_df['N.o CAS'].astype(str)
example_df['N.o CAS'] = (
    cas_text
    .str.replace('/', ', ', regex=False)
    .str.strip()
    .str.replace(' and several other', '', regex=False)
)

In [4]:
# Independent (deep) copy, so rows can be removed without touching example_df.
example_copy = example_df.copy(deep=True)

In [5]:
# Expand rows whose 'N.o CAS' holds several comma-separated CAS numbers.
# Results:
#   new_df       -- one row per individual CAS taken from the multi-CAS rows
#                   (columns: 'Sustancia', 'Listado individual', 'N.o CAS')
#   example_copy -- example_df without the multi-CAS rows
# Both are derived from example_df through a boolean mask instead of dropping
# rows from example_copy inside an iterrows loop; the in-place drop raised
# KeyError when the cell was executed a second time.
cas_columns = ['Sustancia', 'Listado individual', 'N.o CAS']

# na=False: a non-text entry is treated as a single-CAS row instead of failing.
has_several_cas = example_df['N.o CAS'].str.contains(',', regex=False, na=False)
multi_cas_rows = example_df.loc[has_several_cas, cas_columns]

new_df = pd.DataFrame(
    [
        (substance, preferred_name, cas.strip())
        for substance, preferred_name, cas_field
        in multi_cas_rows.itertuples(index=False, name=None)
        for cas in cas_field.split(',')
    ],
    columns=cas_columns,
)
example_copy = example_df.loc[~has_several_cas].copy()

In [6]:
# Stack the single-CAS rows and the expanded multi-CAS rows, renumber the rows
# from zero (discarding the old index) and expose the CAS column as 'CAS'.
example_conc = (
    pd.concat([example_copy, new_df])
    .reset_index()
    .drop(columns='index')
    .rename(columns={'N.o CAS': 'CAS'})
)

#### Connection to CR
I connect to CR and extract annotations for each CAS

In [7]:
# Open the connection to the CR database on host 'gea'.
# The password is taken from the CR_DB_PASSWORD environment variable and, when
# that is not set, requested interactively, so that no credential is stored in
# the notebook itself.
cr_password = os.environ.get('CR_DB_PASSWORD') or getpass('CR database password: ')
cr_con = it.openconnection(host='gea', password=cr_password)

In [8]:
# Query text: substances joined to their synonyms, source and annotations,
# restricted to synonyms whose type contains 'CAS' and ordered by synonym name.
cas_annotations_sql = """SELECT synonym.type, synonym.name as reg_number, source.name as source_name, 
subs_ann.original_annotation, annotation.annotation, annotation.general, annotation.category, source.latest
FROM substance sub
left join synonym on synonym.subsid = sub.id
left join source on source.id = sub.sourceid
left join subs_ann on subs_ann.subsid = sub.id
left join annotation on annotation.id = subs_ann.annid
where synonym.type like '%CAS%'
order by synonym.name ASC"""

# Run the query over the open CR connection and load the result as a dataframe.
cr_db = pd.read_sql_query(sql=cas_annotations_sql, con=cr_con)

In [9]:
# Registry-number values that are only dashes/underscores, i.e. not real numbers.
placeholder_reg_numbers = ['-','_','---','—']

# Remove exact duplicate rows, discard placeholder registry numbers, sort for a
# stable ordering and rename the registry-number column to 'CAS' so it can be
# used as the merge key.
cr_db = (
    cr_db
    .drop_duplicates()
    .loc[lambda frame: ~frame['reg_number'].isin(placeholder_reg_numbers)]
    .sort_values(by=['reg_number','source_name','original_annotation','annotation'])
    .rename(columns={'reg_number':'CAS'})
)

#### Preparing dataframes with annotations
Here I create a new dataframe with the CR annotations for each CAS in the LSR list (example_conc in this notebook). Using pd.merge with how='left' keeps every CAS from the LSR dataframe, including those that have no annotation in CR.

In [10]:
# Left join on 'CAS': every row of example_conc is kept, with the CR columns
# left empty where cr_db has no matching CAS.
example_annotations = pd.merge(left=example_conc, right=cr_db, how='left', on='CAS')

In [11]:
# Preview ten randomly chosen rows of the merged dataframe.
example_annotations.sample(n=10)

Unnamed: 0,Sustancia,Listado individual,CAS,type,source_name,original_annotation,annotation,general,category,latest
187,,o-Aminoazotoluene,97-56-3,CAS,REACH Annex III,Harmonised classification for carcinogenicity,Harmonised classification for carcinogenicity,Carcinogen,CMR,True
724,,o-Toluidine,95-53-4,CAS,REACH Annex VI,Aquatic Acute 1,Aquatic Acute 1,Aquatic Acute,Aquatic,True
149,,2-Naphthylamine,91-59-8,CAS,REACH Annex III,Suspected acutely toxic via the oral route,Suspected acutely toxic via the oral route,,Other,True
283,,4-Chloroaniline,106-47-8,CAS,REACH Annex VI,vPvB,vPvB,,vPvB,False
683,,"4,4’-Thiodianiline",139-65-1,CAS,REACH Annex III,Suspected persistent in the environment,Persistent in the environment,Persistent,PBT,True
113,,4-Chloro-o-toluidine,95-69-2,CAS,REACH Annex III,Suspected hazardous to the aquatic environment,Hazardous to the aquatic environment,,Aquatic,True
127,,4-Chloro-o-toluidine,95-69-2,CAS,REACH Annex VI,Carc. 1B,Carc. 1B,Carcinogen,CMR,True
687,,"4,4’-Thiodianiline",139-65-1,CAS,REACH Annex VI,Acute Tox. 4 *,Acute Tox. 4,Acute Toxicity,Other,False
462,,"3,3’-Dimetoxybenzidine",119-90-4,CAS No,REACH Annex VI,"Carc. 1B, Acute Tox. 4 *","Carc. 1B, Acute Tox. 4 *",,,True
60,,Benzidine,92-87-5,CAS,REACH Annex III,Suspected hazardous to the aquatic environment,Hazardous to the aquatic environment,,Aquatic,True


#### Add dataframe to devel CII
Add new substances to CII or update the ones that are already there

In [12]:
# Create the CII updater for the local 'inventory' database.
# The instance is built from the un-aliased Update_CII module import: the
# original line called updater.UpdateDB(...) and assigned the result back to
# `updater`, which replaced the module alias and made a second execution of this
# cell fail with AttributeError. The instance is still bound to `updater`
# because the following cell calls updater.add_substance_from_dataframe.
# The password is taken from the CII_DB_PASSWORD environment variable and, when
# that is not set, requested interactively, so it is not stored in the notebook.
cii_password = os.environ.get('CII_DB_PASSWORD') or getpass('CII database password: ')
updater = Update_CII.UpdateDB(host='localhost', dbname='inventory', user='postgres', password=cii_password)

In [13]:
# Which dataframe column (or identifier type) feeds each updater argument.
cii_field_mapping = {
    'class_name_field': 'Sustancia',
    'preferred_name_field': 'Listado individual',
    'chem_id_field': 'CAS',
    'chem_id_type': 'casr_number',
    'sourceName_field': 'source_name',
    'regulation_field': 'annotation',
}

# Pass the annotated dataframe to the updater together with the mapping above.
updater.add_substance_from_dataframe(dataframe=example_annotations, **cii_field_mapping)