In [1]:
import os
from getpass import getpass

import numpy as np
import pandas as pd

from compoundDB import inputtools as it
from UpdateDB import Update_CII
from UpdateDB import Update_CII as updater


*** CompoundDB module found. Will check the synonyms table to resolve CAS. ***



#### Load list into pandas

In [2]:
# Read the raw substance list (comma-separated, pandas' default delimiter).
example_df = pd.read_csv('example.csv')

#### Processing example dataframe
First we extract the CAS numbers and then generate a list of unique CAS. Finally, we get the annotations for each CAS from the CR database.
It is important to remember that each input must be curated manually so that it ends up in the same format as the final `example_annotations` dataframe.

In [3]:
# Normalise the raw CAS field in one pass: split on '/', render the resulting
# list as text, peel off the list brackets and quotes, trim whitespace and
# remove the ' and several other' note.
example_df.loc[:, 'N.o CAS'] = (
    example_df['N.o CAS']
    .str.split('/')
    .astype(str)
    .str.strip('[')
    .str.strip(']')
    .str.replace("'", "", regex=False)
    .str.strip()
    .str.replace(' and several other', '', regex=False)
)

In [4]:
# Independent (deep) copy, so rows can be removed later without touching example_df.
example_copy = example_df.copy(deep=True)

In [5]:
# Split every row whose CAS field holds several comma-separated numbers into
# one row per CAS number (`new_df`), and remove those multi-CAS rows from
# `example_copy` so they are not duplicated when both frames are concatenated.
#
# Compared with a row-by-row loop that drops rows in place, this version is
# vectorised and can be re-run safely: `new_df` is always rebuilt from
# `example_df`, and already-removed rows are ignored when dropping.

# Only these columns are carried over to the expanded rows.
expanded_columns = ['Sustancia', 'Listado individual', 'N.o CAS']

# na=False: a missing CAS value is treated as "no comma" instead of raising.
has_multiple_cas = example_df['N.o CAS'].str.contains(',', regex=False, na=False)
multi_cas_rows = example_df.loc[has_multiple_cas, expanded_columns]

new_df = (
    multi_cas_rows
    .assign(**{'N.o CAS': multi_cas_rows['N.o CAS'].str.split(',')})
    .explode('N.o CAS')          # one row per individual CAS number
    .reset_index(drop=True)
)
# Remove the whitespace left around each number by the ', ' separator.
new_df['N.o CAS'] = new_df['N.o CAS'].str.strip()

# errors='ignore' keeps the cell idempotent when the rows were already dropped.
example_copy = example_copy.drop(index=multi_cas_rows.index, errors='ignore')

In [6]:
# Stack the single-CAS rows and the expanded multi-CAS rows, renumber the
# index from zero and give the CAS column its final name.
example_conc = (
    pd.concat([example_copy, new_df])
    .reset_index(drop=True)
    .rename(columns={'N.o CAS': 'CAS'})
)

#### Connection to CR
Connect to CR and extract the annotations for each CAS.

In [7]:
# Open the connection to the CR database on host 'gea'.
# The password is no longer stored in the notebook: it is read from the
# CR_DB_PASSWORD environment variable (needed for unattended runs) and,
# if that is unset or empty, requested interactively.
cr_con = it.openconnection(
    host='gea',
    password=os.environ.get('CR_DB_PASSWORD') or getpass('CR database password: '),
)

In [8]:
# Query returning, for every substance synonym whose type contains 'CAS',
# the synonym name (aliased as reg_number), the source name and `latest`
# flag, and the annotation columns, ordered by synonym name.
cr_annotations_sql = """SELECT synonym.type, synonym.name as reg_number, source.name as source_name, 
subs_ann.original_annotation, annotation.annotation, annotation.general, annotation.category, source.latest
FROM substance sub
left join synonym on synonym.subsid = sub.id
left join source on source.id = sub.sourceid
left join subs_ann on subs_ann.subsid = sub.id
left join annotation on annotation.id = subs_ann.annid
where synonym.type like '%CAS%'
order by synonym.name ASC"""

cr_db = pd.read_sql_query(sql=cr_annotations_sql, con=cr_con)

In [9]:
# Registry-number values that are only dash/underscore placeholders.
placeholder_reg_numbers = ['-','_','---','—']

# De-duplicate, discard placeholder registry numbers, sort, and rename the
# registry-number column to 'CAS' so it matches the substance dataframe.
cr_db = (
    cr_db
    .drop_duplicates()
    .loc[lambda frame: ~frame['reg_number'].isin(placeholder_reg_numbers)]
    .sort_values(by=['reg_number','source_name','original_annotation','annotation'])
    .rename(columns={'reg_number':'CAS'})
)

#### Preparing dataframes with annotations
Here I create a new dataframe with the CR annotations for each CAS in LSR. Using `pd.merge` with `how='left'` keeps every CAS from the LSR dataframe, including those that have no annotation in CR.

In [10]:
# Left join on CAS: every row of example_conc is kept, with the CR columns
# left empty where CR has no matching CAS.
example_annotations = pd.merge(example_conc, cr_db, how='left', on='CAS')

In [11]:
# Show ten randomly chosen rows as a quick sanity check of the merge.
example_annotations.sample(n=10)

Unnamed: 0,Sustancia,Listado individual,CAS,type,source_name,original_annotation,annotation,general,category,latest
11,Arilaminas,4-Aminobiphenyl,92-67-1,CAS,REACH Annex III,Harmonised classification for acute toxicity,Harmonised classification for acute toxicity,,Other,True
426,,"3,3’-Dichlorobenzidine",91-94-1,CAS No,REACH Annex VI,"Carc. 1B, Acute Tox. 4 *, Skin Sens. 1, Aquati...","Carc. 1B, Acute Tox. 4 *, Skin Sens. 1, Aquati...",,,True
135,,2-Naphthylamine,91-59-8,CAS,CLP Notification,Acute Tox. 4,Acute Tox. 4,Acute Toxicity,Other,False
305,,"2,4-Diaminoanisole",615-05-4,CAS,REACH Annex III,Harmonised classification for acute toxicity,Harmonised classification for acute toxicity,,Other,True
179,,o-Aminoazotoluene,97-56-3,CAS,CLP Notification,Aquatic Chronic 1,Aquatic Chronic 1,Aquatic Chronic,Aquatic,False
143,,2-Naphthylamine,91-59-8,CAS,REACH Annex III,Harmonised classification for acute toxicity,Acute toxicity,Acute Toxicity,Other,True
716,,o-Toluidine,95-53-4,CAS,CLP Notification,Skin Sens. 1,Skin Sens. 1,,Sensitiser,False
576,,p-Cresidine,120-71-8,CAS,SVHC,PBT,PBT,PBT,PBT,False
588,,"4,4’-Methylen-bis-(2-chloroaniline)",101-14-4,CAS,REACH Annex VI,Acute Tox. 4 *,Acute Tox. 4,Acute Toxicity,Other,False
340,,"4,4’-Diaminodiphenylmethane",101-77-9,CAS,CLP Notification,Carc. 1B,Carc. 1B,Carcinogen,CMR,False


#### Add dataframe to devel CII
Add new substances to CII or update the ones that are already there

In [12]:
# Create the object used to write into the 'inventory' database on localhost.
#
# The class is reached through the unaliased `Update_CII` module import rather
# than through `updater`: this cell rebinds `updater` to the instance, so going
# through the alias would fail when the cell is executed a second time.
#
# The password is no longer stored in the notebook: it is read from the
# CII_DB_PASSWORD environment variable (needed for unattended runs) and,
# if that is unset or empty, requested interactively.
updater = Update_CII.UpdateDB(
    host='localhost',
    dbname='inventory',
    user='postgres',
    password=os.environ.get('CII_DB_PASSWORD') or getpass('CII database password: '),
)

In [13]:
# Keyword arguments naming the example_annotations columns (and the
# identifier type) that are handed to the updater.
field_mapping = {
    'class_name_field': 'Sustancia',
    'preferred_name_field': 'Listado individual',
    'chem_id_field': 'CAS',
    'chem_id_type': 'casr_number',
    'sourceName_field': 'source_name',
    'regulation_field': 'annotation',
}

updater.add_all_information_from_dataframe(dataframe=example_annotations, **field_mapping)

In [14]:
# NOTE(review): every call below is commented out, so this cell does nothing.
# Presumably these are the individual steps that can be run instead of
# add_all_information_from_dataframe — confirm, or delete this cell.
# updater.add_substances_from_dataframe()
# updater.add_chemical_identifier_from_dataframe()
# updater.add_structure_from_dataframe()
# updater.add_source_from_dataframe()
# updater.add_annotation_from_dataframe()