In [1]:
import os

import numpy as np
import pandas as pd

from compoundDB import inputtools as it
from UpdateDB import Checkpoint as cp
from UpdateDB import CR
from UpdateDB import Update_CII
from UpdateDB import Update_CII as updater


*** CompoundDB module found. Will check the synonyms table to resolve CAS. ***



#### Load list into pandas

In [2]:
# Load the raw substance list; read_csv already splits on commas by default.
example_df = pd.read_csv('example.csv')

#### Checkpoint from CII release version
I create a checkpoint of the stable release version so that I can roll back the new additions to my database if needed.

In [3]:
# Positional settings handed to Checkpoint for the released CII database.
# NOTE(review): the last two values look like a database user and password --
# confirm, and load them from the environment instead of committing them here.
release_db_settings = ('gea', 'CII', 'postgres', 'DBAdmin')
release_checkpoint = cp.Checkpoint(*release_db_settings)

In [4]:
# Keep the result of get_max_id_for_each_table() for the release database.
# Presumably the highest id per table, usable as a rollback reference -- TODO confirm against Checkpoint.
tables_checkpoint = release_checkpoint.get_max_id_for_each_table()

#### Processing example dataframe
First we extract the CAS numbers and then generate a list with unique CAS. Finally, we get the annotations for each CAS from the CR database.
It is important to remember that each input has to be curated manually so that it ends up in the same format as the final example_annotations dataframe.

In [5]:
# Normalise the CAS column to plain text in which several CAS numbers are
# separated by commas. The source list uses '/' between numbers, so the slash
# is swapped for ', ' directly on the string. (The previous version split the
# text into a list and parsed the list's repr back into text, which escapes
# characters such as non-breaking spaces and backslashes and removes every
# apostrophe.)
# astype(str) is kept on purpose: missing entries still become the text 'nan',
# so every value in the column is a string for the cells that follow.
example_df['N.o CAS'] = (
    example_df['N.o CAS']
    .astype(str)
    .str.replace('/', ', ', regex=False)
    .str.strip()
    # Drop the free-text suffix that accompanies some entries.
    .str.replace(' and several other', '', regex=False)
)

In [6]:
# Independent copy of the cleaned frame; rows are removed from this copy later
# while example_df itself is left intact.
example_copy = example_df.copy()

In [7]:
# Expand every row whose CAS field lists several numbers (comma separated)
# into one row per CAS number.
#   new_df       -> the expanded rows (only the three columns listed below)
#   example_copy -> the rows holding a single CAS number, all columns kept
# Both results are derived from example_df alone, so the cell can be re-run.
# The earlier loop dropped rows from example_copy in place and raised KeyError
# on a second run because those index labels were already gone.
split_columns = ['Sustancia', 'Listado individual', 'N.o CAS']
has_several_cas = (
    example_df['N.o CAS']
    .map(lambda cas_field: ',' in cas_field)
    .astype(bool)  # keeps the mask boolean even when the frame is empty
)

expanded_rows = [
    (substance, listed_name, cas.strip())
    for substance, listed_name, cas_field in example_df.loc[
        has_several_cas, split_columns
    ].itertuples(index=False, name=None)
    for cas in cas_field.split(',')
]
new_df = pd.DataFrame(expanded_rows, columns=split_columns)
example_copy = example_df.loc[~has_several_cas].copy()

In [8]:
# Stack the single-CAS rows and the expanded rows, renumber the index from 0
# and rename the CAS column to 'CAS', the key used for the merge further down.
stacked_rows = pd.concat([example_copy, new_df])
example_conc = (
    stacked_rows
    .reset_index(drop=True)
    .rename(columns={'N.o CAS': 'CAS'})
)

#### Connection to CR
I connect to CR and extract annotations for each CAS

In [None]:
# Instantiate the CR helper and fetch its per-CAS annotations; the result is
# merged with the substance list on the 'CAS' column further down.
ann_df = CR.CR().get_annotations_per_CAS()

#### Preparing dataframes with annotations
Here I create a new dataframe with the CR annotations for each CAS in the LSR dataframe. Using pd.merge with how='inner' keeps only the CAS numbers that are present in both dataframes, so substances without annotations are left out.

In [None]:
# Inner join on CAS: only CAS numbers present in both frames are kept.
example_annotations = pd.merge(example_conc, ann_df, how='inner', on='CAS')

In [None]:
# Preview up to 10 random rows. The size is capped at the frame length because
# sample() raises ValueError when asked for more rows than exist, and the fixed
# seed makes the displayed preview reproducible between runs.
preview_size = min(10, len(example_annotations))
example_annotations.sample(preview_size, random_state=42)

#### Add dataframe to devel CII
Add new substances to CII or update the ones that are already there. I use the original_annotation column because it includes the EUH annotations, whereas the annotation column does not.

In [None]:
# `updater` is the alias the import cell gives to the Update_CII module. Binding
# the instance to that same name meant that running this cell a second time
# looked up `UpdateDB` on the instance instead of the module and failed. Going
# through the un-aliased module keeps the cell re-runnable, while later cells
# can keep using `updater` as the connected instance.
updater = Update_CII.UpdateDB(
    host='localhost',
    dbname='cii_v0',
    user='postgres',
    # The password can be supplied through CII_DEVEL_DB_PASSWORD.
    # FIXME: the literal fallback is kept only for backward compatibility;
    # remove it (and rotate the password) once the variable is set everywhere.
    password=os.environ.get('CII_DEVEL_DB_PASSWORD', 'PSGAdmin'),
)

In [None]:
# updater.add_all_information_from_dataframe(dataframe=example_annotations,
#                                     class_name_field = 'Sustancia',
#                                     preferred_name_field = 'Listado individual',
#                                     chem_id_field='CAS',
#                                     chem_id_type='casr_number',
#                                     sourceName_field='source_name',
#                                     regulation_field='original_annotation')

In [None]:
# updater.add_substances_from_dataframe(dataframe=example_annotations,
#                                     class_name_field = 'Sustancia',
#                                     preferred_name_field = 'Listado individual')

In [None]:
# updater.add_chemical_identifier_from_dataframe(dataframe=example_annotations,
#                                             class_name_field = 'Sustancia',
#                                             preferred_name_field = 'Listado individual',
#                                             chem_id_field='CAS',
#                                             chem_id_type='casr_number')

In [None]:
# small_strucs = updater.get_substances_with_structure()[:10]

In [None]:
# updater.add_structure_from_dataframe(dataframe=small_strucs,
#                                     class_name_field = 'class_name',
#                                     preferred_name_field = 'preferred_name',
#                                     chem_id_field='name',
#                                     chem_id_type='casr_number',
#                                     smiles_field='structure')

In [None]:
# Pass the annotated frame to the updater, naming 'source_name' as the column
# that holds the source. Presumably this writes the sources into the devel CII
# database -- TODO confirm against UpdateDB.add_source_from_dataframe.
updater.add_source_from_dataframe(
    dataframe=example_annotations,
    sourceName_field='source_name',
)

In [None]:
# updater.add_annotation_from_dataframe()