In [1]:
import difflib
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import psycopg2
import random
import seaborn as sns
import USC_dataframe_generation as usc

from importlib import reload

In [2]:
# Connection settings are read from the standard libpq environment variables so
# that the password is not stored in the notebook. Host, database and user fall
# back to the values this notebook has always used. When PGPASSWORD is unset,
# None is passed and the driver is left to resolve the password itself
# (NOTE(review): e.g. through ~/.pgpass -- confirm for the deployed setup).
conn = psycopg2.connect(
    host=os.environ.get('PGHOST', 'localhost'),
    dbname=os.environ.get('PGDATABASE', 'inventory'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD'),
)
curs = conn.cursor()
# Adapt numpy int64 values so they can be passed as query parameters. AsIs is
# taken from the public psycopg2.extensions module rather than the private
# psycopg2._psycopg one.
psycopg2.extensions.register_adapter(np.int64, psycopg2.extensions.AsIs)

In [3]:
# Pull every row of the `heh` table, left-joined with its substance, type,
# classification and name lookup tables, into a DataFrame.
heh_cmd = pd.read_sql_query("""SELECT subs_id, sub.class_name, sub.preferred_name, casr_number, ec_number, index_number, 
                                ht."type", hc.classification, hn."name"
                                FROM heh h
                                left join substance sub on sub.id = h.subs_id
                                left join heh_type ht on ht.id = h.heh_type_id
                                left join heh_classification hc on hc.id = h.heh_classif_id
                                left join heh_name hn on hn.id = h.heh_name_id""", conn)

# Same data with every column forced to the generic `object` dtype.
heh_df = pd.DataFrame(heh_cmd, dtype=object)

In [4]:
# Pull every row of the `regulations` table, left-joined with the substance and
# with each level of the regulation lookup tables, ordered by regulation id.
# NOTE(review): `chem_id` is joined on `chem_type_id` (not on its own id); if
# several chem_id rows share a chem_type_id this multiplies regulation rows --
# confirm against the schema.
reg_cmd = pd.read_sql_query("""SELECT reg.id, reg.subs_id, sub.class_name, sub.preferred_name, rco.country, rt."type", rg.general_regulation_name, 
                            rspec.specific_regulation_name, rsub.subspecific_regulation_name, rsc.special_cases_name, addr.additional_information_name, 
                            cid."name", ct."type", regn.names
                            FROM regulations reg
                            LEFT JOIN substance sub ON sub.id = reg.subs_id
                            left join regulation_country rco on rco.id = reg.reg_country_id
                            left join regulation_type rt on rt.id = reg.reg_type_id
                            left join general_regulation rg on rg.id = reg.gen_reg_id
                            left join specific_regulation rspec on rspec.id = reg.spec_reg_id
                            LEFT JOIN subspecific_regulation rsub ON rsub.id = reg.subspec_reg_id
                            left join special_cases_regulation rsc on rsc.id = reg.special_cases_id
                            left join additional_information_regulation addr on addr.id = reg.additional_information_id
                            LEFT JOIN chem_id cid ON cid.chem_type_id = reg.chem_type_id
                            LEFT JOIN chem_type ct ON ct.id = cid.chem_type_id
                            LEFT JOIN regulation_names regn ON regn.id = reg.regulation_id
                            order by reg.id asc""", conn)

# Same data with every column forced to the generic `object` dtype.
reg_df = pd.DataFrame(reg_cmd, dtype=object)

In [5]:
# Distinct substance ids that have at least one row in the regulations frame.
substances = reg_df['subs_id'].unique()

In [6]:
def _annotation_names(endpoint, excluded):
    """Return the unique heh_df 'name' values of `endpoint` that are not in `excluded`."""
    keep = (heh_df['type'] == endpoint) & (~heh_df['name'].isin(excluded))
    return heh_df.loc[keep, 'name'].unique()


# Names that are skipped for each endpoint when collecting its annotations.
_cmr_excluded = [
    'YES', 'NO', 'No information', 'Pending (1)',
    '(Shall apply from 1 March 2018)',
    '(31/03/2017)',
    'Registry of current Harmonised Classification and Labelling intentions by France',
    'Submitted intention by Netherlands (18/11/2015)',
    'Submitted intention by Norway (03/10/2011)',
    'Submitted intention by Germany (22/02/2011)',
    'Submitted intention by Norway (24/06/2014)',
    'Amendment 2016/1179', 'Pending', 'particle diameter < 1 mm)',
    'Submitted intention by Italy (22/08/2012)',
    'Submitted intention by Sweden (14/11/2014)',
]
_pbt_excluded = ['YES', 'NO', 'No information', 'Pending (2)']
_vpvb_excluded = ['NO', 'Pending (2)', 'YES', 'No information']
_endoc_excluded = ['YES', 'NO', 'Pending (3)', 'No information']
_sens_excluded = [
    'YES', 'NO', 'No information', '(Shall apply from 1 March 2018)',
    'Pending (3)', 'Pending(3)', 'Submitted intention by Slovenia (09/01/2015)',
    'Submitted intention by United Kingdom (01/02/2017)',
    'Amendment 2016/1179', '(05/01/2016)', 'particle diameter < 1 mm)',
    'Proposed future entry in Annex VI of CLP Regulation',
    'Registry of current Harmonised Classification and Labelling intentions by Norway',
    'Removal of', 'SCL ≥ 0.06 %',
]
_other_excluded = ['No information']

cmr_positive = _annotation_names('CMR', _cmr_excluded)
pbt_ann = _annotation_names('PBT', _pbt_excluded)
vpvb_ann = _annotation_names('vPvB', _vpvb_excluded)
endoc_ann = _annotation_names('Endocrine_disruptor', _endoc_excluded)
sens_ann = _annotation_names('Sensitiser', _sens_excluded)
other_ann = _annotation_names('Other', _other_excluded)

In [7]:
# hazards_list.csv is tab-separated despite its extension.
hazards_ghs = pd.read_csv('hazards_list.csv', delimiter='\t')

In [8]:
# One "<hazard class> <category>" label per row, built from the two source columns.
hazards_ghs['Hazard class category'] = [
    ' '.join((hazard_class, category))
    for hazard_class, category in zip(hazards_ghs['Hazard class'], hazards_ghs['Category'])
]

In [9]:
# Distinct "<hazard class> <category>" labels, in order of first appearance.
hclas = hazards_ghs.loc[:, 'Hazard class category'].unique()

In [10]:
# Distinct values of the 'Hazard statements' column, in order of first appearance.
h_statements = hazards_ghs.loc[:, 'Hazard statements'].unique()

In [11]:
# Which general regulations are present in the regulations query result?
reg_cmd.loc[:, 'general_regulation_name'].unique()

array(['reach', 'clp', 'pops', 'wfd', 'pbt_vpvb', 'endocrine_disruptors',
       'chemical_products_ordinance', 'german_commodity_ordinance',
       'chemicals_act_restriction', 'kemi'], dtype=object)

In [12]:
# Names recorded under the 'pbt_vpvb' general regulation, minus the entries
# listed below.
_pbt_vpvb_excluded = [
    'No information', 'Not PBT', 'Not vPvB', 'No information available',
    '(See REACH part-Candidate List section)',
    'Conclusion: not fulfilling PBT & vPvB criteria',
    'Conclusion: deferred the decision on the substance from the list of potential PBT or vPvB substances.',
    'Conclusion: not fulfilling PBT  criteria',
    'Conclusion: not fulfilling POP  criteria',
]
_in_pbt_vpvb = reg_cmd['general_regulation_name'].isin(['pbt_vpvb'])
_is_excluded = reg_cmd['names'].isin(_pbt_vpvb_excluded)
pbt_vpvb_pos = reg_cmd.loc[_in_pbt_vpvb & ~_is_excluded, 'names'].unique()

In [13]:
# Names recorded under the 'endocrine_disruptors' general regulation, minus the
# two excluded entries.
_in_endoc = reg_cmd['general_regulation_name'].isin(['endocrine_disruptors'])
_is_excluded = reg_cmd['names'].isin(['Not included', 'No information available'])
endoc_pos = reg_cmd.loc[_in_endoc & ~_is_excluded, 'names'].unique()

In [14]:
# Names recorded under the 'svhc' specific regulation, minus the entries listed below.
_svhc_excluded = [
    'No information', 'Not included', 'No relevant authorisation',
    'No relevant application for authorisation', 'No relevant information',
    'Status: Not recommended in this round',
]
_in_svhc = reg_cmd['specific_regulation_name'].isin(['svhc'])
_is_excluded = reg_cmd['names'].isin(_svhc_excluded)
svhc_regulations = reg_cmd.loc[_in_svhc & ~_is_excluded, 'names'].unique()

In [15]:
# Hand-written annotation strings that are appended to the lists derived from
# the database in the cells above.
new_cmr = [
    'Toxic for reproduction (Article 57c)', 'Toxic for reproduction (Article 57c):',
    'Toxic for reproduction', 'Scope: ED, CMR', 'Scope: CMR', 'Carcinogenic (Article 57a)',
    'Carcinogenic (Category 1B)', 'Carcinogenic  1B', 'Toxic for reproduction (category 1B)',
    'Mutagenic (Article 57b)', 'Carcinogenic (category 1A)',
    'Mutagenic (category 1B)',
]
new_pbt_vpvb = ['PBT (Article 57 d)', 'Scope: vPvB (Article 57 e)', 'vPvB (Article 57 e)', 'Scope: PBT']
other_new = [
    'Equivalent level of concern having probable serious effects to environment (Article 57 f):',
    'Scope: EQC',
    'Equivalent level of concern having probable serious effects to environment (Article 57 f)',
]
new_endoc = ['Scope: ED, CMR']

# Each target is reassigned from itself, so a plain concatenate would append the
# same names again every time this cell is re-run. pd.unique keeps only the
# first occurrence of each value (original order preserved), which makes the
# cell idempotent.
cmr_positive = pd.unique(np.concatenate((cmr_positive, new_cmr), axis=0))
pbt_vpvb_pos = pd.unique(np.concatenate((pbt_vpvb_pos, new_pbt_vpvb), axis=0))
endoc_pos = pd.unique(np.concatenate((endoc_pos, new_endoc), axis=0))
other_ann = pd.unique(np.concatenate((other_ann, other_new), axis=0))

In [16]:
# Annotation strings matched against reg_cmd['names'] for the vPvB endpoint.
vpvb_pos = [
    'vPvB substance',
    'PBT/ vPvB Substance',
    'PBT/ vPvB substance',
    'Rapporteur: Austria',
    'Rapporteur: Norway',
    'Rapporteur: United Kingdom',
    'Rapporteur: Denmark',
    'Rapporteur: Spain',
    'Rapporteur: Germany',
    'Rapporteur: France',
    'Conclusion: Under evaluation',
    'Rapporteur: Netherlands',
    'Rapporteur: Sweden',
    'Conclusion: fulfilling PBT & vPvB criteria',
    'Conclusion: fulfilling PBT & vPvB criteria & POP',
]

In [17]:
# Annotation strings matched against reg_cmd['names'] for the PBT endpoint.
pbt_pos = [
    'PBT Substance',
    'PBT/ vPvB Substance',
    'PBT/ vPvB substance',
    'Rapporteur: Austria',
    'Rapporteur: Norway',
    'Rapporteur: United Kingdom',
    'Rapporteur: Denmark',
    'Rapporteur: Spain',
    'Rapporteur: Germany',
    'Conclusion: Fulfilling PBT criteria & POP',
    'Rapporteur: France',
    'Conclusion: Under evaluation',
    'Rapporteur: Netherlands',
    'Conclusion: fulfilling PBT criteria',
    'Rapporteur: Sweden',
    'Conclusion: fulfilling PBT & vPvB criteria',
    'Conclusion: fulfilling PBT & vPvB criteria & POP',
]

In [18]:
# Names attached to the 'notification' and 'registration_dossier' special cases.
_is_notification_or_dossier = reg_cmd['special_cases_name'].isin(['notification', 'registration_dossier'])
notrd_names = reg_cmd.loc[_is_notification_or_dossier, 'names'].unique()

In [19]:
# Names attached to the three additional-information labels listed below.
_selected_additional_info = [
    'Submitted SVHC intentions',
    'Withdrawn SVHC intentions and submissions',
    'Amendment 2016/1179',
]
_has_selected_info = reg_cmd['additional_information_name'].isin(_selected_additional_info)
hstate_names = reg_cmd.loc[_has_selected_info, 'names'].unique()

In [20]:
# Normalise the "<hazard class> <category>" labels held in `hclas`:
#   * labels that bundle alternatives with '/' (and every mutagenicity label)
#     are scheduled for removal and replaced by one label per alternative;
#   * '- single exposure' / '- repeated exposure' are shortened in place to
#     'SE' / 'RE'.
elements_to_add = []
elements_to_remove = []
for position, label in enumerate(hclas):
    if 'mutagenicity' in label:
        elements_to_remove.append(label)
        tokens = label.split()
        hazard_word, category_field = tokens[2], tokens[3]
        # split('/') yields the field itself when there is no '/', so bundled
        # and single categories go through the same loop.
        elements_to_add.extend(
            ' '.join([hazard_word.capitalize(), single_category])
            for single_category in category_field.split('/')
        )
    elif 'single exposure' in label:
        hclas[position] = label.replace('- single exposure', 'SE')
    elif 'repeated exposure' in label:
        hclas[position] = label.replace('- repeated exposure', 'RE')
    elif '/' in label:
        elements_to_remove.append(label)
        tokens = label.split()
        if 'Skin' in label:
            hazard_field, category_field = tokens[1], tokens[2]
            hazard_options = hazard_field.split('/')
            category_options = category_field.split('/')
            if len(hazard_options) > 1:
                # Only the hazard word is expanded here; the category field is
                # reused verbatim even if it also contains '/'.
                elements_to_add.extend(
                    ' '.join(['Skin', hazard_option, category_field])
                    for hazard_option in hazard_options
                )
            elif len(category_options) > 1:
                elements_to_add.extend(
                    ' '.join(['Skin', hazard_field, category_option])
                    for category_option in category_options
                )
        elif 'eye' in label:
            hazard_field, category_field = tokens[2], tokens[3]
            elements_to_add.extend(
                ' '.join(['Eye', hazard_option, category_field])
                for hazard_option in hazard_field.split('/')
            )
        else:
            # Generic case: keep every word but the last and expand the last
            # one (the category) on '/'.
            leading_words = tokens[:-1]
            elements_to_add.extend(
                ' '.join(leading_words + [category_option])
                for category_option in tokens[-1].split('/')
            )

# Drop the bundled labels, then append their expanded replacements.
clean_hclas = np.concatenate((np.setdiff1d(hclas, elements_to_remove), elements_to_add), axis=0)

In [21]:
def string_comparison(s1: str, s2: str) -> bool:
    """Return True when normalised `s1` is a substring of normalised `s2`.

    Normalisation lower-cases the text and removes spaces, full stops, commas
    and hyphens, so ``"Skin Sens. 1"`` matches ``"skin-sens 1A"``.

    Parameters
    ----------
    s1 : str
        Text to look for.
    s2 : str
        Text to search in.

    Returns
    -------
    bool
        Whether the normalised `s1` occurs inside the normalised `s2`. An empty
        `s1` therefore always matches.
    """
    # One translation table removes every ignored character in a single pass.
    ignored_chars = str.maketrans('', '', ' .,-')

    def _normalise(text: str) -> str:
        return text.translate(ignored_chars).lower()

    # The per-call debugging print was removed: it flooded the cell output when
    # the function is applied over many names.
    return _normalise(s1) in _normalise(s2)

In [22]:
def get_max_score_words(original_word, new_word_list):
    """Lazily score every candidate in `new_word_list` against `original_word`.

    Yields
    ------
    tuple
        ``(original_word, candidate, ratio)`` where ``ratio`` is the
        ``difflib.SequenceMatcher`` similarity of the two strings.
    """
    for candidate in new_word_list:
        similarity = difflib.SequenceMatcher(a=original_word, b=candidate).ratio()
        yield original_word, candidate, similarity

In [23]:
# for name in notrd_names:
#     close_matches = difflib.get_close_matches(name, clean_hclas, cutoff=0.5)
#     if close_matches:
#         print(max(get_max_score_words(name,close_matches), key = lambda x: x[2]))

#### Try the updated dataframes to generate new DataFrames

In [24]:
# Re-import USC_dataframe_generation so that edits made to the module after its
# first import take effect in the running kernel.
reload(usc)

<module 'USC_dataframe_generation' from '/home/emarch/Documents/Inditex/USC_workflow_reproduction/USC_dataframe_generation.py'>

In [25]:
# %%time
# cmr_dict = usc.fixed_generate_dataframe(substance_id_list=substances,endpoint='CMR', regulations_df=reg_df, endpoint_annotations=cmr_positive)
# pbt_dict = usc.fixed_generate_dataframe(substance_id_list=substances,endpoint='PBT', regulations_df=reg_df, endpoint_annotations=pbt_pos)
# vpvb_dict = usc.fixed_generate_dataframe(substance_id_list=substances,endpoint='vPvB', regulations_df=reg_df, endpoint_annotations=vpvb_pos)
# endoc_dict = usc.fixed_generate_dataframe(substance_id_list=substances,endpoint='Endocrine_disruptor', regulations_df=reg_df, endpoint_annotations=endoc_pos)
# sens_dict = usc.fixed_generate_dataframe(substance_id_list=substances,endpoint='Sensitiser', regulations_df=reg_df, endpoint_annotations=sens_ann)
# other_dict = usc.fixed_generate_dataframe(substance_id_list=substances,endpoint='Other', regulations_df=reg_df, endpoint_annotations=other_ann)

In [26]:
# cmr_df = pd.DataFrame(data=cmr_dict)
# pbt_df = pd.DataFrame(data=pbt_dict)
# vpvb_df = pd.DataFrame(data=vpvb_dict)
# endoc_df = pd.DataFrame(data=endoc_dict)
# sens_df = pd.DataFrame(data=sens_dict)
# other_df = pd.DataFrame(data=other_dict)

In [27]:
# to_concat_df = [cmr_df, pbt_df, vpvb_df,endoc_df,sens_df]
# concat_df = pd.concat(to_concat_df)

In [28]:
# for subs_id in substances:
#     if other_df[(other_df['subs_id'] == subs_id) & (other_df['name'].isin(['NO','No information']))].empty:
#         other_df.loc[other_df['subs_id'] == subs_id,:] = other_df.loc[(other_df['subs_id'] == subs_id) & 
#                                                                       (~other_df['name'].isin(concat_df[concat_df['subs_id'] == subs_id].name.unique())),:]

In [29]:
# new_other_df = other_df.dropna().reset_index().drop(columns=['index'])

In [30]:
# final_to_concat =[concat_df, new_other_df]
# final_df = pd.concat(final_to_concat)
# sorted_concat_df = final_df.sort_values(by=['subs_id', 'endpoint_type']).reset_index().drop(columns=['index'])

In [31]:
# sorted_concat_df.to_pickle('fixed_generated_heh_df.pkl')

In [32]:
# Load the previously pickled result; the cells that build and pickle this frame
# are commented out above.
# NOTE(review): unpickling can execute arbitrary code -- only load a file that
# was produced by this project.
sorted_concat_df = pd.read_pickle('fixed_generated_heh_df.pkl')

In [33]:
# Endpoint types compared in the loop below (values of the 'type' / 'endpoint_type' columns).
endpoint_list = [
    'CMR',
    'PBT',
    'vPvB',
    'Endocrine_disruptor',
    'Sensitiser',
    'Other',
]

In [34]:
# Status annotations whose occurrences are counted for every endpoint.
ann_info_list = [
    'YES',
    'NO',
    'No information',
    'Pending',
    'Pending (1)',
    'Pending (2)',
    'Pending (3)',
    'Pending (3a)',
    'Pending (3b)',
    'Pending (3c)',
    'Pending (4)',
]

In [35]:
# For every endpoint, count how often each status annotation occurs in the
# inventory (heh_df) and in the generated annotations (sorted_concat_df).
# The per-endpoint tables are kept in `ep_summaries` so that none of them is
# lost when the loop moves on; `ep_df` still holds the last one afterwards.
ep_summaries = {}
for ep in endpoint_list:
    ep_dict = {'Anotaciones':[],'Inventario':[],'Generadas':[]}
    for ann in ann_info_list:
        total_original = heh_df.loc[(heh_df['type'] == ep) & (heh_df['name'] == ann),'name'].count()
        total_generadas = sorted_concat_df.loc[(sorted_concat_df['endpoint_type'] == ep) & (sorted_concat_df['name'] == ann),'name'].count()
        ep_dict['Anotaciones'].append(ann)
        ep_dict['Inventario'].append(total_original)
        ep_dict['Generadas'].append(total_generadas)

    ep_df = pd.DataFrame(data=ep_dict,dtype=object)
    # set_index returns a new frame, so the result has to be assigned; the
    # original bare call left ep_df unchanged.
    ep_df = ep_df.set_index('Anotaciones')
    ep_summaries[ep] = ep_df
#     if os.path.isfile('Comparación_anotaciones_nuevas.xlsx'):
#         with pd.ExcelWriter('Comparación_anotaciones_nuevas.xlsx', mode='a') as writer:
#              ep_df.to_excel(writer, sheet_name=ep)
#     else:
#         with pd.ExcelWriter('Comparación_anotaciones_nuevas.xlsx', mode='w') as writer:
#              ep_df.to_excel(writer, sheet_name=ep)

### Get the regulations in which each endpoint can be found

#### CMR

In [43]:
# General regulations whose rows carry a name from cmr_positive.
is_cmr = reg_cmd['names'].isin(cmr_positive)
reg_cmd.loc[is_cmr, 'general_regulation_name'].unique()

array(['clp', 'reach'], dtype=object)

In [44]:
# Specific regulations whose rows carry a name from cmr_positive.
is_cmr = reg_cmd['names'].isin(cmr_positive)
reg_cmd.loc[is_cmr, 'specific_regulation_name'].unique()

array(['preliminary_stage', 'harmonised_c&l', 'substance_evaluation',
       'svhc'], dtype=object)

In [45]:
# Subspecific regulations whose rows carry a name from cmr_positive.
is_cmr = reg_cmd['names'].isin(cmr_positive)
reg_cmd.loc[is_cmr, 'subspecific_regulation_name'].unique()

array(['c&l_inventory', 'hazard_class', 'general_evaluaiton',
       'candidate_list', 'spec_concentration_units', 'authorisation_list',
       'annex_xiii', 'harmonised_c&l_intentions', 'hazard'], dtype=object)

In [46]:
# Special cases whose rows carry a name from cmr_positive.
is_cmr = reg_cmd['names'].isin(cmr_positive)
reg_cmd.loc[is_cmr, 'special_cases_name'].unique()

array(['registration_dossier', 'notification', None, 'candidate_list',
       'authorisation_annex_xiv',
       'preliminary_stages_inclusion_candidate_list'], dtype=object)

In [47]:
# Additional-information labels whose rows carry a name from cmr_positive.
is_cmr = reg_cmd['names'].isin(cmr_positive)
reg_cmd.loc[is_cmr, 'additional_information_name'].unique()

array(['Self-classification', None, 'PACT List', 'Amendment 2016/1179',
       'Suspected', 'Entry in Annex VI of CLP Regulation',
       'Submitted SVHC intentions',
       'Suspected persistent in the environment'], dtype=object)

In [80]:
# Distinct combinations of the five regulation-hierarchy columns among the rows
# whose name is in cmr_positive, sorted by those same columns.
hierarchy_cols = [
    'general_regulation_name',
    'specific_regulation_name',
    'subspecific_regulation_name',
    'special_cases_name',
    'additional_information_name',
]
(
    reg_cmd.loc[reg_cmd['names'].isin(cmr_positive), hierarchy_cols]
    .drop_duplicates()
    .sort_values(by=hierarchy_cols)
)

Unnamed: 0,general_regulation_name,specific_regulation_name,subspecific_regulation_name,special_cases_name,additional_information_name
879765,clp,harmonised_c&l,hazard,,
951,clp,harmonised_c&l,hazard_class,,Amendment 2016/1179
34,clp,harmonised_c&l,hazard_class,,
301,clp,harmonised_c&l,spec_concentration_units,,
27,clp,preliminary_stage,c&l_inventory,notification,Self-classification
22,clp,preliminary_stage,c&l_inventory,registration_dossier,Self-classification
45395,clp,preliminary_stage,harmonised_c&l_intentions,,Entry in Annex VI of CLP Regulation
1002,reach,substance_evaluation,annex_xiii,,Suspected
386117,reach,substance_evaluation,annex_xiii,,Suspected persistent in the environment
388858,reach,substance_evaluation,annex_xiii,,


#### PBT

In [48]:
# General regulations whose rows carry a name from pbt_pos.
is_pbt = reg_cmd['names'].isin(pbt_pos)
reg_cmd.loc[is_pbt, 'general_regulation_name'].unique()

array(['pbt_vpvb'], dtype=object)

In [49]:
# Specific regulations whose rows carry a name from pbt_pos.
is_pbt = reg_cmd['names'].isin(pbt_pos)
reg_cmd.loc[is_pbt, 'specific_regulation_name'].unique()

array(['reach_criteria_annex_xiii', 'assessment_before_reach'],
      dtype=object)

In [50]:
# Subspecific regulations whose rows carry a name from pbt_pos.
is_pbt = reg_cmd['names'].isin(pbt_pos)
reg_cmd.loc[is_pbt, 'subspecific_regulation_name'].unique()

array(['annex_xiii_registrant_information', 'annex_xiii_candidate_list',
       None], dtype=object)

In [51]:
# Special cases whose rows carry a name from pbt_pos.
is_pbt = reg_cmd['names'].isin(pbt_pos)
reg_cmd.loc[is_pbt, 'special_cases_name'].unique()

array([None], dtype=object)

In [52]:
# Additional-information labels whose rows carry a name from pbt_pos.
is_pbt = reg_cmd['names'].isin(pbt_pos)
reg_cmd.loc[is_pbt, 'additional_information_name'].unique()

array([None], dtype=object)

In [81]:
# Distinct combinations of the five regulation-hierarchy columns among the rows
# whose name is in pbt_pos, sorted by those same columns.
hierarchy_cols = [
    'general_regulation_name',
    'specific_regulation_name',
    'subspecific_regulation_name',
    'special_cases_name',
    'additional_information_name',
]
(
    reg_cmd.loc[reg_cmd['names'].isin(pbt_pos), hierarchy_cols]
    .drop_duplicates()
    .sort_values(by=hierarchy_cols)
)

Unnamed: 0,general_regulation_name,specific_regulation_name,subspecific_regulation_name,special_cases_name,additional_information_name
52851,pbt_vpvb,assessment_before_reach,,,
50712,pbt_vpvb,reach_criteria_annex_xiii,annex_xiii_candidate_list,,
974,pbt_vpvb,reach_criteria_annex_xiii,annex_xiii_registrant_information,,


#### vPvB

In [54]:
# General regulations whose rows carry a name from vpvb_pos.
is_vpvb = reg_cmd['names'].isin(vpvb_pos)
reg_cmd.loc[is_vpvb, 'general_regulation_name'].unique()

array(['pbt_vpvb'], dtype=object)

In [55]:
# Specific regulations whose rows carry a name from vpvb_pos.
is_vpvb = reg_cmd['names'].isin(vpvb_pos)
reg_cmd.loc[is_vpvb, 'specific_regulation_name'].unique()

array(['reach_criteria_annex_xiii', 'assessment_before_reach'],
      dtype=object)

In [56]:
# Subspecific regulations whose rows carry a name from vpvb_pos.
is_vpvb = reg_cmd['names'].isin(vpvb_pos)
reg_cmd.loc[is_vpvb, 'subspecific_regulation_name'].unique()

array(['annex_xiii_registrant_information', 'annex_xiii_candidate_list',
       None], dtype=object)

In [58]:
# Special cases whose rows carry a name from vpvb_pos.
is_vpvb = reg_cmd['names'].isin(vpvb_pos)
reg_cmd.loc[is_vpvb, 'special_cases_name'].unique()

array([None], dtype=object)

In [59]:
# Additional-information labels whose rows carry a name from vpvb_pos.
is_vpvb = reg_cmd['names'].isin(vpvb_pos)
reg_cmd.loc[is_vpvb, 'additional_information_name'].unique()

array([None], dtype=object)

In [82]:
# Distinct combinations of the five regulation-hierarchy columns among the rows
# whose name is in vpvb_pos, sorted by those same columns.
hierarchy_cols = [
    'general_regulation_name',
    'specific_regulation_name',
    'subspecific_regulation_name',
    'special_cases_name',
    'additional_information_name',
]
(
    reg_cmd.loc[reg_cmd['names'].isin(vpvb_pos), hierarchy_cols]
    .drop_duplicates()
    .sort_values(by=hierarchy_cols)
)

Unnamed: 0,general_regulation_name,specific_regulation_name,subspecific_regulation_name,special_cases_name,additional_information_name
52851,pbt_vpvb,assessment_before_reach,,,
50712,pbt_vpvb,reach_criteria_annex_xiii,annex_xiii_candidate_list,,
975,pbt_vpvb,reach_criteria_annex_xiii,annex_xiii_registrant_information,,


#### Endocrine disruptor

In [60]:
# General regulations whose rows carry a name from endoc_pos.
is_endoc = reg_cmd['names'].isin(endoc_pos)
reg_cmd.loc[is_endoc, 'general_regulation_name'].unique()

array(['endocrine_disruptors', 'wfd', 'reach', 'pops'], dtype=object)

In [61]:
# Specific regulations whose rows carry a name from endoc_pos.
is_endoc = reg_cmd['names'].isin(endoc_pos)
reg_cmd.loc[is_endoc, 'specific_regulation_name'].unique()

array(['european_comission', 'priority_substances_water_policy', 'svhc',
       'biocidal_cmr_substance', 'prohibited_production_annex_i',
       'list_substances_waste_management', 'waste_management',
       'list_substances_release_reduction_provisions'], dtype=object)

In [62]:
# Subspecific regulations whose rows carry a name from endoc_pos.
is_endoc = reg_cmd['names'].isin(endoc_pos)
reg_cmd.loc[is_endoc, 'subspecific_regulation_name'].unique()

array([None, 'candidate_list', 'part_a', 'part_b'], dtype=object)

In [63]:
# Special cases whose rows carry a name from endoc_pos.
is_endoc = reg_cmd['names'].isin(endoc_pos)
reg_cmd.loc[is_endoc, 'special_cases_name'].unique()

array([None, 'preliminary_stages_inclusion_candidate_list'], dtype=object)

In [64]:
# Additional-information labels whose rows carry a name from endoc_pos.
is_endoc = reg_cmd['names'].isin(endoc_pos)
reg_cmd.loc[is_endoc, 'additional_information_name'].unique()

array([None, 'Submitted SVHC intentions'], dtype=object)

In [83]:
# Distinct combinations of the five regulation-hierarchy columns among the rows
# whose name is in endoc_pos, sorted by those same columns.
hierarchy_cols = [
    'general_regulation_name',
    'specific_regulation_name',
    'subspecific_regulation_name',
    'special_cases_name',
    'additional_information_name',
]
(
    reg_cmd.loc[reg_cmd['names'].isin(endoc_pos), hierarchy_cols]
    .drop_duplicates()
    .sort_values(by=hierarchy_cols)
)

Unnamed: 0,general_regulation_name,specific_regulation_name,subspecific_regulation_name,special_cases_name,additional_information_name
50716,endocrine_disruptors,biocidal_cmr_substance,,,
166,endocrine_disruptors,european_comission,,,
256216,pops,list_substances_release_reduction_provisions,,,
50783,pops,list_substances_waste_management,,,
50778,pops,prohibited_production_annex_i,part_a,,
257059,pops,prohibited_production_annex_i,part_b,,
50785,pops,waste_management,,,
50433,reach,svhc,candidate_list,preliminary_stages_inclusion_candidate_list,Submitted SVHC intentions
804,wfd,priority_substances_water_policy,,,


#### Sensitiser

In [65]:
# General regulations whose rows carry a name from sens_ann.
is_sens = reg_cmd['names'].isin(sens_ann)
reg_cmd.loc[is_sens, 'general_regulation_name'].unique()

array(['clp'], dtype=object)

In [66]:
# Specific regulations whose rows carry a name from sens_ann.
is_sens = reg_cmd['names'].isin(sens_ann)
reg_cmd.loc[is_sens, 'specific_regulation_name'].unique()

array(['preliminary_stage', 'harmonised_c&l'], dtype=object)

In [67]:
# Subspecific regulations whose rows carry a name from sens_ann.
is_sens = reg_cmd['names'].isin(sens_ann)
reg_cmd.loc[is_sens, 'subspecific_regulation_name'].unique()

array(['c&l_inventory', 'hazard_class', 'spec_concentration_units',
       'harmonised_c&l_intentions', 'hazard'], dtype=object)

In [68]:
# Special cases whose rows carry a name from sens_ann.
is_sens = reg_cmd['names'].isin(sens_ann)
reg_cmd.loc[is_sens, 'special_cases_name'].unique()

array(['registration_dossier', 'notification', None], dtype=object)

In [69]:
# Additional-information labels whose rows carry a name from sens_ann.
is_sens = reg_cmd['names'].isin(sens_ann)
reg_cmd.loc[is_sens, 'additional_information_name'].unique()

array(['Self-classification', None, 'Amendment 2016/1179',
       'Entry in Annex VI of CLP Regulation'], dtype=object)

In [84]:
# Distinct combinations of the five regulation-hierarchy columns among the rows
# whose name is in sens_ann, sorted by those same columns.
hierarchy_cols = [
    'general_regulation_name',
    'specific_regulation_name',
    'subspecific_regulation_name',
    'special_cases_name',
    'additional_information_name',
]
(
    reg_cmd.loc[reg_cmd['names'].isin(sens_ann), hierarchy_cols]
    .drop_duplicates()
    .sort_values(by=hierarchy_cols)
)

Unnamed: 0,general_regulation_name,specific_regulation_name,subspecific_regulation_name,special_cases_name,additional_information_name
609339,clp,harmonised_c&l,hazard,,
50464,clp,harmonised_c&l,hazard_class,,Amendment 2016/1179
40,clp,harmonised_c&l,hazard_class,,
57,clp,harmonised_c&l,spec_concentration_units,,
26,clp,preliminary_stage,c&l_inventory,notification,Self-classification
20,clp,preliminary_stage,c&l_inventory,registration_dossier,Self-classification
96019,clp,preliminary_stage,harmonised_c&l_intentions,,Entry in Annex VI of CLP Regulation


#### Other

In [70]:
# General regulations whose rows carry a name from other_ann.
is_other = reg_cmd['names'].isin(other_ann)
reg_cmd.loc[is_other, 'general_regulation_name'].unique()

array(['clp', 'reach', 'wfd', 'endocrine_disruptors', 'pops'],
      dtype=object)

In [71]:
# Specific regulations whose rows carry a name from other_ann.
is_other = reg_cmd['names'].isin(other_ann)
reg_cmd.loc[is_other, 'specific_regulation_name'].unique()

array(['preliminary_stage', 'harmonised_c&l', 'svhc',
       'priority_substances_water_policy', 'biocidal_cmr_substance',
       'prohibited_production_annex_i',
       'list_substances_waste_management', 'waste_management',
       'list_substances_release_reduction_provisions'], dtype=object)

In [72]:
# Subspecific regulations whose rows carry a name from other_ann.
is_other = reg_cmd['names'].isin(other_ann)
reg_cmd.loc[is_other, 'subspecific_regulation_name'].unique()

array(['c&l_inventory', 'hazard_class', 'spec_concentration_units',
       'candidate_list', None, 'hazard', 'pictogram', 'signal_word',
       'harmonised_c&l_intentions', 'part_a', 'part_b', 'index_number',
       'supplementary_hazard'], dtype=object)

In [73]:
# Special cases whose rows carry a name from other_ann.
is_other = reg_cmd['names'].isin(other_ann)
reg_cmd.loc[is_other, 'special_cases_name'].unique()

array(['registration_dossier', 'notification', None, 'candidate_list',
       'preliminary_stages_inclusion_candidate_list'], dtype=object)

In [74]:
# Additional-information labels whose rows carry a name from other_ann.
is_other = reg_cmd['names'].isin(other_ann)
reg_cmd.loc[is_other, 'additional_information_name'].unique()

array(['Self-classification', None, 'Amendment 2016/1179',
       'Entry in Annex VI of CLP Regulation', 'Submitted SVHC intentions'],
      dtype=object)

In [85]:
# Distinct combinations of the five regulation-hierarchy columns among the rows
# whose name is in other_ann, sorted by those same columns.
hierarchy_cols = [
    'general_regulation_name',
    'specific_regulation_name',
    'subspecific_regulation_name',
    'special_cases_name',
    'additional_information_name',
]
(
    reg_cmd.loc[reg_cmd['names'].isin(other_ann), hierarchy_cols]
    .drop_duplicates()
    .sort_values(by=hierarchy_cols)
)

Unnamed: 0,general_regulation_name,specific_regulation_name,subspecific_regulation_name,special_cases_name,additional_information_name
954,clp,harmonised_c&l,hazard,,Amendment 2016/1179
991,clp,harmonised_c&l,hazard,,
951,clp,harmonised_c&l,hazard_class,,Amendment 2016/1179
34,clp,harmonised_c&l,hazard_class,,
388392,clp,harmonised_c&l,index_number,,Amendment 2016/1179
388230,clp,harmonised_c&l,index_number,,
983,clp,harmonised_c&l,pictogram,,
987,clp,harmonised_c&l,signal_word,,
48,clp,harmonised_c&l,spec_concentration_units,,
608920,clp,harmonised_c&l,supplementary_hazard,,
