In [1]:
import pandas as pd
import re
import ast

# Define the matches_pattern function
def matches_pattern(s):
    """Return True when ``s`` is exactly 'NDA' or 'ANDA' followed by six digits.

    Parameters
    ----------
    s : object
        Candidate value. Anything that is not a ``str`` (NaN, None, numbers,
        ...) is rejected without raising.

    Returns
    -------
    bool
        True only if the whole string matches the identifier pattern.
    """
    pattern = r'(ANDA\d{6}|NDA\d{6})'
    if isinstance(s, str):
        # fullmatch rather than match + '$': the '$' anchor also matches just
        # before a trailing newline, so 'NDA123456\n' used to be accepted.
        return bool(re.fullmatch(pattern, s))
    return False

# Read the two input tables from CSV files in the working directory
margaret_no_pub = pd.read_csv('margaret_task1_pub_identified.csv')
cortellis_matched = pd.read_csv('3_inxight_data_final.csv')

# Keep only the rows where at least one of the two status-source columns
# satisfies matches_pattern ('hightest' is the spelling used by the CSV header)
highest_source_ok = cortellis_matched['hightest_status_source'].apply(matches_pattern)
earliest_source_ok = cortellis_matched['earliest_status_source'].apply(matches_pattern)
cortellis_matched_filtered = cortellis_matched[highest_source_ok | earliest_source_ok]

# Function to parse the string to a list
def parse_string_to_list(s):
    """Convert the text form of a Python literal (e.g. "['a', 'b']") into the object it spells.

    Parameters
    ----------
    s : str | list | None | float
        A cell value. Strings are parsed with ``ast.literal_eval``; values
        that are already lists are returned unchanged; missing values
        (None / NaN) and blank strings become an empty list.

    Returns
    -------
    list
        The parsed list for list-literal input (for any other literal,
        whatever ``ast.literal_eval`` returns).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    # Already parsed (e.g. the column was converted earlier): nothing to do.
    if isinstance(s, list):
        return s
    # Missing cell: None, or float NaN (NaN is the only float unequal to itself).
    if s is None or (isinstance(s, float) and s != s):
        return []
    # Blank text would make literal_eval raise SyntaxError; treat it as "no links".
    if isinstance(s, str) and not s.strip():
        return []
    return ast.literal_eval(s)

# Replace the serialized 'fda_links' text with parsed Python objects
# (this overwrites the column on margaret_no_pub itself)
parsed_fda_links = margaret_no_pub['fda_links'].apply(parse_string_to_list)
margaret_no_pub['fda_links'] = parsed_fda_links

# Emit one row per individual link and renumber the index from 0
margaret_no_pub_expanded = (
    margaret_no_pub
    .explode('fda_links')
    .reset_index(drop=True)
)

# Function to merge DataFrames
def merge_dfs(margaret_df, cortellis_df):
    """Pair each Cortellis row with the Margaret rows sharing its identifier.

    For every row of ``cortellis_df`` the match key is the value of
    'hightest_status_source' when it satisfies ``matches_pattern``, otherwise
    the value of 'earliest_status_source' when that one does; rows where
    neither qualifies are skipped. Each row of ``margaret_df`` whose
    'fda_links' equals the key produces one output row.

    Parameters
    ----------
    margaret_df : pandas.DataFrame
        Must contain an 'fda_links' column holding one scalar per row.
    cortellis_df : pandas.DataFrame
        Must contain 'hightest_status_source' and 'earliest_status_source'.

    Returns
    -------
    pandas.DataFrame
        Columns of ``cortellis_df`` followed by the columns of ``margaret_df``
        not already present, plus 'source_used' (the column the key came
        from). On a name collision the Margaret value wins. Rows are ordered
        by Cortellis row, then by Margaret row. When nothing matches, the
        frame is empty but still carries these columns.
    """
    # Pass 1: choose the match key (and remember its origin) per Cortellis row.
    keyed_rows = []
    for _, row in cortellis_df.iterrows():
        high_status = row['hightest_status_source']
        early_status = row['earliest_status_source']

        if matches_pattern(high_status):
            keyed_rows.append((high_status, 'hightest_status_source', row.to_dict()))
        elif matches_pattern(early_status):
            keyed_rows.append((early_status, 'earliest_status_source', row.to_dict()))

    # Same key order the dict merge below produces; passing it explicitly keeps
    # the schema even when there are no merged rows at all.
    output_columns = list(dict.fromkeys(
        [*cortellis_df.columns, *margaret_df.columns, 'source_used']
    ))
    if not keyed_rows:
        return pd.DataFrame(columns=output_columns)

    # Pass 2: scan the Margaret rows once and bucket them by link, instead of
    # rebuilding a boolean mask over the whole frame for every Cortellis row.
    # Keys are always str (matches_pattern rejects everything else), so only
    # str links can ever be equal to one.
    wanted_links = {key for key, _, _ in keyed_rows}
    rows_by_link = {}
    for _, margaret_row in margaret_df.iterrows():
        link = margaret_row['fda_links']
        if isinstance(link, str) and link in wanted_links:
            rows_by_link.setdefault(link, []).append(margaret_row.to_dict())

    merged_data = [
        {**cortellis_record, **margaret_record, 'source_used': source}
        for match_value, source, cortellis_record in keyed_rows
        for margaret_record in rows_by_link.get(match_value, ())
    ]

    return pd.DataFrame(merged_data, columns=output_columns)

# Join the exploded Margaret rows onto the filtered Cortellis rows
result_df = merge_dfs(
    margaret_df=margaret_no_pub_expanded,
    cortellis_df=cortellis_matched_filtered,
)

print(result_df)

  original_molecule   parsed_molecule  cas  \
0      DALFOPRISTIN      DALFOPRISTIN  NaN   
1      DALFOPRISTIN      DALFOPRISTIN  NaN   
2        DRONABINOL        DRONABINOL  NaN   
3      FOSPHENYTOIN      FOSPHENYTOIN  NaN   
4  N-ACETYLTYROSINE  N-ACETYLTYROSINE  NaN   
5         PHENYTOIN         PHENYTOIN  NaN   
6           TAURINE           TAURINE  NaN   

                          best_result_url  results  \
0  https://drugs.ncats.io/drug/R9M4FJE48E        1   
1  https://drugs.ncats.io/drug/R9M4FJE48E        1   
2  https://drugs.ncats.io/drug/7J8897W37S        1   
3  https://drugs.ncats.io/drug/B4SF212641        1   
4  https://drugs.ncats.io/drug/DA8G610ZO5        1   
5  https://drugs.ncats.io/drug/6158TKW0C5        1   
6  https://drugs.ncats.io/drug/1EQV5MLY3D        1   

                                           query_url query_type  \
0  https://drugs.ncats.io/substances?q=(root_code...   ONLY CAS   
1  https://drugs.ncats.io/substances?q=(root_code...   ONLY CAS 

In [2]:
# Bare last expression: the notebook renders the merged DataFrame as the cell output
result_df

Unnamed: 0,original_molecule,parsed_molecule,cas,best_result_url,results,query_url,query_type,highest_status_year,hightest_status_source,hightest_status_source_url,...,mesh,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance,components,component_cas,source_used
0,DALFOPRISTIN,DALFOPRISTIN,,https://drugs.ncats.io/drug/R9M4FJE48E,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 1999,NDA050748,https://www.accessdata.fda.gov/scripts/cder/da...,...,,DALFOPRISTIN MESYLATE,https://pubchem.ncbi.nlm.nih.gov/compound/9161...,FEATURED,,,COMPOUND,['https://pubchem.ncbi.nlm.nih.gov/compound/63...,"[112362-50-2], [75-75-2]",hightest_status_source
1,DALFOPRISTIN,DALFOPRISTIN,,https://drugs.ncats.io/drug/R9M4FJE48E,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 1999,NDA050748,https://www.accessdata.fda.gov/scripts/cder/da...,...,,QUINUPRISTIN MESYLATE,https://pubchem.ncbi.nlm.nih.gov/compound/9047...,FEATURED,,,COMPOUND,['https://pubchem.ncbi.nlm.nih.gov/compound/63...,"[75-75-2], [120138-50-3]",hightest_status_source
2,DRONABINOL,DRONABINOL,,https://drugs.ncats.io/drug/7J8897W37S,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 1985,NDA018651,https://www.accessdata.fda.gov/scripts/cder/da...,...,,DRONABINOL HEMISUCCINATE,https://pubchem.ncbi.nlm.nih.gov/compound/7696...,FEATURED,,,COMPOUND,['https://pubchem.ncbi.nlm.nih.gov/compound/11...,"[110-15-6], [1972-08-3, 6465-30-1]",hightest_status_source
3,FOSPHENYTOIN,FOSPHENYTOIN,,https://drugs.ncats.io/drug/B4SF212641,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 2006,ANDA040684,https://www.accessdata.fda.gov/scripts/cder/da...,...,,FOSPHENYTOIN SODIUM HEPTAHYDRATE,https://pubchem.ncbi.nlm.nih.gov/compound/4617...,FEATURED,,,COMPOUND,['https://pubchem.ncbi.nlm.nih.gov/compound/96...,"[7732-18-5, 17778-80-2, 3352-57-6], [93390-81-...",hightest_status_source
4,N-ACETYLTYROSINE,N-ACETYLTYROSINE,,https://drugs.ncats.io/drug/DA8G610ZO5,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,Possibly Marketed Outside US,Aminosyn II by Icu Medical Canada Inc [Canada],https://www.drugbank.ca/drugs/DB11102,...,,POTASSIUM TAURINE LAURATE,https://pubchem.ncbi.nlm.nih.gov/compound/7696...,FEATURED,,,COMPOUND,['https://pubchem.ncbi.nlm.nih.gov/compound/54...,"[7440-09-7], [143-07-7, 203714-07-2, 7632-48-6...",earliest_status_source
5,PHENYTOIN,PHENYTOIN,,https://drugs.ncats.io/drug/6158TKW0C5,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 2006,ANDA040684,https://www.accessdata.fda.gov/scripts/cder/da...,...,,FOSPHENYTOIN SODIUM HEPTAHYDRATE,https://pubchem.ncbi.nlm.nih.gov/compound/4617...,FEATURED,,,COMPOUND,['https://pubchem.ncbi.nlm.nih.gov/compound/96...,"[7732-18-5, 17778-80-2, 3352-57-6], [93390-81-...",hightest_status_source
6,TAURINE,TAURINE,,https://drugs.ncats.io/drug/1EQV5MLY3D,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,Possibly Marketed Outside US,Smofkabiven Electrolyte Free by Fresenius Kabi...,https://www.drugbank.ca/drugs/DB01956,...,,POTASSIUM TAURINE LAURATE,https://pubchem.ncbi.nlm.nih.gov/compound/7696...,FEATURED,,,COMPOUND,['https://pubchem.ncbi.nlm.nih.gov/compound/54...,"[7440-09-7], [143-07-7, 203714-07-2, 7632-48-6...",earliest_status_source


In [3]:
# Write the merged table to CSV in the working directory, omitting the index column
result_df.to_csv('task1_pub_merge.csv', index=False)