# Preparing API calls code for webapp

input: pandas dataframe with INN, therapeutic areas and current date

output: input dataframe with additional columns on trial data and pubmed conclusions 

In [61]:
import pandas as pd
from pharmatools.clinical_trials import get_trial_data
from pharmatools.pubmed import get_pubmed_ids, get_titles_abstracts_batch
from nltk.tokenize import sent_tokenize
%load_ext autoreload
%autoreload 2

## Defining functions

In [48]:
def trial_data(df):
    # convert to datetime
    df['First published'] = pd.to_datetime(df['First published'])

    # prepare dataframe
    df['n_trials'] = 0
    df['status_not_yet_recruiting'] = 0
    df['status_recruiting'] = 0
    df['status_enrolling_by_invitation'] = 0
    df['status_active_not_recruiting'] = 0
    df['status_suspended'] = 0
    df['status_terminated'] = 0
    df['status_completed'] = 0
    df['status_withdrawn'] = 0
    df['status_unknown'] = 0

    df['org_fed'] = 0
    df['org_indiv'] = 0
    df['org_industry'] = 0
    df['org_network'] = 0
    df['org_nih'] = 0
    df['org_other'] = 0
    df['org_other_gov'] = 0

    df['phase_early_1'] = 0
    df['phase_not_applicable'] = 0
    df['phase_1'] = 0
    df['phase_2'] = 0
    df['phase_3'] = 0
    df['phase_4'] = 0

    # pull data from API into dataframe
    for index, row in df.iterrows():

        print(f'fetching trial data for {index}, {row["INN"]}')

        # call ClinicalTrials API
        try:
            data = get_trial_data(row['INN'], row['Therapeutic area'], row['First published'])

            # update dataframe
            df['n_trials'][index] = data['n_trials']

            df['status_not_yet_recruiting'][index] = data['status']['Not yet recruiting']
            df['status_recruiting'][index] = data['status']['Recruiting']
            df['status_enrolling_by_invitation'][index] = data['status']['Enrolling by invitation']
            df['status_active_not_recruiting'][index] = data['status']['Active, not recruiting']
            df['status_suspended'][index] = data['status']['Suspended']
            df['status_terminated'][index] = data['status']['Terminated']
            df['status_completed'][index] = data['status']['Completed']
            df['status_withdrawn'][index] = data['status']['Withdrawn']
            df['status_unknown'][index] = data['status']['Unknown status']

            df['org_fed'][index] = data['organizers']['FED']
            df['org_indiv'][index] = data['organizers']['INDIV']
            df['org_industry'][index] = data['organizers']['INDUSTRY']
            df['org_network'][index] = data['organizers']['NETWORK']
            df['org_nih'][index] = data['organizers']['NIH']
            df['org_other'][index] = data['organizers']['OTHER']
            df['org_other_gov'][index] = data['organizers']['OTHER_GOV']

            df['phase_early_1'][index] = data['phases']['Early Phase 1']
            df['phase_not_applicable'][index] = data['phases']['Not Applicable']
            df['phase_1'][index] = data['phases']['Phase 1']
            df['phase_2'][index] = data['phases']['Phase 2']
            df['phase_3'][index] = data['phases']['Phase 3']
            df['phase_4'][index] = data['phases']['Phase 4']
        except:
            raise f"fetching trial data for {index} not successful"

    return df

In [49]:
def pubmed_abstracts(df):
    df_abstracts = pd.DataFrame(columns=['id', 'abstract'])
    df["pm_results"] = 0
    
    for index, row in df.iterrows():
        print(f"index: {index}, INN: {row['INN']}")
        ids = get_pubmed_ids(row['INN'], row['Therapeutic area'], row['First published'])
        n_ids = len(ids)
        print(f'results: {n_ids}')
        df['pm_results'][index] = n_ids
        
        # fetch abstracts of first 200 results
        titles, abstracts = get_titles_abstracts_batch(ids[:200])
        print(f'# of abstracts: {len(abstracts)}')
        for abstract in abstracts:
            df_abstracts = df_abstracts.append(pd.DataFrame.from_dict({'id': [index], 'abstract': [abstract]}), ignore_index=True)
        print('')
        
        return df_abstracts
         

In [50]:
def n_last_senteces(text, n):
    return ' '.join(sent_tokenize(text)[-n:])

In [51]:
def pubmed_conclusions(df):
    df_abstracts = pubmed_abstracts(df)
    
    df["conclusions"] = ""
    for index, row in df.iterrows():
        print(index)
        conclusions = ""
        for _, row_abstr in df_abstracts.loc[df_abstracts['id'] == index].iterrows():
            conclusion = n_last_senteces(row_abstr['abstract'], 2)
            conclusions += " " + conclusion
        df['conclusions'][index] = conclusions    
    
    return df

## testing

In [52]:
df = pd.DataFrame.from_dict({"INN": "rivaroxaban",
                  "Therapeutic area": "pulmonary embolism, TVT",
                  "First published": "2018-01-01"},
                           orient="index")

In [53]:
df = df.T

In [54]:
df

Unnamed: 0,INN,Therapeutic area,First published
0,rivaroxaban,"pulmonary embolism, TVT",2018-01-01


In [55]:
df = trial_data(df)

fetching trial data for 0, rivaroxaban


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [56]:
df

Unnamed: 0,INN,Therapeutic area,First published,n_trials,status_not_yet_recruiting,status_recruiting,status_enrolling_by_invitation,status_active_not_recruiting,status_suspended,status_terminated,...,org_network,org_nih,org_other,org_other_gov,phase_early_1,phase_not_applicable,phase_1,phase_2,phase_3,phase_4
0,rivaroxaban,"pulmonary embolism, TVT",2018-01-01,118,8,24,1,9,1,4,...,0,0,72,2,0,10,1,15,41,14


In [57]:
df_abstracts = pubmed_data(df)

index: 0, INN: rivaroxaban
results: 524


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


no abstract
no abstract
# of abstracts: 172



In [58]:
df_abstracts

Unnamed: 0,id,abstract
0,0,We sought to compare the length of stay (LOS) ...
1,0,"Thrombosis after cessation of anticoagulation,..."
2,0,
3,0,To compare hospital length of stay (LOS) and h...
4,0,"We sought to compare length-of-stay (LOS), tot..."
5,0,Due to limited evidence on the impact of rivar...
6,0,Pulmonary embolism (PE) is a life-threatening ...
7,0,Pulmonary embolism remains one of the leading ...
8,0,"In the EINSTEIN-Pulmonary Embolism (PE) trial,..."
9,0,Pulmonary embolism (PE) is a potentially life-...


In [59]:
df = pubmed_conclusions(df)

index: 0, INN: rivaroxaban
results: 524


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


no abstract
no abstract
# of abstracts: 172

0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [60]:
df

Unnamed: 0,INN,Therapeutic area,First published,n_trials,status_not_yet_recruiting,status_recruiting,status_enrolling_by_invitation,status_active_not_recruiting,status_suspended,status_terminated,...,org_other,org_other_gov,phase_early_1,phase_not_applicable,phase_1,phase_2,phase_3,phase_4,pm_results,conclusions
0,rivaroxaban,"pulmonary embolism, TVT",2018-01-01,118,8,24,1,9,1,4,...,72,2,0,10,1,15,41,14,524,In analyses restricted to low-risk patients (...


In [63]:
from utils.clinical_trials_pubmed import ct_pm

In [64]:
df = pd.DataFrame.from_dict({"INN": "rivaroxaban",
                  "Therapeutic area": "pulmonary embolism, TVT",
                  "First published": "2018-01-01"},
                           orient="index").T

In [66]:
df = ct_pm(df)

fetching trial data for 0, rivaroxaban


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['n_trials'][index] = data['n_trials']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['status_not_yet_recruiting'][index] = data['status']['Not yet recruiting']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['status_recruiting'][index] = data['status']['Recruiting']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['status_enrolling_by_invitation'][index] = data['sta

index: 0, INN: rivaroxaban
results: 524


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['pm_results'][index] = n_ids


no abstract
no abstract
# of abstracts: 172

0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['conclusions'][index] = conclusions


In [67]:
df

Unnamed: 0,INN,Therapeutic area,First published,n_trials,status_not_yet_recruiting,status_recruiting,status_enrolling_by_invitation,status_active_not_recruiting,status_suspended,status_terminated,...,org_other,org_other_gov,phase_early_1,phase_not_applicable,phase_1,phase_2,phase_3,phase_4,pm_results,conclusions
0,rivaroxaban,"pulmonary embolism, TVT",2018-01-01,118,8,24,1,9,1,4,...,72,2,0,10,1,15,41,14,524,In analyses restricted to low-risk patients (...
