# Analysis of clinical trials for hepatocellular carcinoma

In [3]:
import requests
import json
import numpy as np
from bs4 import BeautifulSoup
import lxml
import pandas as pd
import xml.etree.ElementTree as et
from os import listdir
from os.path import isfile, join

## XML clinical trial data

In [4]:
# Use Clinicaltrials.gov API to download trial information in XML format
# https://clinicaltrials.gov/ct2/results/download_fields?cond=hepatocellular+carcinoma&down_count=1000&down_fmt=xml

In [12]:
# Use ElementTree
tree = et.parse('data/ctgov_hcc_results/NCT00003147.xml')
root = tree.getroot()

In [6]:
# Extract facility information from XML. Add to dictionary of facilities with address information

trial_file = 'data/ctgov_hcc_results/NCT00003147.xml'

def get_facilities(trial_file):
    
    tree = et.parse(trial_file)
    root = tree.getroot()

    facility_dict = {}
    
    for i, facility in enumerate(root.findall('./location/facility')):
        
        address_tree = facility.find('address')
        
        # add elements to dictionary
        address_dict = {}
        
        if address_tree.find('city') != None:
            city = address_tree.find('city').text
            address_dict['city'] = city
        if address_tree.find('state') != None:
            state = address_tree.find('state').text
            address_dict['state'] = state
        if address_tree.find('zip') != None:
            zipcode = address_tree.find('zip').text
            address_dict['zip'] = zipcode
        if address_tree.find('country') != None:
            country = address_tree.find('country').text
            address_dict['country'] = country
        
        if facility.find('name') != None:
            name = facility.find('name').text
            facility_dict[name] = address_dict # add to facility dictionary
        else:
            facility_dict['unnamedFacility_' + str(i)] = address_dict # add to facility dictionary
    
    return facility_dict

In [7]:
# Extract relevant trial information from XML file and input into dictionary

trial_file = 'data/ctgov_hcc_results/NCT00003147.xml'

def extract_trial_info(trial_file):
    
    with open(trial_file) as fp:
        soup = BeautifulSoup(fp, "xml")
    
    trial_dict = {}
    
    trial_name = soup.nct_id.contents[0]
    if soup.nct_id != None:
        trial_dict['nct_id'] = soup.nct_id.contents[0]
    if soup.brief_title != None:
        trial_dict['brief_title'] = soup.brief_title.contents[0]
    if soup.official_title != None:
        trial_dict['official_title'] = soup.official_title.contents[0]
    if soup.phase == True:
        trial_dict['phase'] = soup.phase.contents[0]
    if soup.overall_status != None:
        trial_dict['status']= soup.overall_status.contents[0]
    if soup.detailed_description != None:
        trial_dict['description']= soup.detailed_description.textblock.contents[0]
    if soup.eligibility:
        if soup.eligibility.criteria != None:
            trial_dict['criteria']= soup.eligibility.criteria.textblock.contents[0]
        if soup.eligibility.gender:
            trial_dict['gender']= soup.eligibility.gender.contents[0]
        if soup.eligibility.minimum_age != None:
            trial_dict['min_age']= soup.eligibility.minimum_age.contents[0]
        if soup.eligibility.maximum_age != None:
            trial_dict['max_age']= soup.eligibility.maximum_age.contents[0]
    if soup.study_type != None:
        trial_dict['study_type']= soup.study_type.contents[0]
    if soup.brief_summary != None:
        trial_dict['summary']= soup.brief_summary.textblock.contents[0]
    trial_dict['facilities'] = get_facilities(trial_file)
    
    return trial_name, trial_dict

In [8]:
# Extract clinical trial data from multiple XML files in folder

def collect_trial_data(path, num_files=100):

    files = listdir(path=path)
    trials = {}
    
    i = 0
    for file in files:
        if i <= num_files:
        
            trial_file = path + file
            trial_name, trial_dict = extract_trial_info(trial_file)
            trials[trial_name] = trial_dict 
        i += 1
    
    return trials

In [9]:
path_folder = 'data/ctgov_hcc_results/'

trials = collect_trial_data(path_folder, num_files=100)

In [10]:
trials_df = pd.DataFrame(trials).transpose()
trials_df.head()

Unnamed: 0,brief_title,criteria,description,facilities,gender,max_age,min_age,nct_id,official_title,status,study_type,summary
NCT00004108,DX-8951f in Treating Patients With Liver Cancer,DISEASE CHARACTERISTICS: Histological...,OBJECTIVES: I. Evaluate the antitumor a...,{'University of Colorado Cancer Center': {'cit...,All,,16 Years,NCT00004108,A Phase II Study of Intravenous DX-8951f Admin...,Completed,Interventional,RATIONALE: Drugs used in chemotherapy u...
NCT00005997,Rebeccamycin Analogue in Treating Patients Wit...,DISEASE CHARACTERISTICS:  -...,OBJECTIVES:  - Determine the r...,{'Comprehensive Cancer Center at University of...,All,,18 Years,NCT00005997,Phase II and Pharmacokinetic Trial of Rebeccam...,Terminated,Interventional,RATIONALE: Drugs used in chemotherapy u...
NCT00006332,Treatment of Hepatocellular Carcinoma With Tet...,Inclusion Criteria:  - Pat...,,"{'3912 Taubman Center': {'city': 'Ann Arbor', ...",All,,18 Years,NCT00006332,,Completed,Interventional,Hepatocellular carcinoma (HCC) is a dea...
NCT00047346,Erlotinib in Treating Patients With Unresectab...,Inclusion Criteria:  - His...,PRIMARY OBJECTIVES:  I. Establish...,{'M D Anderson Cancer Center': {'city': 'Houst...,All,,18 Years,NCT00047346,"A Dose-Finding, Safety, And Pharmacokinetic St...",Completed,Interventional,Phase I trial to study the effectivenes...
NCT00057395,A Safety and Effectiveness Study of Aroplatin ...,Inclusion Criteria:  - Adv...,Primary Objective:  - Determin...,{'John Wayne Cancer Institute': {'city': 'Sant...,All,,18 Years,NCT00057395,A Phase I/II Study of Aroplatin™ in Patients W...,Unknown status,Interventional,To determine the rate of response and t...


In [11]:
# clean dataframe
trials_df['min_age'] = trials_df['min_age'].str.replace('Years', '')
trials_df['min_age'] = trials_df['min_age'].str.replace(' ', '')
trials_df['min_age'].loc[trials_df['min_age'] == 'N/A'] = 0
trials_df['min_age'] = trials_df['min_age'].apply(int)

trials_df['max_age'] = trials_df['max_age'].str.replace('Years', '')
trials_df['max_age'] = trials_df['max_age'].str.replace(' ', '')
trials_df.loc[trials_df['max_age'] == 'N/A', 'max_age'] = 100
trials_df.loc[trials_df['max_age'] == '10Days', 'max_age'] = 1
trials_df['max_age'] = trials_df['max_age'].apply(int)

trials_df['brief_title'] = trials_df['brief_title'].apply(str)
trials_df['official_title'] = trials_df['official_title'].apply(str)
trials_df['description'] = trials_df['description'].apply(str)
trials_df['summary'] = trials_df['summary'].apply(str)

In [13]:
# Create column with all the text from brief_title, official_title, and summary
trials_df['text'] = trials_df['brief_title'] + ' ' + \
                    trials_df['official_title'] + ' ' + \
                    trials_df['description'] + ' ' + \
                    trials_df['summary']

In [14]:
# Function to format and tokenize data
def trial_tokenizer(description):
    description = str(description)
    description = description.replace('\n', '').replace('\t', '')
    for i in range(3):
        description = description.replace('  ', ' ')
    description = description.lower()
    
    return description

In [15]:
text = trials_df.text.apply(trial_tokenizer)

## NLP, matrix decomposition and clustering analysis of clinical trials

In [16]:
from numpy.linalg import lstsq
from numpy.linalg import norm
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize, sent_tokenize
from sklearn.decomposition import NMF, TruncatedSVD

### NLP of trial descriptions 

In [105]:
tf = TfidfVectorizer(max_features=5000, stop_words='english')

doc_term_mat = tf.fit_transform(text.values)

In [106]:
terms = np.array(tf.get_feature_names())

In [107]:
terms

array(['000', '03', '04', ..., 'yttrium90', 'zd1839', 'μg'], 
      dtype='<U20')

In [108]:
sklearn_nmf = NMF(max_iter=50, n_components=15)

W = sklearn_nmf.fit_transform(doc_term_mat)
H = sklearn_nmf.components_

In [109]:
for i, row in enumerate(H):
    print("Topic", i, "Most Common Words:")
    print("----------------------------------")
    top = terms[np.argsort(row)[::-1][:20]]
    for t in top:
        print(t)
    print("\n")

Topic 0 Most Common Words:
----------------------------------
rfa
hcc
ablation
radiofrequency
recurrent
hepatectomy
patients
recurrence
cirrhosis
pht
ls
treatment
outcomes
rate
considered
hr
prognosis
assisted
small
systemic


Topic 1 Most Common Words:
----------------------------------
oxaliplatin
plus
leucovorin
fluorouracil
gemcitabine
folfox4
advanced
liposomal
doxorubicin
chemotherapy
versus
oxa
ld
gemox
hcc
haic
gem
prolonging
pfs
superior


Topic 2 Most Common Words:
----------------------------------
patients
arsenic
trioxide
tumor
treating
dx
ii
days
ihc
progression
dose
rebeccamycin
8951f
growth
toxicity
disease
receive
months
cancer
liver


Topic 3 Most Common Words:
----------------------------------
tace
chemoembolization
transarterial
intermediate
patients
beads
hcc
deb
cyberknife
embolization
eluting
stage
tate
carcinoma
hepatocellular
efficacy
treatment
100
doxorubicin
sbrt


Topic 4 Most Common Words:
----------------------------------
therasphere
treatment
glass
micr

In [123]:
titles = trials_df['brief_title'].values

for i in range(W.shape[1]):
    topic_arr = W[:,i]
    print("Topic", i, "Most Relevant Article Titles:")
    print("----------------------------------")
    top = titles[np.argsort(topic_arr)[::-1][:20]]
    for t in top:
        print(t)
    print("\n")

Topic 0 Most Relevant Article Titles:
----------------------------------
RFA Combined With Oxaliplatin + 5-FluoroUracil/LeucoVorin (5-FU/LV) (FOLFOX4) for Recurrent HCC
Radiofrequency Ablation Accompanied With Spontaneous Sorafenib in Early to Intermediate Stage HCC
Laparoscopic Surgery VS RFA for Recurrent HCC
Radiofrequency-assisted Hepatectomy on the Outcomes of HCC Patients With Cirrhosis
Radiofrequency Ablation in Treating Patients With Liver Cancer and Cirrhosis
HR Versus RFA for HCC in Patients With PHT
Hepatic Resection Versus TACE+RFA for BCLC Stage B Hepatocellular Carcinoma
Clinical Intervention Modelling, Planning and Proof for Ablation Cancer Treatment
Comparison Study of Sorafenib and 5-fluorouracil/Mitomycin for Metastatic Hepatocellular Carcinoma
Single Session Combined Locoregional Therapies for Hepatocellular Carcinoma
Dynamic Contrast-enhanced Magnetic Resonance Imaging in Evaluation of Liver Functional Status and Treatment Efficacy in Patients With Hepatocellular Ca

In [118]:
sklearn_svd = TruncatedSVD(n_components=15)
test_s = sklearn_svd.fit_transform(doc_term_mat)
test_d = sklearn_svd.components_

In [120]:
for i, row in enumerate(test_d):
    print("Topic", i, "Most Common Words:")
    print("----------------------------------")
    top = terms[np.argsort(row)[::-1][:20]]
    for t in top:
        print(t)
    print("\n")

Topic 0 Most Common Words:
----------------------------------
patients
hcc
sorafenib
treatment
study
liver
tumor
hepatocellular
carcinoma
tace
advanced
chemoembolization
cancer
therapy
rfa
plus
survival
arterial
safety
phase


Topic 1 Most Common Words:
----------------------------------
plus
sorafenib
oxaliplatin
advanced
bsc
leucovorin
fluorouracil
gemcitabine
folfox4
supportive
placebo
best
haic
versus
care
liposomal
hepatocellular
carcinoma
rad001
pembrolizumab


Topic 2 Most Common Words:
----------------------------------
hcc
rfa
tace
hepatectomy
resection
recurrence
recurrent
ablation
margin
chemoembolization
radiofrequency
laparoscopic
narrow
prospective
cm
stage
adjuvant
intermediate
transarterial
versus


Topic 3 Most Common Words:
----------------------------------
chemoembolization
tace
sorafenib
arterial
transcatheter
transarterial
combined
intermediate
hepatocellular
anti
hbv
adjuvant
kmg
microsphere
stage
virus
carcinoma
combination
therapy
unresectable


Topic 4 Most Co

### NLP of trial eligibility criteria

In [28]:
import re

In [22]:
trials_df.columns

Index(['brief_title', 'criteria', 'description', 'facilities', 'gender',
       'max_age', 'min_age', 'nct_id', 'official_title', 'status',
       'study_type', 'summary', 'text'],
      dtype='object')

In [26]:
test = trials_df.criteria[4]

In [27]:
test

'\n        Inclusion Criteria:\n\n          -  Advanced solid malignancies;\n\n          -  Amenable to therapy with DACH platinum agents;\n\n          -  Measurable disease (RECIST criteria);\n\n          -  ECOG performance score of 0-2;\n\n          -  Adequate hematopoietic, liver and renal function;\n\n          -  Adequate cardiac function (maximum of class II, NYHA);\n\n          -  Women of childbearing potential must have a negative urine or serum pregnancy test;\n\n          -  Signed written informed consent;\n\n          -  Subjects must be willing to be followed during the course of treatment/observation and\n             follow-up.\n\n        Exclusion Criteria:\n\n          -  No other active malignancies;\n\n          -  No prior therapy with oxaliplatin;\n\n          -  No known brain metastases;\n\n          -  Active, uncontrolled infection or other serious medical illnesses;\n\n          -  Not using or have used any investigational therapy during four weeks before 

In [34]:
test.split('Exclusion Criteria:')[1]

'\n\n          -  No other active malignancies;\n\n          -  No prior therapy with oxaliplatin;\n\n          -  No known brain metastases;\n\n          -  Active, uncontrolled infection or other serious medical illnesses;\n\n          -  Not using or have used any investigational therapy during four weeks before start of\n             protocol treatment.\n      '

In [None]:
(?:world)(.*)

In [None]:
# Create column with all the text from brief_title, official_title, and summary
trials_df['text'] = trials_df['brief_title'] + ' ' + \
                    trials_df['official_title'] + ' ' + \
                    trials_df['description'] + ' ' + \
                    trials_df['summary']