# Initial setup

Let's import the required libraries and set up global variables for the rest of the script.

In [1]:
# coding: utf-8

import csv
import os
import re
import shutil
import string
import tempfile
import zipfile
from collections import defaultdict
from lxml import objectify

import nltk
import pandas as pd
import requests

In [2]:
# Directory to hold the downloaded data and the serialized Pandas dataframes. 
# We'll create a new temporary directory here.
#data_dir = tempfile.mkdtemp()
#data_dir='C:\Study\CS102\project\data-dir'
data_dir='/Users/Lo/Work/cs109project-data'
print("Working directory: %s" % data_dir)

Working directory: /Users/Lo/Work/cs109project-data


# Data download

Download CSV data from clinicaltrials.gov. The data will be written in the working directory specified above as  [data_dir]/study_fields.csv.

For clinicaltrials.gov, a search term needs to be specified. In this example, we'll download search results for the term "seizure".

In [None]:
def download_ctgov(dest_dir, search_term):
    print("Downloading clinicaltrials.gov results for '%s' to %s" % (search_term, dest_dir))
    dl_url = "https://clinicaltrials.gov/ct2/results/download?down_stds=all&down_typ=results&down_flds=all&down_fmt=xml&term=%s&show_down=Y" % search_term

    # Download the zipped data and extract it to the output directory
    out_path = os.path.join(dest_dir, "download_ctgov.zip")
    with open(out_path, 'wb') as fh:
        r = requests.get(dl_url)
        for block in r.iter_content(1024):
            fh.write(block)
    with zipfile.ZipFile(out_path, 'r') as z:
        z.extractall(dest_dir)

In [None]:
#download_ctgov(data_dir, "seizure")

# Pandas import

Convert the downloaded CSV data to Pandas dataframes and serialize them as Python pickles. The function reads XML files from the working directory and writes to "ctgov.pckl"

In [None]:
def ctgov_to_dataframe(src_dir):
    # Get all XML files in the data directory
    print("Transforming cliniclatrials download (%s) to dataframe" % (src_dir))
    data = []
    for f in [_ for _ in os.listdir(src_dir) if _.endswith('.xml')]:
        xml = objectify.parse(os.path.join(src_dir, f))
        root = xml.getroot()
        d = defaultdict(list)
        for t in root.iter():
            if t.text:
                key = re.sub(r'\[\d+\]', '', xml.getpath(t)).replace('/clinical_study/', '').replace('/', '.')
                val = t.text.strip()
                d[key].append(val)
        d = {k: v[0] if len(v) == 1 else v for k, v in d.items()}
        data.append(d)
    data_frame = pd.DataFrame(data)
    return data_frame

# Writing dataframes
Transform the downloaded data to Pandas dataframes and seialize them as Python pickles.

In [None]:
ct_df = ctgov_to_dataframe(data_dir)
ct_df.to_pickle(os.path.join(data_dir, 'ctgov.pckl'))

# Reading dataframes

Read the pickled data back into Pandas and display the first 5 records. In this example, the pickled dataframe is serialized to "ctgov.pckl" in the working directory.

In [None]:
ctgov_data = pd.read_pickle(os.path.join(data_dir, 'ctgov.pckl'))
ctgov_data.head()

# Extract criteria

Read inthe serialized data from clinicaltrials.gov and extract inclusion/exlcusion criteria, one per row. Output a Series(id_info.nct_id, Criteria, Inclusion, TokenCount).

In [None]:
def __process_criteria(data):
    pat = r"^([\w\-]*\s*){0,5}%s criteria[\s\w\(\),]*:"
    inpat = re.compile(pat % 'inclusion', re.UNICODE)
    expat = re.compile(pat % 'exclusion', re.UNICODE)
    try:
        incl = True
        id = data[1]
        txt = [_.strip() for _ in unicode(data[2]).split(u'\n\n')] # ZL - modified, added unicode() around data[2]
        for l in txt:
            if re.match(inpat, l.lower()):
                incl = True
            elif re.match(expat, l.lower()):
                incl = False
            else:
                toks = nltk.word_tokenize(l)
                s = {'NctId': id, 'Criteria': unicode(l), 'Include': incl, 'Tokens': toks, 'TokenCount': len(toks)}
                yield s
    except Exception as e:
        print("Error processing row %s: %s" % (data[2], e))


def extract_criteria(data):
    print("Transforming data (extracting criteria)")
    transformed = [s for row in data[['id_info.nct_id', 'eligibility.criteria.textblock']].itertuples() for s in
                   __process_criteria(row)]
    df = pd.DataFrame(transformed)
    return df

Transform the data and write the result to a file. (You'll notice that the script logs an error for one row. This is expected and results from that row being a "NaN".)

In [None]:
# The data was read in in the previous step: ctgov_data = pd.read_pickle(os.path.join(data_dir, 'ctgov.pckl'))
criteria = extract_criteria(ctgov_data)
criteria.to_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))

Read back the data and display a record selected by column value.

In [None]:
criteria.loc[criteria['NctId'] == 'NCT01373190']

# Tag, lemmatize, ngrammize

Processes the extracted criteria with the NLTK POS tagger and lemmatizer and generates ngrams of 1-3 words (note: while unigrams are technically duplicated as 'Tokens', it will be more convenient to allow this and keep them in one column with bigrams and trigrams). Preprocesses the tokens by removing special characters and punctuation. Lemmata and ngrams are lowercased.

In [3]:
def __lemmatise(lemmatizer, r):
    wn_tags = {'NN': nltk.corpus.wordnet.NOUN, 'JJ': nltk.corpus.wordnet.ADJ, 'VB': nltk.corpus.wordnet.VERB,
               'RB': nltk.corpus.wordnet.ADV}
    return [(t[0], lemmatizer.lemmatize(t[0].lower(), pos=wn_tags.get(t[1][:2], nltk.corpus.wordnet.NOUN)).lower()) for
            t in r]


def tag_and_stem(data):
    print("Transforming data (tagging and lemmatising)")
    series = []
    lemmatizer = nltk.stem.WordNetLemmatizer()
    punct = '[%s]*' % re.escape(string.punctuation)
    pat = re.compile(r"^(%(p)s[\w\d]+%(p)s)+$" % {'p': punct}, re.UNICODE)
    # Itertuples is 50% faster than df.apply()
    for row in data[['NctId', 'Tokens']].itertuples():
        id = row[1]
        toks = filter(lambda t: re.match(pat, t), row[2])
        tags = nltk.pos_tag(toks)
        lemmas = __lemmatise(lemmatizer, tags)
        ngrams = []
        for n in (1, 3):
            ngrams += list(nltk.ngrams([(lemma[1], tags[idx][1]) for idx, lemma in enumerate(lemmas)], n))
        s = {'NctId': id, 'Tokens': toks, 'Tags': tags, 'Lemmas': lemmas, 'Ngrams': ngrams}
        series.append(s)
    df = pd.DataFrame(series)
    return df

Read in the extracted criteria (by stored in "ct_criteria.pckl" in the previous step), tag, lemmatize and ngrammize the data and store it as "ct_tagged.pckl".

In [6]:
%%time
#criteria = pd.read_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))
criteria.head(5)
tagged = tag_and_stem(criteria.head(5))
tagged
# takes about 2s to process 1 row in criteria


Transforming data (tagging and lemmatising)
CPU times: user 9.64 s, sys: 193 ms, total: 9.83 s
Wall time: 9.89 s


In [14]:
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))
# complete tag_and_stem(criteria) takes up to 10hrs, using top 500
tagged = tag_and_stem(criteria.head(200))
tagged.to_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))

Transforming data (tagging and lemmatising)


In [15]:
tagged.head()

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((history, NN),), ((of, IN),), ((uncontrolled...","[(History, NN), (of, IN), (uncontrolled, JJ), ...","[History, of, uncontrolled, seizures, at, the,..."
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((seizure, NN),), ((frequency, NN),), ((by, I...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su..."
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS),), ((of, IN),), ((any, DT),),...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]"
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((patient, NNS),), ((and, CC),), ((parent, NN...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap..."
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients]


# Filter criteria
Filters out criteria composed entirely of function words and stopwords. Strips ngrams composed entirely of stop words/tags from the ngram list. By default this function uses the NTLK stopword list and all PTB tags except nouns. Additional lists of stop words and stop tags can be supplied with keyword arguments ("stop_words", "stop_tags"). Returns a tuple of dataframes, (filtered_criteria, excluded_criteria).

(Note: this step generates a SettingWithCopyWarning. This is known and is a false positive.)

In [16]:
def __filter(values, idx, stops):
    return not set([t[idx] for t in values]) <= stops


def filter_criteria(data, user_stop_words=[], user_stop_tags=[]):
    print("Filtering criteria")
    default_stop_words = nltk.corpus.stopwords.words('english')
    default_stop_tags = ["$", "''", "(", ")", ",", "--", ".", ":", "CC", "CD", "DT",
                         "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                         "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP",
                         "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
                         "WDT", "WP", "WP$", "WRB", "``"]
    print("Filtering stops")
    stop_words = set(default_stop_words + user_stop_words)
    stop_tags = set(default_stop_tags + user_stop_tags)
    excluded = pd.DataFrame()
    for col, idx, stops in (
            ('Lemmas', 0, stop_words),
            ('Tags', 1, stop_tags)):  # Lemma filtering excludes 18 rows, tag filtering excludes 205
        data['Ngrams'] = data['Ngrams'].apply(lambda row: [ngram for ngram in row if __filter(ngram, idx, stops)])
        groups = data.groupby(lambda r: __filter(data[col].loc[r], 1, stops))
        data = groups.get_group(True)
        excluded = excluded.append(groups.get_group(False)) if groups.groups.has_key(False) else excluded
    return (data, excluded)

Read in the tagged criteria (stored in "ct_tagged.pckl" in the previous step), filter out noise and write the results to "ct_filtered.pckl" (the included criteria) and "ct_excluded.pckl" (the excluded criteria).

In [17]:
%time
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))
incl, excl = filter_criteria(criteria)
incl.to_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
excl.to_pickle(os.path.join(data_dir, 'ct_excluded.pckl'))

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.06 µs
Filtering criteria
Filtering stops


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
incl.head()

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((history, NN),), ((seizure, NNS),), ((time, ...","[(History, NN), (of, IN), (uncontrolled, JJ), ...","[History, of, uncontrolled, seizures, at, the,..."
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((seizure, NN),), ((frequency, NN),), ((histo...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su..."
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS),), ((age, NN),), ((patient, N...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]"
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((patient, NNS),), ((parent, NNS),), ((guardi...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap..."
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients]


In [19]:
incl.Ngrams[0]

[((u'history', 'NN'),),
 ((u'seizure', 'NNS'),),
 ((u'time', 'NN'),),
 ((u'month', 'NNS'),),
 ((u'year', 'NNS'),),
 ((u'pattern', 'NN'),),
 ((u'seizure', 'NNS'),),
 ((u'month', 'NNS'),),
 ((u'admission', 'NN'),),
 ((u'history', 'NN'), (u'of', 'IN'), (u'uncontrolled', 'JJ')),
 ((u'of', 'IN'), (u'uncontrolled', 'JJ'), (u'seizure', 'NNS')),
 ((u'uncontrolled', 'JJ'), (u'seizure', 'NNS'), (u'at', 'IN')),
 ((u'seizure', 'NNS'), (u'at', 'IN'), (u'the', 'DT')),
 ((u'the', 'DT'), (u'present', 'JJ'), (u'time', 'NN')),
 ((u'present', 'JJ'), (u'time', 'NN'), (u'at', 'IN')),
 ((u'time', 'NN'), (u'at', 'IN'), (u'least', 'JJS')),
 ((u'during', 'IN'), (u'6', 'CD'), (u'month', 'NNS')),
 ((u'6', 'CD'), (u'month', 'NNS'), (u'of', 'IN')),
 ((u'month', 'NNS'), (u'of', 'IN'), (u'the', 'DT')),
 ((u'past', 'JJ'), (u'3', 'CD'), (u'year', 'NNS')),
 ((u'3', 'CD'), (u'year', 'NNS'), (u'and', 'CC')),
 ((u'year', 'NNS'), (u'and', 'CC'), (u'preferably', 'RB')),
 ((u'a', 'DT'), (u'continuous', 'JJ'), (u'pattern', 'N

In [20]:
excl.head(200)

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens
149,"[(Other, other)]",NCT00004399,[],"[(Other, JJ)]",[Other]
160,"[(Age, age), (16, 16), (to, to), (65, 65)]",NCT00004403,[],"[(Age, $), (16, CD), (to, TO), (65, CD)]","[Age, 16, to, 65]"


In [37]:
#medex input % output directory
medex_in = '/Users/Lo/Work/cs109project-data/input/'
medex_out = '/Users/Lo/Work/cs109project-data/output/'

In [134]:
#create input files for medex to work on
def add_medex_prep(df, medex_in):
    #create medex input files
    for (count, row) in enumerate(incl.Tokens):
        f = open(medex_in+str(count)+".txt", "w")
        for word in row:
            print >>f, word,
        print >>f
        f.close()

add_medex_prep(incl, medex_in)

#then execute medex externally
##java -Xmx1024m -cp lib/*:bin org.apache.medex.Main -i '/Users/Lo/Work/cs109project-data/input/' -o '/Users/Lo/Work/cs109project-data/output/'

In [146]:
import glob
#parse medex output results, add column, delete the generated txt files
def add_medex_column(df, medex_in, medex_out):
    
    #remove empty output files
    for dirpath, dirs, files in os.walk(medex_out):
        for file in files: 
            path = os.path.join(dirpath, file)
            print path
            if os.stat(path).st_size == 0:
                os.remove(path)
                
    #read raw data in txt files
    rowlist = []
    raw_drug_data = []
    os.chdir(medex_out)
    for file in glob.glob("*.txt"):
        rowlist.append(int(file[:-4])) # filename - row index in dataframe
        with open (medex_out+file, "r") as raw:
            raw_drug_data.append(raw.read())
            
    #parse raw drug data
    processed_drug_data = []
    for i in range(len(raw_drug_data)):
        processed = [re.sub(r'\[.+?\]\s*', '', s).replace('\n', '') for s in raw_drug_data[i].split('\n1')[0].split('|')[1:]]
        # medex identify refusal as a drug. Remove these results.
        # may need to look at other ones
        if "refusal" in [col.lower() for col in processed]:
            rowlist.remove(rowlist[i])
        else:
            processed_drug_data.append(processed)
            
    #create MedEx column and add parsed medex data to df
    df['MedEx']=''
    for (count, i) in enumerate(rowlist):
        df['MedEx'][i] = processed_drug_data[count]
    
    #delete all files under the input and output directories
    files = glob.glob(medex_out+'*') + glob.glob(medex_in+'*')
    for f in files:
        os.remove(f)
    
    return df
    

In [147]:
add_medex_column(incl, medex_in, medex_out)

/Users/Lo/Work/cs109project-data/output/108.txt
/Users/Lo/Work/cs109project-data/output/11.txt
/Users/Lo/Work/cs109project-data/output/120.txt
/Users/Lo/Work/cs109project-data/output/126.txt
/Users/Lo/Work/cs109project-data/output/128.txt
/Users/Lo/Work/cs109project-data/output/139.txt
/Users/Lo/Work/cs109project-data/output/14.txt
/Users/Lo/Work/cs109project-data/output/15.txt
/Users/Lo/Work/cs109project-data/output/150.txt
/Users/Lo/Work/cs109project-data/output/16.txt
/Users/Lo/Work/cs109project-data/output/170.txt
/Users/Lo/Work/cs109project-data/output/182.txt
/Users/Lo/Work/cs109project-data/output/189.txt
/Users/Lo/Work/cs109project-data/output/190.txt
/Users/Lo/Work/cs109project-data/output/29.txt
/Users/Lo/Work/cs109project-data/output/42.txt
/Users/Lo/Work/cs109project-data/output/47.txt
/Users/Lo/Work/cs109project-data/output/49.txt
/Users/Lo/Work/cs109project-data/output/52.txt
/Users/Lo/Work/cs109project-data/output/53.txt
/Users/Lo/Work/cs109project-data/output/62.txt
/Us

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,MedEx
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((history, NN),), ((seizure, NNS),), ((time, ...","[(History, NN), (of, IN), (uncontrolled, JJ), ...","[History, of, uncontrolled, seizures, at, the,...",
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((seizure, NN),), ((frequency, NN),), ((histo...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS),), ((age, NN),), ((patient, N...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]",
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((patient, NNS),), ((parent, NNS),), ((guardi...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap...",
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients],
5,"[(Normal, normal), (volunteers, volunteer)]",NCT00001192,"[((volunteer, NNS),)]","[(Normal, JJ), (volunteers, NNS)]","[Normal, volunteers]",
6,"[(Subjects, subject), (over, over), (95, 95), ...",NCT00001192,"[((subject, NNS),), ((year, NNS),), ((age, NN)...","[(Subjects, NNS), (over, IN), (95, CD), (years...","[Subjects, over, 95, years, of, age]",
7,"[(Subjects, subject), (with, with), (a, a), (h...",NCT00001192,"[((subject, NNS),), ((history, NN),), ((alcoho...","[(Subjects, NNS), (with, IN), (a, DT), (histor...","[Subjects, with, a, history, of, alcohol, or, ...",
8,"[(1, 1), (Patients, patient), (ages, age), (2,...",NCT00001205,"[((patient, NNS),), ((year, NNS),), ((therapy,...","[(1, CD), (Patients, NNS), (ages, VBZ), (2, CD...","[1, Patients, ages, 2, to, 75, years, will, be...","[corticosteroids, , , , , , , , , C0001617, 35..."
9,"[(2, 2), (Patients, patient), (with, with), (p...",NCT00001205,"[((patient, NNS),), ((proven, NN),), ((neurocy...","[(2, CD), (Patients, NNS), (with, IN), (proven...","[2, Patients, with, proven, or, likely, neuroc...",


In [154]:
add_medex_column(incl, medex_in, medex_out).to_pickle(os.path.join(data_dir, 'ct_medex.pckl'))