In [51]:
import os
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tabpy_client
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [52]:
# Instructions to run this notebook:
# 1) Set `path` below to the directory on your machine that holds the data files.
# 2) Download the following datasets from fda.gov into that directory:
#    a) pmn96cur.txt is the "1996-current" dataset from https://www.fda.gov/MedicalDevices/ProductsandMedicalProcedures/DeviceApprovalsandClearances/510kClearances/ucm089428.htm
#    b) foiclass.txt is in "foiclass.zip" at https://www.fda.gov/MedicalDevices/DeviceRegulationandGuidance/Overview/ClassifyYourDevice/ucm051668.htm
path = '/Users/nandini/Desktop/dvhacks/'

# Both files are read as pipe-delimited text, resolved relative to `path` so the
# setting above actually takes effect. The separator is a raw-string regex so the
# backslash is not parsed as an (invalid) Python string escape.
devices = pd.read_csv(os.path.join(path, 'pmn96cur.txt'), sep=r'\|', engine='python')
product_codes = pd.read_csv(os.path.join(path, 'foiclass.txt'), sep=r'\|', engine='python')

In [53]:
# Data cleaning: keep only the columns needed, join the clearance records to the
# device names on PRODUCTCODE, and give every advisory committee an integer id.
col = ['REVIEWADVISECOMM', 'DEVICENAME']
product_codes = product_codes[['PRODUCTCODE', 'DEVICENAME']]
devices = devices[['REVIEWADVISECOMM', 'PRODUCTCODE']]
# After the join only the committee code and device name are retained.
combined_df = devices.merge(product_codes, on='PRODUCTCODE')[col]
# factorize() numbers the committee codes in order of first appearance.
combined_df['category_id'] = pd.factorize(combined_df['REVIEWADVISECOMM'])[0]

In [54]:
# Build lookup tables between committee codes and their integer ids
category_id_df = (
    combined_df[['REVIEWADVISECOMM', 'category_id']]
    .drop_duplicates()
    .sort_values(by='category_id')
)
# Each row of .values is a (committee code, id) pair.
category_to_id = {code: cat_id for code, cat_id in category_id_df.values}
id_to_category = {cat_id: code for code, cat_id in category_id_df.values}

In [55]:
# Keep a single row per distinct device name (the first occurrence is retained),
# then preview the result.
df = combined_df.drop_duplicates(subset=['DEVICENAME'], keep='first')
df.head()

Unnamed: 0,REVIEWADVISECOMM,DEVICENAME,category_id
0,AN,"Apparatus, Nitric Oxide Delivery",0
21,SU,"Prosthesis, Tracheal, Expandable",1
58,GU,"Stimulator,Peripheral Nerve,Non-Implanted,For ...",2
64,RA,"Coil, Magnetic Resonance, Specialty",3
492,OR,"Prosthesis, Elbow, Constrained, Cemented",4


In [56]:
# Configure the tf-idf vectorizer: unigrams and bigrams, English stop words
# removed, sublinear term-frequency scaling, L2-normalised rows.
tfidf = TfidfVectorizer(
    encoding='latin-1',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=1,
    sublinear_tf=True,
    norm='l2',
)

In [57]:
# Fit the vectorizer on the device names, then convert the result to a dense array.
tfidf_matrix = tfidf.fit_transform(df['DEVICENAME'])
features = tfidf_matrix.toarray()

In [58]:
# Integer committee id for every row of `features` (same row order as `df`).
labels = df['category_id']

In [59]:
features.shape 
# Shape is (rows, columns). With the recorded output (2796, 9675):
# each of the 2796 de-duplicated device names is represented by 9675 features,
# and each feature is the tf-idf score of one unigram or bigram.

(2796, 9675)

In [60]:
# Find terms most correlated with each advisory committee
N = 2  # number of top unigrams / bigrams printed per committee

# The vocabulary is identical for every committee, so build the array once
# instead of on every loop iteration. Newer scikit-learn releases expose it as
# get_feature_names_out(); fall back to get_feature_names() where that method
# is not available.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for REVIEWADVISECOMM, category_id in sorted(category_to_id.items()):
    # One-vs-rest test: this committee against all the others.
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort of the chi-squared statistics, so the strongest terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(REVIEWADVISECOMM))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'AN':
  . Most correlated unigrams:
. airway
. oxygen
  . Most correlated bigrams:
. patient interface
. analyzer gas
# 'CH':
  . Most correlated unigrams:
. method
. radioimmunoassay
  . Most correlated bigrams:
. cpk isoenzymes
. clinical use
# 'CV':
  . Most correlated unigrams:
. bypass
. cardiopulmonary
  . Most correlated bigrams:
. blood pressure
. cardiopulmonary bypass
# 'DE':
  . Most correlated unigrams:
. orthodontic
. dental
  . Most correlated bigrams:
. bone grafting
. root canal
# 'EN':
  . Most correlated unigrams:
. ent
. hearing
  . Most correlated bigrams:
. nose throat
. hearing aid
# 'GU':
  . Most correlated unigrams:
. endoscope
. dialysate
  . Most correlated bigrams:
. catheter hemodialysis
. accessories flexible
# 'HE':
  . Most correlated unigrams:
. cell
. hemoglobin
  . Most correlated bigrams:
. control hemoglobin
. occult blood
# 'HO':
  . Most correlated unigrams:
. examination
. infusion
  . Most correlated bigrams:
. examination glove
. pump infusio

In [61]:
# Linear SVC (most appropriate model as determined in committees.py)
# Hold out part of the data; random_state pins the split so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['DEVICENAME'], df['REVIEWADVISECOMM'], random_state=0
)

# Feature pipeline: raw token counts -> tf-idf weights. Both fitted objects are
# needed again at prediction time so new text is transformed the same way.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Seed the solver as well, so re-running the notebook refits the same model
# rather than one that depends on the global random state.
clf = LinearSVC(random_state=0).fit(X_train_tfidf, y_train)

In [None]:
# Function that takes in a medical keyword and outputs the most associated FDA committee
def DevicesDemo(keyword):
    """Predict the FDA advisory committee code for one or more keywords.

    Parameters
    ----------
    keyword : str or iterable of str
        A single keyword/phrase, or several of them. A bare string is treated
        as one document.

    Returns
    -------
    The result of ``clf.predict``: one predicted committee label per keyword,
    in input order.
    """
    # Wrap a bare string so it is vectorized as one document; anything else is
    # taken to be a collection of documents already.
    documents = [keyword] if isinstance(keyword, str) else list(keyword)
    # The classifier was fitted on tf-idf weighted counts, so new text must go
    # through the same two steps (counts, then tf-idf) before prediction.
    counts = count_vect.transform(documents)
    committee = clf.predict(tfidf_transformer.transform(counts))
    return committee

In [None]:
# Example call. The notebook author reports that the keyword "wearable" is most
# associated with the "CV" (Cardiovascular) FDA committee; no output is recorded here.
example_committee = DevicesDemo("wearable")
print(example_committee)

In [None]:
# Open a client connection to the TabPy server at the address below
tabpy_endpoint = 'http://localhost:9004/'
connection = tabpy_client.Client(tabpy_endpoint)

In [None]:
# Publish DevicesDemo to the TabPy server under the name 'DevicesDemo' so Tableau
# can call it; override=True replaces any endpoint already deployed with that name.
connection.deploy(
    'DevicesDemo',
    DevicesDemo,
    'Suggests an FDA Advisory Committee based on a medical keyword',
    override=True,
)