# Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
import re
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer 
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier
# Performance metric
from sklearn.metrics import f1_score

# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# Let pandas display up to 300 characters per cell so the long free-text columns stay readable.
pd.set_option('display.max_colwidth', 300)

# Load Data

In [2]:
# Location of the pre-cleaned events table.
# NOTE(review): this is an absolute, machine-specific path; point it at a local copy of df_final.csv before re-running.
df_final_path = 'C:/Users/melin/Documents/Springboard Data Science Career Track/Capstone Projects/Capstone Project 1/df_final.csv'

# Read the table and preview its first rows.
df_final = pd.read_csv(df_final_path)
df_final.head()

Unnamed: 0,device_id,device_name,device_country,event_id,action_classification,event_country,reason,type,clean_reason
0,861.0,Invacare Action 3 Junior Push Bar Handle,NZL,957.0,,NZL,The failure of the finger bolts can lead to injuries if control of the chair is lost.,Recall,failur finger bolt lead injuri control chair lost
1,862.0,Bio-Rad D-10 Hemoglobin A Program Reorder pack,NZL,958.0,,NZL,Medical Device Correction iniitiated as a result of of customer complaints investigation for anothe product regarding calibration failure and late retention times associated with Elution Buffers. The company examined the potential for a similar problem with the above buffer lots. Bio-Rad request...,Recall,medic devic correct iniiti result custom complaint investig anoth product regard calibr failur late retent time associ elut buffer compani examin potenti similar problem abov buffer lot bio rad request custom discard affect product
2,863.0,"Bio-Rad VARIANT 11 beta Thalassemmia Short program Reorder pack, 500 tests",NZL,959.0,,NZL,Medical Device Correction iniitiated as a result of of customer complaints investigation for anothe product regarding calibration failure and late retention times associated with Elution Buffers. The company examined the potential for a similar problem with the above buffer lots. Bio-Rad request...,Recall,medic devic correct iniiti result custom complaint investig anoth product regard calibr failur late retent time associ elut buffer compani examin potenti similar problem abov buffer lot bio rad request custom discard affect product
3,1062.0,Bio-Rad IH-Com Kit Fullversion (Data management and result Interpretation),NZL,1159.0,,NZL,"In case of non interpretable results in the anti-AB well (ABO3) of ID-cards intended for ABO forward grouping, the IH-Com software does not include the reaction of this well (result code-30) wheh sending the result to the laboratory Host. The issue can only occur if the following conditions are ...",Recall,case non interpret result anti ab well abo id card intend abo forward group ih com softwar doe includ reaction thi well result code wheh send result laboratori host issu onli occur follow condit met ih com softwar connect laboratori host astm commun mode abo interpret result obtain due reaction ...
4,1088.0,Bio-Rad Urinary Catecholamines by HPLC Reagent Kit,NZL,1185.0,,NZL,"In some urine samples the concentration of adrenaline/epinephrine may decrease when analysis time is >8 hours., This decrease in concentration occurs only, if samples were subsequently adjusted to pH 5 to pH 7 as described in the instruction for use for very acidic samples or sampl...",Recall,urin sampl concentr adrenalin epinephrin may decreas analysi time hour thi decreas concentr occur onli sampl subsequ adjust ph ph describ instruct use veri acid sampl sampl low volum risk fals neg result report sampl experi condit


# Converting Text to Features

In [3]:
#Remove rows containing missing values under the clean_reason column.
#An explicit copy is taken so that the column assignment below writes to an independent
#dataset instead of a slice of the loaded one (this avoids pandas' SettingWithCopyWarning).
df_final = df_final[df_final['clean_reason'].notna()].copy()

#Changing 'Safety alert' value in type column into 'Safety Alert'.
#Only cells that are exactly 'Safety alert' are replaced; other values are left untouched.
df_final['type'] = df_final['type'].replace({'Safety alert': 'Safety Alert'})

In [4]:
#Preparing data for k-fold cross-validation: keep the cleaned text and the event type
#under the generic column names 'text' and 'class'
data = df_final[['clean_reason', 'type']].rename(columns={'clean_reason': 'text', 'type': 'class'})

#Shuffle the rows with a fixed seed. The seed makes the folds reproducible, which matters
#because train_text / train_y / test_text / test_y from the LAST fold stay defined after the
#loop and are reused by the cells that follow.
shuffle_rng = np.random.RandomState(1)
data = data.reindex(shuffle_rng.permutation(data.index))

#Remove rows containing missing values under the text column
data = data[data['text'].notnull()]

#Create a pipeline merging the feature extraction and the naive Bayes classifier into one operation
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),  #unigram and bigram counts as features
    ('classifier', MultinomialNB()),
])

#Performing k-fold cross-validation
k_fold = KFold(n_splits=10, shuffle=True, random_state=1)  #10 folds, deterministic split
scores = []
for train_indices, test_indices in k_fold.split(data):
    train_fold = data.iloc[train_indices]
    test_fold = data.iloc[test_indices]

    train_text = np.asarray(train_fold['text'])
    train_y = np.asarray(train_fold['class'])

    test_text = np.asarray(test_fold['text'])
    test_y = np.asarray(test_fold['class'])

    #fit() re-trains the pipeline from scratch on this fold's training rows
    pipeline.fit(train_text, train_y)
    scores.append(pipeline.score(test_text, test_y))

#Mean accuracy over all folds
score = sum(scores) / len(scores)

In [5]:
#Check the mean accuracy score of the k-fold cross-validation
#(score is the average of the per-fold pipeline.score values collected in scores)
score

0.9488685320861693

In [6]:
#Using TF-IDF to extract features from the cleaned version of the reason data
#max_df=0.8: ignore terms that occur in more than 80% of the documents
#max_features=10000: cap the vocabulary at 10,000 terms
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000) # keep the 10,000 most frequent terms in the data as features

In [7]:
#Create TF-IDF features
#NOTE: train_text and test_text are the arrays left over from the last iteration of the
#cross-validation loop, so the final fold's split doubles as the train/validation split here.
#The vocabulary and IDF weights are learned from the training text only ...
xtrain_tfidf = tfidf_vectorizer.fit_transform(train_text)
#... and then applied unchanged to the validation text.
xval_tfidf = tfidf_vectorizer.transform(test_text)

In [8]:
##Format train_y and test_y for input into sklearn's MultiLabelBinarizer

#Wrap each label array in a single-column dataset
df_train_y = pd.DataFrame(train_y, columns=['new_type'])
df_test_y = pd.DataFrame(test_y, columns=['new_type'])

#Turn every label string into a list of labels by splitting on ' / '
#(a combined value such as 'Recall / Safety Alert' becomes two separate labels)
for label_frame in (df_train_y, df_test_y):
    label_frame['new_type'] = label_frame['new_type'].str.split(' / ')

#Print the label distribution of both splits; warnings raised inside this block are
#silenced and the previous warning filters are restored when the block exits
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for label_frame in (df_train_y, df_test_y):
        print(label_frame['new_type'].value_counts())

[Recall]                  51271
[Safety Alert]             3440
[Field Safety Notice]       880
[Recall, Safety Alert]       13
Name: new_type, dtype: int64
[Recall]                  5684
[Safety Alert]             381
[Field Safety Notice]      112
[Recall, Safety Alert]       1
Name: new_type, dtype: int64


In [9]:
#One hot encode the target variable, i.e., type by using sklearn's MultiLabelBinarizer
multilabel_binarizer = MultiLabelBinarizer()

#fit() replaces whatever classes were learned before, so fitting on the train labels and then
#on the test labels would keep only the labels seen in the test split. Learn the label set
#from both splits in a single call instead, so no label present in either split is dropped.
multilabel_binarizer.fit(pd.concat([df_train_y['new_type'], df_test_y['new_type']], ignore_index=True))

MultiLabelBinarizer(classes=None, sparse_output=False)

In [10]:
#Show the labels the binarizer learned; their order is the column order of the encoded targets
multilabel_binarizer.classes_

array(['Field Safety Notice', 'Recall', 'Safety Alert'], dtype=object)

In [11]:
# Encode the label lists of both splits as 0/1 indicator rows, one column per learned label
ytrain, yval = (
    multilabel_binarizer.transform(split_labels['new_type'])
    for split_labels in (df_train_y, df_test_y)
)

# Build Event Type Prediction Model

In [12]:
#Treat the multi-label task as Binary Relevance (one-vs-rest): sklearn's OneVsRestClassifier
#wraps the logistic regression so that one binary classifier is trained per label
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [13]:
#Fit model on train data: TF-IDF features as input, binarized event-type labels as targets
clf.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [14]:
#Make predictions for validation set
#Each row of y_pred holds one 0/1 flag per label, in the order of multilabel_binarizer.classes_
y_pred = clf.predict(xval_tfidf)

In [15]:
#Check out a sample from these predictions (the indicator row of the fourth validation document)
y_pred[3]

array([0, 1, 0])

In [16]:
#Convert the predicted arrays into event type tags
#inverse_transform maps every indicator row back to a tuple of label names; show the same sample as above
multilabel_binarizer.inverse_transform(y_pred)[3]

('Recall',)

In [17]:
# evaluate performance
# micro averaging pools the true/false positives and false negatives of all labels before computing F1
f1_score(yval, y_pred, average="micro")

0.9555069292487235

# Create Inference Function

In [18]:
#Add a new_type column to df_final: the type string split on ' / ' into a list of labels
type_label_lists = df_final['type'].str.split(' / ')
df_final['new_type'] = type_label_lists

#Print how often each label combination occurs; warnings raised inside this block are
#silenced and the previous warning filters are restored when the block exits
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    print(df_final['new_type'].value_counts())

[Recall]                  56955
[Safety Alert]             3821
[Field Safety Notice]       992
[Recall, Safety Alert]       14
Name: new_type, dtype: int64


In [19]:
#Load text-cleaning functions

# function for text cleaning
def clean_text(text):
    """Return `text` reduced to lowercase ASCII letters separated by single spaces.

    Every character outside a-z / A-Z becomes a space, runs of whitespace are
    collapsed to one space (leading and trailing whitespace is dropped), and the
    result is lowercased.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    single_spaced = ' '.join(letters_only.split())
    return single_spaced.lower()

# function for stemming words
def stem_text(text):
    """Tokenize `text` and return the Porter stem of every token.

    `text` is converted with str() before tokenizing. Each stem is followed by
    a single space, so a non-empty result ends with a trailing space and an
    input without tokens yields "".
    """
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(text))
    return "".join(stemmer.stem(token) + " " for token in tokens)

# function to remove stopwords
def remove_stopwords(text):
    """Return `text` without English stopwords.

    `text` is converted with str() and split on whitespace; words found in
    NLTK's English stopword list are dropped and the remaining words are
    joined with single spaces. Matching is case-sensitive.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_words = []
    for word in str(text).split():
        if word not in english_stopwords:
            kept_words.append(word)
    return ' '.join(kept_words)

In [20]:
#Create inference function
def infer_event(q):
    """Predict the event type label(s) for one raw reason text `q`.

    The text goes through clean_text, stem_text and remove_stopwords (in that
    order), is vectorized with the fitted tfidf_vectorizer and classified by
    clf. The result is what multilabel_binarizer.inverse_transform returns for
    the single prediction row: a one-element list holding a tuple of labels.
    """
    prepared = remove_stopwords(stem_text(clean_text(q)))
    features = tfidf_vectorizer.transform([prepared])
    indicator_rows = clf.predict(features)
    return multilabel_binarizer.inverse_transform(indicator_rows)

In [21]:
# Spot-check the model on 10 randomly drawn rows: print the device name, the event type
# predicted from the raw reason text, and the actual event type labels.
for i in range(10):
    # index label of one randomly sampled row
    k = df_final['reason'].sample(1).index[0]
    predicted = infer_event(df_final.at[k, 'reason'])
    print("Medical Device: ", df_final.at[k, 'device_name'], "\nPredicted Event Type: ", predicted)
    print("Actual Event Type: ", df_final.at[k, 'new_type'], "\n")

Medical Device:  ARCHITECT SYSTEM - PROGESTERONE ASSAY 
Predicted Event Type:  [('Recall',)]
Actual Event Type:  ['Recall'] 

Medical Device:  Single Shot Epidural Anesthesia Kit, Internal Jugular Puncture Kit with Blue FlexTip(R) Catheter, Pediatric Jugular Puncture Kit, Arterial Line Kit, Vessel Catheterization kit, Central Venous Catheterization kit, Jugular Puncture Ks. jne. ks.ilm., 
Predicted Event Type:  [('Recall',)]
Actual Event Type:  ['Recall'] 

Medical Device:  Device Recall  Cytosponge Cell Collection Device 
Predicted Event Type:  [('Recall',)]
Actual Event Type:  ['Recall'] 

Medical Device:  Device Recall  Howell D.A.S.H. Extraction Balloon 
Predicted Event Type:  [('Recall',)]
Actual Event Type:  ['Recall'] 

Medical Device:  CHECKCELLS (POOLED CELLS) 
Predicted Event Type:  [('Recall',)]
Actual Event Type:  ['Recall'] 

Medical Device:  RESONATE, VIGILANT X4, PERCIVA, and MOMENTUM CRT-D/ICD 
Predicted Event Type:  [('Safety Alert',)]
Actual Event Type:  ['Safety Aler