In [42]:
# import libraries and packages
import pandas as pd
import requests
from io import StringIO
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from xgboost import XGBClassifier



In [31]:
# import additional requirements
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Prep Grouped Data ##

In [32]:
# Define GitHub URL
url = 'https://raw.githubusercontent.com/mmoran90/ADS-509-Text-Mining/main/combined_df.csv'

# Get CSV content from GitHub
response = requests.get(url)
if response.status_code == 200:
    data = StringIO(response.text)
    maude_df = pd.read_csv(data)
    print(maude_df.head())
else:
    print(f"Error fetching data: {response.status_code}")

   MDRFOI ID                                Brand Name  \
0   20112651  HI-TORQUE BALANCE MIDDLEWEIGHT UNIVERSAL   
1   20112634                          XIENCE SKYPOINT¿   
2   20112157                         TRICLIP G4 SYSTEM   
3   20112156                         TRICLIP G4 SYSTEM   
4   20111290                       PERCLOSE¿ PROSTYLE¿   

                  Model Number  \
0        Model Number 1009664J   
1      Model Number 1804225-18   
2  Catalog Number TCDS0302-XTW   
3  Catalog Number TCDS0302-XTW   
4        Model Number 12773-02   

                                      Device Problem  \
0  Device Problems Difficult to Remove (1528); Ma...   
1  Device Problems Material Rupture (1546); Diffi...   
2        Device Problem Incomplete Coaptation (2507)   
3  Device Problems Positioning Failure (1158); En...   
4             Device Problem Failure to Cycle (1142)   

                                     Patient Problem             Event Date  \
0  Patient Problem Device Embe

In [23]:
# remove non-text columns irre
maude_df.drop(['MDRFOI ID', 'Model Number', 'Event Date'], axis=1, inplace=True)

In [35]:
# Combine relevant text columns
maude_df['text'] = maude_df[['Brand Name', 'Device Problem', 'Patient Problem', 'Manufacturer Narrative', 'Event Description']].fillna('').agg(' '.join, axis=1)

# Pre-process text and tokenize using nltk
def clean_and_tokenize_text(text):
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize text using nltk
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)  # Join tokens back into a string

maude_df['clean_text'] = maude_df['text'].apply(clean_and_tokenize_text)

In [36]:
# Target Variable ('Device Problem')
maude_df.dropna(subset=['Device Problem'], inplace=True)  # Drop rows with missing target

# Combine descriptive text and numbers into labels
maude_df['labels'] = maude_df['Device Problem'].apply(lambda x: [re.sub(r'Device Problems? |Device Problem |\(|\)', '', label).strip() for label in x.split(';')])

# Binarize labels (i.e., convert data into binary matrix for multi-label classification)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(maude_df['labels'])

#The fit_transform() converts these lists into a binary matrix, where each unique label becomes a column, and each row has a value of 1 or 0 
# indicating whether that label applies to that instance.

In [None]:
## Split Data ##

In [37]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(maude_df['clean_text'], y, test_size=0.3, random_state=42)

## Build SMV Classification Model ##

In [38]:
# Extract featurees using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [39]:
# Build SVM model (with OneVsRestClassifier for multi-label classification)
svm_model = OneVsRestClassifier(SVC(kernel='linear', random_state=42))
svm_model.fit(X_train_tfidf, y_train)



In [40]:
# Predict on test set
y_pred_svm = svm_model.predict(X_test_tfidf)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred_svm)
classification_rep = classification_report(y_test, y_pred_svm, target_names=mlb.classes_)

print(f"Accuracy: {accuracy}\n")
print("Classification Report:\n", classification_rep)

Accuracy: 0.7080536912751678

Classification Report:
                                                              precision    recall  f1-score   support

                                    Activation Failure 3270       0.00      0.00      0.00         6
Adverse Event Without Identified Device or Use Problem 2993       0.94      0.89      0.92        19
                                                 Break 1069       1.00      0.25      0.40         8
                 Communication or Transmission Problem 2896       0.00      0.00      0.00         1
                                     Component Missing 2306       0.00      0.00      0.00         0
                Contamination /Decontamination Problem 2895       0.00      0.00      0.00         1
                                                 Crack 1135       0.00      0.00      0.00         0
                                     Deflation Problem 1149       0.00      0.00      0.00         0
                 Deformation Due to 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Build XGBoost Classification Model ##

In [44]:
# Build XGBoost model with OneVsRestClassifier for multi-label classification
xgb_model = OneVsRestClassifier(XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', random_state=42))
xgb_model.fit(X_train_tfidf, y_train)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [45]:
# Prediction on test set
y_pred_xgb = xgb_model.predict(X_test_tfidf)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred_xgb)
classification_rep = classification_report(y_test, y_pred_xgb, target_names=mlb.classes_)

print(f"Accuracy: {accuracy}\n")
print("Classification Report:\n", classification_rep)

Accuracy: 0.8523489932885906

Classification Report:
                                                              precision    recall  f1-score   support

                                    Activation Failure 3270       1.00      0.83      0.91         6
Adverse Event Without Identified Device or Use Problem 2993       1.00      1.00      1.00        19
                                                 Break 1069       1.00      0.88      0.93         8
                 Communication or Transmission Problem 2896       0.00      0.00      0.00         1
                                     Component Missing 2306       0.00      0.00      0.00         0
                Contamination /Decontamination Problem 2895       0.00      0.00      0.00         1
                                                 Crack 1135       0.00      0.00      0.00         0
                                     Deflation Problem 1149       0.00      0.00      0.00         0
                 Deformation Due to 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Marvin's comments:
# Class imbalance is a concern for our data set, given that not all reported problems will have all possible device problems.
# Given this, the SVM model performed poorly compared to the XGBoost model. This was expected as XGBoost is the model that
# better handles class imbalances anyway. And, SVM also struggles with multi-class.

# key performance metric to look at here are:
# 1. support = values closer to zero indicate a given label appears very few times in the dataset.
# 2. weighted average = ndicates the overall performance across all classes.

# Next steps --> improve with XGBoost by tuning hyperparameters! Forget SVM :/