# Datathon: Metastatic Cancer Diagnosis Models

### Functions for Data Preprocessing and Transformation

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, ExtraTreesClassifier
from sklearn import metrics
from sklearn import svm
import preprocessor as p
import nltk
from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt
import numpy as np
import pickle as pkl
# Fetch the NLTK stop-word corpus that clean_text() reads through stopwords.words('english').
nltk.download('stopwords')
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

def print_metrics(y_test, y_pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Labels predicted by a model for the same samples.
    """
    # Evaluate every metric before printing anything, so a metric that raises
    # leaves no partial report behind.
    report = [
        ("Accuracy", metrics.accuracy_score(y_test, y_pred)),
        ("Precision", metrics.precision_score(y_test, y_pred)),
        ("Recall", metrics.recall_score(y_test, y_pred)),
        ("F1 Score", metrics.f1_score(y_test, y_pred)),
    ]
    for label, score in report:
        print(f'{label}: {score}')

def save_model(file_name, model):
    """Pickle ``model`` to ``file_name``.

    Args:
        file_name: Destination path. Missing parent directories (for example
            ``saved_models/``) are created first.
        model: Any picklable object; in this notebook, a fitted estimator.
    """
    from pathlib import Path  # local import keeps this block self-contained

    Path(file_name).parent.mkdir(parents=True, exist_ok=True)
    # The context manager closes the handle even if pickling fails part-way.
    with open(file_name, 'wb') as model_file:
        pkl.dump(model, model_file)

def load_model(file_name):
    """Load and return the object pickled in ``file_name``.

    Args:
        file_name: Path to a pickle file, e.g. one written by ``save_model``.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by a trusted source.
    with open(file_name, 'rb') as model_file:
        return pkl.load(model_file)

def one_hot_encode(df, type):
    """One-hot encode ``df`` with ``pandas.get_dummies``.

    Args:
        df: DataFrame to encode.
        type: dtype given to the generated indicator columns (e.g. ``float``).

    Returns:
        The encoded DataFrame produced by ``pandas.get_dummies``.
    """
    encoded = pd.get_dummies(data=df, dtype=type)
    return encoded

def random_search_cv(estimator, random_grid):
    """Build (but do not fit) a ``RandomizedSearchCV`` with the notebook's fixed settings.

    Args:
        estimator: Estimator or pipeline to tune.
        random_grid: Parameter distributions/lists to sample from.

    Returns:
        An unfitted ``RandomizedSearchCV`` configured with n_iter=100, cv=3,
        verbose=2, random_state=42 and n_jobs=-1.
    """
    search_settings = dict(n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    return RandomizedSearchCV(
        estimator=estimator,
        param_distributions=random_grid,
        **search_settings,
    )

# Loaded lazily by _english_stop_words(); None until the first call.
_STOP_WORDS_CACHE = None

# Abbreviations seen in the diagnosis descriptions, mapped to a uniform spelling.
_ABBREVIATIONS = {"malig": "malignant", "neoplm": "neoplasm"}


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Normalize one free-text description for bag-of-words vectorization.

    Steps: run the ``preprocessor`` cleaner, strip every character that is
    neither a word character nor whitespace, lower-case, drop English stop
    words, and expand the abbreviations in ``_ABBREVIATIONS``.

    Args:
        text: Raw description string.

    Returns:
        The cleaned tokens joined by single spaces (empty string if no token
        survives).
    """
    # Stop words are removed because their frequencies would otherwise dwarf
    # those of the more informative words. The set is cached: this function is
    # applied once per row, and rebuilding it from the corpus each time is
    # pure overhead.
    stop_words = _english_stop_words()

    text = p.clean(text)
    # Removes special characters without removing letters from other
    # languages (\w is Unicode-aware).
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.lower().split()

    # split() with no arguments already discards all leading, trailing and
    # repeated whitespace, so tokens need no further stripping and the joined
    # result needs no extra whitespace normalization.
    return ' '.join(
        _ABBREVIATIONS.get(token, token)
        for token in tokens
        if token not in stop_words
    )

def perform_pca_on_model(model_name, model):
    """Wrap ``model`` in a pipeline that applies PCA first.

    Args:
        model_name: Step name to give ``model`` inside the pipeline.
        model: Estimator placed after the PCA step.

    Returns:
        A one-element tuple ``(pipeline,)``.
    """
    steps = [("pca", PCA()), (model_name, model)]
    pipeline = Pipeline(steps=steps)
    # NOTE(review): the original ended with a trailing comma, so callers get a
    # 1-tuple rather than the pipeline itself. That shape is preserved here;
    # confirm whether it is intentional before changing it.
    return (pipeline,)

def scale_and_combine_df(df_numeric, df_non_numeric):
    """Standardize the numeric columns and join them with the non-numeric ones.

    Args:
        df_numeric: DataFrame of numeric features.
        df_non_numeric: DataFrame of already-encoded non-numeric features
            covering the same rows.

    Returns:
        A DataFrame with the scaled numeric columns followed by the columns of
        ``df_non_numeric``.
    """
    x_scaled = StandardScaler().fit_transform(df_numeric.values)
    # Keep the original row index: pd.concat(axis=1) aligns on index labels,
    # so rebuilding the frame with a fresh 0..n-1 index would misalign rows
    # (and introduce NaNs) whenever the inputs do not carry a default index.
    df_numeric_scaled = pd.DataFrame(
        x_scaled, columns=df_numeric.columns, index=df_numeric.index
    )
    print(len(df_numeric_scaled))
    return pd.concat([df_numeric_scaled, df_non_numeric], axis=1)

def balance_and_sample_data(df):
    """Down-sample both ``DiagPeriodL90D`` classes to the same size.

    The classes are heavily skewed (the training file has 8060 rows of class 1
    and 4846 of class 0), so each class is randomly sampled down to the size
    of the smaller one.

    Args:
        df: DataFrame containing a ``DiagPeriodL90D`` column with values 1/0.

    Returns:
        A new DataFrame with the sampled class-1 rows followed by the sampled
        class-0 rows, re-indexed from 0.
    """
    print(f'The count for each class before:\n {df["DiagPeriodL90D"].value_counts()}')
    positives = df.loc[df['DiagPeriodL90D'] == 1]
    negatives = df.loc[df['DiagPeriodL90D'] == 0]
    # Derive the per-class sample size from the data instead of hard-coding
    # 4846, so the function also works on other subsets/datasets. For the
    # training file this is still 4846.
    n_per_class = min(len(positives), len(negatives))
    sample_true = positives.sample(n=n_per_class, random_state=42)
    sample_false = negatives.sample(n=n_per_class, random_state=42)
    # After concatenation both classes have n_per_class rows.
    balanced_df = pd.concat([sample_true, sample_false], axis=0)

    balanced_df = balanced_df.reset_index(drop=True)
    print(f'The count for each class after:\n {balanced_df["DiagPeriodL90D"].value_counts()}')
    return balanced_df

def preprocess_data(df):
    """Turn the raw training DataFrame into model-ready features and targets.

    The caller's ``df`` is not modified: all drops and fills happen on copies,
    so the function can be called repeatedly on the same DataFrame.

    Args:
        df: Raw training data. Must contain ``DiagPeriodL90D``,
            ``breast_cancer_diagnosis_desc`` and the four sparse columns
            dropped below.

    Returns:
        A tuple ``(df_final, df_targets)``: the scaled numeric columns joined
        with the one-hot encoded non-numeric columns, and a single-column
        DataFrame holding ``DiagPeriodL90D``.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    # The data set is deliberately not balanced because balancing reduced
    # model performance (see balance_and_sample_data).

    # Dropped because more than 50% of the data is missing from these columns.
    sparse_columns = [
        "bmi",
        "patient_race",
        "metastatic_first_novel_treatment",
        "metastatic_first_novel_treatment_type",
    ]
    # Save targets (as an independent copy).
    df_targets = df[["DiagPeriodL90D"]].copy()
    # drop() without inplace returns a new frame and leaves ``df`` intact.
    df_features = df.drop(columns=sparse_columns + ["DiagPeriodL90D"])

    # Split into non-numeric and numeric features for separate processing.
    # Missing non-numeric values are filled with the placeholder 'CC'.
    df_non_numeric = df_features.select_dtypes(exclude=['number']).fillna('CC')
    # Clean the free-text description column.
    df_non_numeric["breast_cancer_diagnosis_desc"] = (
        df_non_numeric["breast_cancer_diagnosis_desc"].apply(clean_text)
    )
    # One-hot encode so the categorical data is usable by the ML algorithms.
    df_non_numeric = one_hot_encode(df_non_numeric, float)

    # Interpolation uses adjacent rows to determine the value that fills an
    # empty cell (alternative tried: filling with the rounded column mean).
    df_numeric = df_features.select_dtypes(include=['number']).interpolate(
        method='polynomial', order=2
    )
    # Scale to prevent bias towards features with larger values;
    # regularization also works better after scaling.
    df_final = scale_and_combine_df(df_numeric, df_non_numeric)
    print(df_targets.value_counts())
    return df_final, df_targets

def load_data():
    """Placeholder entry hook; currently does nothing.

    The commented-out lines below show the calls it was meant to dispatch
    (loading the training CSV and running one of the model functions).
    """
    # nltk.download('stopwords')
    # df = pd.read_csv("training_wids2024C1.csv")
    # logistic_regression(df)
    # svc(df)
    # random_forest_classifier(df)
    # extra_trees_classifier(df)
    # blender(df)
    pass

def main():
    """Script entry point; delegates to load_data()."""
    load_data()

# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

### Support Vector Machine

In [100]:
def svc(df):
    """Train, evaluate and save an SVM on the diagnosis-description text.

    Cleans ``breast_cancer_diagnosis_desc`` in place on ``df``, turns it into
    word counts, tunes an SVC with ``svc_grid_search`` on an 80/20 split,
    prints the test metrics and pickles the fitted search object to
    ``saved_models/svm.sav``.

    Args:
        df: Training DataFrame with ``breast_cancer_diagnosis_desc`` and
            ``DiagPeriodL90D`` columns.
    """
    # Only one feature column is used because CountVectorizer works on a
    # single text column at a time.
    text_column = "breast_cancer_diagnosis_desc"
    df.loc[:, text_column] = df[text_column].apply(clean_text)
    labels = df["DiagPeriodL90D"]

    # CountVectorizer returns a matrix of raw word frequencies per document.
    # NOTE(review): the vocabulary is fitted on all rows before the split, and
    # the fitted vectorizer is not saved with the model - confirm both are
    # acceptable for how the saved model is used.
    vectorizer = CountVectorizer()
    word_counts = vectorizer.fit_transform(df[text_column])
    print(word_counts.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        word_counts, labels, test_size=0.2, random_state=42
    )

    # Random search (svc_random_search) was used first to narrow the search
    # space; grid search then refines around those results.
    tuned_svm = svc_grid_search(svm.SVC(), X_train, y_train)
    predictions = tuned_svm.predict(X_test)

    print_metrics(y_test, predictions)
    save_model('saved_models/svm.sav', tuned_svm)

def svc_random_search(SVM, X_train, y_train):
    """Coarse randomized hyper-parameter search for an SVC.

    Latest recorded result: CV score 0.810 with
    {'kernel': 'rbf', 'gamma': 0.1, 'C': 10}.

    Args:
        SVM: Unfitted SVC estimator to tune.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted search object returned by ``random_search_cv``.
    """
    search_space = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': np.logspace(-3, 2, num=6),
    }
    search = random_search_cv(SVM, search_space)
    search.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    return search

def svc_grid_search(SVM, X_train, y_train):
    """Exhaustive grid search over a narrowed RBF-kernel SVC grid.

    Latest recorded result: CV score 0.810 with
    {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}.

    Args:
        SVM: Unfitted SVC estimator to tune.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    narrowed_grid = {
        'C': [1, 10, 20],
        'kernel': ['rbf'],
        'gamma': [0.1, .5, 1, 10],
    }
    search = GridSearchCV(estimator=SVM, param_grid=narrowed_grid, n_jobs=-1)
    search.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    return search

# Run the SVM classifier.
# Load the raw training data from CSV; svc() handles cleaning and vectorizing.

df = pd.read_csv("training_wids2024C1.csv")
svc(df)

(12906, 26)
Best parameter (CV score=0.810):
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Accuracy: 0.8152594887683966
Precision: 0.790224032586558
Recall: 0.9598021026592455
F1 Score: 0.8667969840826585


### Logistic Regression

In [101]:
def logistic_regression(df):
    """Train, evaluate and save a logistic-regression model.

    Hyper-parameters come from the latest grid-search run
    (CV score 0.810: C=10, max_iter=100, solver='liblinear').

    Args:
        df: Raw training DataFrame accepted by ``preprocess_data``.
    """
    df_features, df_targets = preprocess_data(df)
    X_train, X_test, y_train, y_test = train_test_split(
        df_features.values, df_targets.values.ravel(), test_size=0.2, random_state=42
    )
    # ``multi_class`` is no longer passed: 'auto' was already the default
    # behaviour, and the parameter is deprecated in recent scikit-learn
    # releases, so passing it only triggers warnings (and later an error).
    logistic_model = LogisticRegression(
        solver="liblinear", penalty='l2', C=10, max_iter=100, tol=0.1
    )
    logistic_model.fit(X_train, y_train)
    # To re-tune instead of using the fixed parameters above:
    # logistic_model = logistic_regression_grid_search(LogisticRegression(), X_train, y_train)
    y_pred = logistic_model.predict(X_test)
    print_metrics(y_test, y_pred)
    save_model('saved_models/logistic_regression.sav', logistic_model)

def logistic_regression_grid_search(logistic, X_train, y_train):
    """Grid-search C, solver and max_iter for a logistic-regression estimator.

    Latest recorded result (from a run that also searched ``multi_class``):
    CV score 0.810 with C=10, max_iter=100, solver='liblinear'.

    Args:
        logistic: Unfitted ``LogisticRegression`` estimator to tune.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # ``multi_class`` is intentionally not searched: the parameter is
    # deprecated in recent scikit-learn releases, and for the binary
    # DiagPeriodL90D target 'auto' and 'ovr' fit the same model, so the axis
    # only doubled the number of fits.
    # NOTE(review): for a target with more than two classes this removes the
    # 'ovr' variant from the search - confirm that is acceptable.
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'max_iter': [100, 200, 500, 1000],
    }

    logistic_model = GridSearchCV(logistic, param_grid, n_jobs=-1)
    logistic_model.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % logistic_model.best_score_)
    print(logistic_model.best_params_)
    return logistic_model

# Run the logistic regression classifier.
# Reload the raw training data so this run starts from an unmodified DataFrame.
df = pd.read_csv("training_wids2024C1.csv")
logistic_regression(df)

12906
DiagPeriodL90D
1                 8060
0                 4846
dtype: int64
Accuracy: 0.8140975987606507
Precision: 0.7899031106578276
Recall: 0.9579468150896723
F1 Score: 0.8658468418110675


In [102]:
def random_forest_classifier(df):
    """Train, evaluate and save a random-forest classifier.

    Preprocesses ``df``, fits a 500-tree forest (gini, at most 16 leaf nodes
    per tree) on an 80/20 split, prints the test metrics and pickles the
    fitted model to ``saved_models/random_forest_classifier.sav``.

    Args:
        df: Raw training DataFrame accepted by ``preprocess_data``.
    """
    features, targets = preprocess_data(df)
    X_train, X_test, y_train, y_test = train_test_split(
        features.values,
        targets.values.ravel(),
        test_size=0.2,
        random_state=42,
    )
    forest = RandomForestClassifier(
        n_estimators=500,
        criterion="gini",
        max_leaf_nodes=16,
        random_state=42,
        n_jobs=-1,
    )
    # A PCA + random-search variant exists in
    # random_forest_classifier_pipe_random_search but is not used here.
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    print_metrics(y_test, predictions)
    save_model('saved_models/random_forest_classifier.sav', forest)

def random_forest_classifier_pipe_random_search(rnd_clf, X_train, y_train):
    """Randomized search over a PCA -> random-forest pipeline.

    Args:
        rnd_clf: Unfitted ``RandomForestClassifier`` used as the pipeline's
            final step.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted search object returned by ``random_search_cv``.
    """
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    # 'auto' has been removed from scikit-learn; for classifiers it was an
    # alias of 'sqrt', so dropping it keeps the effective search space
    # unchanged while avoiding failed fits on current releases.
    max_features = ['sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)  # None = no depth limit
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]

    # Keys are prefixed with the pipeline step name ("random_forest__") so the
    # search routes each parameter to the forest rather than to PCA.
    random_grid = {
        'random_forest__n_estimators': n_estimators,
        'random_forest__max_features': max_features,
        'random_forest__max_depth': max_depth,
        'random_forest__min_samples_split': min_samples_split,
        'random_forest__min_samples_leaf': min_samples_leaf,
        'random_forest__bootstrap': bootstrap,
    }

    pipe = Pipeline(steps=[("pca", PCA()), ("random_forest", rnd_clf)])
    rnd_clf_model = random_search_cv(pipe, random_grid)
    rnd_clf_model.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % rnd_clf_model.best_score_)
    print(rnd_clf_model.best_params_)
    return rnd_clf_model

# Run the random forest classifier.
# This algorithm achieved the second-best scores. I also tried a pipeline with PCA, hoping to reduce the dimensionality;
# however, that model did not perform as well, so PCA is not used in the final model.
df = pd.read_csv("training_wids2024C1.csv")
random_forest_classifier(df)

12906
DiagPeriodL90D
1                 8060
0                 4846
dtype: int64
Accuracy: 0.8164213787761425
Precision: 0.7890743550834598
Recall: 0.9647495361781077
F1 Score: 0.8681135225375627


### Extra Trees Classifier

In [103]:
def extra_trees_classifier(df):
    """Train, evaluate and save an extra-trees classifier.

    Preprocesses ``df``, fits a 500-tree ``ExtraTreesClassifier`` on an 80/20
    split, prints the test metrics and pickles the fitted model to
    ``saved_models/extra_trees_classifier.sav``.

    Args:
        df: Raw training DataFrame accepted by ``preprocess_data``.
    """
    features, targets = preprocess_data(df)
    X_train, X_test, y_train, y_test = train_test_split(
        features.values,
        targets.values.ravel(),
        test_size=0.2,
        random_state=42,
    )

    # Fixed hyper-parameters; extra_trees_classifier_grid_search can be used
    # to re-tune them.
    model_settings = dict(
        n_estimators=500,
        criterion="gini",
        max_features='sqrt',
        min_samples_split=10,
        min_samples_leaf=4,
        max_leaf_nodes=16,
        random_state=42,
        n_jobs=-1,
    )
    extra_trees = ExtraTreesClassifier(**model_settings)
    extra_trees.fit(X_train, y_train)
    predictions = extra_trees.predict(X_test)
    print_metrics(y_test, predictions)
    save_model('saved_models/extra_trees_classifier.sav', extra_trees)

def extra_trees_classifier_grid_search(X_train, y_train):
    """Grid-search hyper-parameters for an ``ExtraTreesClassifier``.

    Latest recorded result: accuracy 0.8168, CV score 0.810 with
    {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt',
    'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 500}.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    param_grid = {
        'n_estimators': [150, 300, 500],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # 'auto' has been removed from scikit-learn; for classifiers it was an
        # alias of 'sqrt', so it only duplicated fits (and fails on current
        # releases).
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False],
    }
    extra_trees_clf = GridSearchCV(ExtraTreesClassifier(), param_grid=param_grid, n_jobs=-1)
    extra_trees_clf.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % extra_trees_clf.best_score_)
    print(extra_trees_clf.best_params_)
    return extra_trees_clf


# Run the extra trees classifier.
# Overall, I believe this model achieved the highest accuracy because it is an ensemble that injects more randomness than
# RandomForestClassifier: split thresholds are drawn at random for the candidate features instead of being optimized.
df = pd.read_csv("training_wids2024C1.csv")
extra_trees_classifier(df)

12906
DiagPeriodL90D
1                 8060
0                 4846
dtype: int64
Accuracy: 0.8171959721146398
Precision: 0.789873417721519
Recall: 0.9647495361781077
F1 Score: 0.8685968819599108


In [96]:
def blender(df):
    """Train, evaluate and save a stacking ensemble on the description text.

    Stacks logistic regression, a random forest and an SVC (with the
    hyper-parameters found for the individual models) on bag-of-words
    features, prints the test metrics and pickles the fitted ensemble to
    ``saved_models/stacking_classifier.sav``.

    Args:
        df: Training DataFrame with ``breast_cancer_diagnosis_desc`` and
            ``DiagPeriodL90D`` columns. The description column is cleaned in
            place.
    """
    # Alternative input (not used): the full feature table from
    # preprocess_data(df) instead of the text-only features below.

    # Same text preprocessing as the standalone SVM model.
    df.loc[:, "breast_cancer_diagnosis_desc"] = df["breast_cancer_diagnosis_desc"].apply(clean_text)
    df_features = df["breast_cancer_diagnosis_desc"]
    df_targets = df["DiagPeriodL90D"]

    vectorized_features = CountVectorizer()
    df_features = vectorized_features.fit_transform(df_features)
    X_train, X_test, y_train, y_test = train_test_split(
        df_features, df_targets, test_size=0.2, random_state=42
    )

    stacking_clf = StackingClassifier(
        estimators=[
            # ``multi_class`` is no longer passed: 'auto' was already the
            # default behaviour and the parameter is deprecated in recent
            # scikit-learn releases.
            ('lr', LogisticRegression(solver="liblinear", max_iter=100, tol=0.1, random_state=42, C=10)),
            ('rf', RandomForestClassifier(n_estimators=500, criterion="gini", max_leaf_nodes=16, random_state=42, n_jobs=-1)),
            ('svc', svm.SVC(probability=True, kernel='rbf', gamma=0.1, C=10, random_state=42)),
        ]
    )
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)
    print_metrics(y_test, y_pred)
    save_model('saved_models/stacking_classifier.sav', stacking_clf)

# Run the stacking classifier.
# This model did not perform as well as the extra trees classifier because I did not have enough time to optimize it.
df = pd.read_csv("training_wids2024C1.csv")
blender(df)

Accuracy: 0.8152594887683966
Precision: 0.790224032586558
Recall: 0.9598021026592455
F1 Score: 0.8667969840826585


### References

Géron, A. (2023). Hands-on machine learning with scikit-learn, keras and tensorflow: Concepts, tools, and techniques to build Intelligent Systems. O’Reilly Media, Inc. 