In [97]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import svm
import preprocessor as p
import nltk
from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt
import numpy as np
import pickle as pkl

def svc(df):
    df["breast_cancer_diagnosis_desc"] = df["breast_cancer_diagnosis_desc"].apply(clean_text)
    df_features = df["breast_cancer_diagnosis_desc"]
    df_targets = df["DiagPeriodL90D"]
 
    vectorized_features = CountVectorizer()
    df_features = vectorized_features.fit_transform(df_features)
    print(df_features.shape)

    X_train, X_test, y_train, y_test = train_test_split(df_features, df_targets, test_size=0.2, random_state=42)
    SVM = svm.SVC()
    # SVM = svc_random_search(SVM, X_train, y_train)
    SVM = svc_grid_search(SVM, X_train, y_train)
    y_pred = SVM.predict(X_test)

    print_metrics(y_test, y_pred)
    save_model('saved_models/svm.sav',SVM)

def linear_regression(df):
    df_features, df_targets = preprocess_data(df)
    X_train, X_test, y_train, y_test = train_test_split(df_features.values, df_targets.values.ravel(), test_size=0.2, random_state=42)
    logistic = LogisticRegression(solver="lbfgs",max_iter=10000, tol=0.1)
    logistic_model = logistic_regression_grid_search(logistic, X_train, y_train)
    y_pred = logistic_model.predict(X_test)
    print_metrics(y_test, y_pred)
    save_model('saved_models/logistic_regression.sav',logistic_model)

def random_forest_classifier(df):
    df_features, df_targets = preprocess_data(df)
    X_train, X_test, y_train, y_test = train_test_split(df_features.values, df_targets.values.ravel(), test_size=0.2, random_state=42)
    rnd_clf = RandomForestClassifier(n_estimators=500, criterion="gini",  max_leaf_nodes=16, random_state=42, n_jobs =-1)
    # rnd_clf_model = random_forest_classifier_pipe_random_search(rnd_clf, X_train, y_train)
    rnd_clf.fit(X_train, y_train)
    y_pred=  rnd_clf.predict(X_test)
    print_metrics(y_test, y_pred)
    save_model('saved_models/random_forest_classifier.sav',rnd_clf)

def logistic_regression_grid_search(logistic, X_train, y_train):
    '''
    Best parameter (CV score=0.810):
    {'C': 10, 'max_iter': 100, 'multi_class': 'auto', 'solver': 'liblinear'}
    '''
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],  
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'multi_class': ['auto', 'ovr'],
        'max_iter': [100, 200, 500, 1000],
    }

    logistic_model = GridSearchCV(logistic, param_grid, n_jobs=-1)
    logistic_model.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % logistic_model.best_score_)
    print(logistic_model.best_params_)
    return logistic_model

def random_forest_classifier_pipe_random_search(rnd_clf, X_train, y_train):
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    min_samples_split = [2,5,10]
    min_samples_leaf = [1,2,4]
    bootstrap = [True, False]

    random_grid = { 'random_forest__n_estimators': n_estimators,
                    'random_forest__max_features': max_features,
                    'random_forest__max_depth': max_depth,
                    'random_forest__min_samples_split': min_samples_split,
                    'random_forest__min_samples_leaf': min_samples_leaf,
                    'random_forest__bootstrap': bootstrap}
    
    pipe = Pipeline(steps=[("pca",PCA()),("random_forest", rnd_clf)])
    rnd_clf_model = random_search_cv(pipe, random_grid)
    rnd_clf_model.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % rnd_clf_model.best_score_)
    print(rnd_clf_model.best_params_)
    return rnd_clf_model

def svc_random_search(SVM, X_train, y_train):
    '''
    Best parameter (CV score=0.810):
    {'kernel': 'rbf', 'gamma': 0.1, 'C': 10}
    '''
    param_grid = {'C': [0.1, 1, 10, 100],
              'kernel': ['linear', 'rbf', 'poly'],
              'gamma': np.logspace(-3, 2, num=6)}
    svc_model = random_search_cv(SVM,param_grid)
    svc_model.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % svc_model.best_score_)
    print(svc_model.best_params_)
    return svc_model

def svc_grid_search(SVM, X_train, y_train):
    '''
    Best parameter (CV score=0.810):
    {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
    '''
    param_grid = {'C': [1, 10, 20],
              'kernel': ['rbf'],
              'gamma': [0.1,.5,1,10]}
    
    svm_model = GridSearchCV(SVM, param_grid, n_jobs=-1)
    svm_model.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % svm_model.best_score_)
    print(svm_model.best_params_)
    return svm_model
    

def print_metrics(y_test, y_pred):

    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')

def save_model(file_name, model):
    pkl.dump(model, open(file_name, 'wb'))

def load_model(file_name):
    return pkl.load(open(file_name, 'rb'))

def one_hot_encode(df, type):
    return pd.get_dummies(df, dtype=type)

def random_search_cv(estimator, random_grid):
    return RandomizedSearchCV(estimator=estimator, param_distributions = random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs =-1)

def clean_text(text):
    stop_words = set(stopwords.words('english'))
    # This effectively removes all special characters without removing other
    # characters from different languages
    text = p.clean(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    words = text.split()
    # remove leading and trailing white spaces with strip()
    # remove custom stopwords from text
    cleaned_words = []
    for word in words:
        word = word.strip()

        if word in stop_words:
            continue
        if word == "malig":
            word = "malignant"
        elif word == "neoplm":
            word = "neoplasm"
        
        cleaned_words.append(word)

    text = ' '.join(cleaned_words)
    # Replace multiple consecutive white spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

def perform_pca_on_model(model_name, model):
    pca = PCA()
    pipe = Pipeline(steps=[("pca", pca), (model_name, model)])
    return pipe, 

def scale_and_combine_df(df_numeric, df_non_numeric):
    x = df_numeric.values
    x_scaled = StandardScaler().fit_transform(x)
    df_numeric_scaled = pd.DataFrame(x_scaled, columns=df_numeric.columns)
    df_final = pd.concat([df_numeric_scaled, df_non_numeric], axis=1)
    return df_final

def preprocess_data(df):   
    df.drop(columns=["bmi", "patient_race", "metastatic_first_novel_treatment", "metastatic_first_novel_treatment_type"], inplace=True)
    df_targets = pd.DataFrame(df["DiagPeriodL90D"])
    df.drop(columns=["DiagPeriodL90D"], inplace=True)
    df_non_numeric = df.select_dtypes(exclude=['number'])
    df_non_numeric.fillna('CC', inplace=True)
    # Clean non numeric column
    df_non_numeric["breast_cancer_diagnosis_desc"] = df_non_numeric["breast_cancer_diagnosis_desc"].apply(clean_text)
    df_non_numeric = one_hot_encode(df_non_numeric, float)

    df_numeric = df.select_dtypes(include=['number'])
    # df_numeric.fillna(df_numeric.mean().round(1), inplace=True)
    df_numeric.interpolate(method='polynomial', inplace=True, order=2)

    df_final = scale_and_combine_df(df_numeric, df_non_numeric)
    return df_final, df_targets

def load_data():
    nltk.download('stopwords')
    df = pd.read_csv("training_wids2024C1.csv")
    linear_regression(df)
    # random_forest_classifier(df)
    # svc(df)


def main():
    load_data()

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_numeric["breast_cancer_diagnosis_desc"] = df_non_numeric["breast_cancer_diagnosis_desc"].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numeric.interpolate(method='polynomial', inplace=True

Best parameter (CV score=0.810):
{'C': 10, 'max_iter': 100, 'multi_class': 'auto', 'solver': 'liblinear'}
Accuracy: 0.8140975987606507
Precision: 0.7899031106578276
Recall: 0.9579468150896723
F1 Score: 0.8658468418110675
