In [47]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import pickle as pkl


def linear_regression(df, model):
    df_features, df_targets = preprocess_data(df, model)
    X_train, X_test, y_train, y_test = train_test_split(df_features.values, df_targets.values.ravel(), test_size=0.2, random_state=42)
    logistic = LogisticRegression(solver="lbfgs",max_iter=10000, tol=0.1)

    param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'multi_class': ['auto', 'ovr'],
    'max_iter': [100, 200, 500, 1000],
    }

    logistic_model = GridSearchCV(logistic, param_grid, n_jobs=-1)
    logistic_model.fit(X_train, y_train)

    print("Best parameter (CV score=%0.3f):" % logistic_model.best_score_)
    print(logistic_model.best_params_)
    accuracy = logistic_model.score(X_test, y_test)
    print("Accuracy on the testing data:", accuracy)
    save_model('logistic_regression.sav',logistic_model)

def random_forest_classifier(df, model):
    df_features, df_targets = preprocess_data(df, model)
    X_train, X_test, y_train, y_test = train_test_split(df_features.values, df_targets.values.ravel(), test_size=0.2, random_state=42)
    rnd_clf = RandomForestClassifier(n_estimators=500, criterion="gini",  max_leaf_nodes=16, random_state=42, n_jobs =-1)

    # pipe = Pipeline(steps=[("pca",PCA()),("random_forest", rnd_clf)])
    # random_grid = random_forest_classifier_grid()
    # rnd_clf_model = random_search_cv(pipe, random_grid)
    # print("Best parameter (CV score=%0.3f):" % rnd_clf_model.best_score_)
    # print(rnd_clf_model.best_params_)

    rnd_clf.fit(X_train, y_train)
    accuracy = rnd_clf.score(X_test, y_test)
    print("Accuracy on the testing data:", accuracy)
    save_model('random_forest_classifier.sav',rnd_clf)

def random_forest_classifier_grid():
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    min_samples_split = [2,5,10]
    min_samples_leaf = [1,2,4]
    bootstrap = [True, False]

    random_grid = { 'random_forest__n_estimators': n_estimators,
                    'random_forest__max_features': max_features,
                    'random_forest__max_depth': max_depth,
                    'random_forest__min_samples_split': min_samples_split,
                    'random_forest__min_samples_leaf': min_samples_leaf,
                    'random_forest__bootstrap': bootstrap}
    return random_grid

def save_model(file_name, model):
    pkl.dump(model, open(file_name, 'wb'))

def load_model(file_name):
    return pkl.load(open(file_name, 'rb'))

def one_hot_encode(df, type):
    return pd.get_dummies(df, dtype=type)

def random_search_cv(estimator, random_grid):
    return RandomizedSearchCV(estimator=estimator, param_distributions = random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs =-1)

def perform_pca_on_model(model_name, model):
    pca = PCA()
    pipe = Pipeline(steps=[("pca", pca), (model_name, model)])
    return pipe, 

def scale_and_combine_df(df_numeric, df_non_numeric):
    x = df_numeric.values
    x_scaled = StandardScaler().fit_transform(x)
    df_numeric_scaled = pd.DataFrame(x_scaled, columns=df_numeric.columns)
    df_final = pd.concat([df_numeric_scaled, df_non_numeric], axis=1)
    return df_final

def preprocess_data(df, model):   
    df.drop(columns=["bmi", "patient_race", "metastatic_first_novel_treatment", "metastatic_first_novel_treatment_type"], inplace=True)
    df_targets = pd.DataFrame(df["DiagPeriodL90D"])
    df.drop(columns=["DiagPeriodL90D"], inplace=True)
    df_non_numeric = df.select_dtypes(exclude=['number'])
    df_non_numeric.fillna('NA', inplace=True)
    df_numeric = df.select_dtypes(include=['number'])

    # df_numeric.fillna(df_numeric.mean().round(1), inplace=True)
    df_numeric.interpolate(method='polynomial', inplace=True, order=2)
    df_non_numeric = one_hot_encode(df_non_numeric, float)

    df_final = scale_and_combine_df(df_numeric, df_non_numeric)
    return df_final, df_targets

def load_data():
    df = pd.read_csv("training_wids2024C1.csv")
    # linear_regression(df, "linear regression")
    random_forest_classifier(df, "random forest classifier")


def main():
    load_data()

if __name__ == "__main__":
    main()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numeric.interpolate(method='polynomial', inplace=True, order=2)


Accuracy on the testing data: 0.8164213787761425
