In [1]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from statsmodels.discrete.discrete_model import Logit
from utility_functions import load_file, pickle_file, starting_run, finished_run
from analysis_variables import logreg_targets

In [2]:
# Load the two pickled inputs used by the rest of the notebook.
# NOTE(review): load_file comes from utility_functions; where it reads from is not
# visible here -- confirm the expected working directory.
filtered_data = load_file("summary_costs_enhanced.pickle")
category_status = load_file("category_status_filtered.pickle")

# Shared transformer instances. The helper functions in the following cells
# refit these objects on every call, so they carry no state between runs.
pca = PCA(n_components=10)
try:
    # Newer scikit-learn releases name the dense-output switch `sparse_output`.
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older releases only know the original `sparse` keyword.
    enc = OneHotEncoder(sparse=False)
scaler = StandardScaler()
mScaler = MinMaxScaler()

# Make sure the output directory exists. makedirs also creates a missing
# "../tables" parent and does not fail when the directory is already there.
os.makedirs("../tables/logreg", exist_ok=True)

In [3]:
# Categorical demographic columns that are one-hot encoded before modelling.
demographic_cols = ['marital_status', 'initial_discharge_quarter', 'gender', 'race', 'payer']


def encode_dataset(dataset):
    """One-hot encode every column of ``dataset``.

    The module-level ``enc`` encoder is refit on ``dataset`` on each call.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are all treated as categorical.

    Returns
    -------
    pandas.DataFrame
        Encoded values on ``dataset``'s index, with column names produced by
        the encoder from the original column names.
    """
    encoded_values = enc.fit_transform(dataset)
    # `get_feature_names` was removed from scikit-learn in favour of
    # `get_feature_names_out`; prefer the new method and fall back to the old
    # one so the notebook runs on either side of that change.
    name_getter = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
    return pd.DataFrame(
        encoded_values,
        index=dataset.index,
        columns=name_getter(dataset.columns),
    )
def preprocess_dataset(dataset):
    """Build the two rescaled feature matrices used for modelling.

    Rows with a missing value in any demographic column are dropped, the
    demographic columns are one-hot encoded, and the result is inner-joined
    with ``dataset["age"]`` and the module-level ``category_status`` frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(scaled_data, pca_data)``: the joined features rescaled with the
        shared ``scaler`` and with the shared ``mScaler`` respectively. Both
        keep the joined frame's index and column names.
    """
    # Encode the demographics, then attach age and the category indicators.
    demographics = dataset.loc[:, demographic_cols].dropna()
    features = encode_dataset(demographics)
    features = features.join(dataset["age"], how='inner')
    features = features.join(category_status, how="inner")

    def _rescaled(transformer):
        # Refit the given transformer on the features and re-wrap the array
        # so index and column labels survive.
        return pd.DataFrame(
            transformer.fit_transform(features),
            index=features.index,
            columns=features.columns,
        )

    scaled_data = _rescaled(scaler)
    pca_data = _rescaled(mScaler)
    return scaled_data, pca_data

In [4]:
def run_logreg(dataset, target):
    """Fit a logit model of ``target`` on ``dataset`` and return its coefficient table.

    The model is fit with ``fit_regularized`` (at most 1000 iterations). The
    second table of the ``summary2()`` report is returned, sorted by the
    ``'P>|z|'`` column and then by ``'Coef.'``.
    """
    model = Logit(target, dataset)
    fit_result = model.fit_regularized(maxiter=1000)
    coefficient_table = fit_result.summary2().tables[1]
    return coefficient_table.sort_values(['P>|z|', 'Coef.'])

In [5]:
def run_PCA(dataset):
    """Fit the shared ``pca`` model on ``dataset`` and project the data onto it.

    Prints ``dataset.shape`` as a progress/debug aid.

    Returns
    -------
    tuple
        ``(scores, explained_variance_ratio, loadings)`` where ``scores`` is
        the projected data rescaled with the shared ``scaler`` (indexed like
        ``dataset``), ``explained_variance_ratio`` is taken directly from the
        fitted model, and ``loadings`` is the transposed ``components_``
        matrix indexed by ``dataset``'s column names.
    """
    print(dataset.shape)
    fitted_model = pca.fit(dataset)

    # Project the rows onto the components, then rescale the projections.
    projected = fitted_model.transform(dataset)
    scores = pd.DataFrame(scaler.fit_transform(projected), index=dataset.index)

    variance_ratio = fitted_model.explained_variance_ratio_
    loadings = pd.DataFrame(fitted_model.components_.T, index=dataset.columns)
    return scores, variance_ratio, loadings

In [6]:
# Cohorts to analyse, keyed by the label used in the output file names.
# The two inflammation-stratified cohorts are currently switched off.
datasets = {
    "Full": filtered_data,
#     "Inflamed": filtered_data.loc[category_status["has biliary colic with inflammation"].eq(1)],
#     "Uninflamed": filtered_data.loc[category_status["has biliary colic with inflammation"].eq(0)]
}
logreg_table_dir = "../tables/logreg"
explained_variance = {}

for name, dataset in datasets.items():
    starting_run(name)

    # Build both feature matrices and the PCA projection for this cohort.
    scaled_data, pca_data = preprocess_dataset(dataset)
    pca_dataset, component_importance, component_eigenvalues = run_PCA(pca_data)

    # Keep only the rows that survived preprocessing so that targets derived
    # from `dataset` line up with the feature matrices.
    dataset = dataset.loc[scaled_data.index]
    explained_variance[name] = component_importance

    # Disabled together with the stratified cohorts above:
#     if name != "Full":
#         del scaled_data["has biliary colic with inflammation"]

    for target_name, target_function in logreg_targets.items():
        starting_run(target_name)
        target_data = target_function(dataset)
        print(target_data.value_counts())

        target_rows = target_data.index
        table_prefix = f"{logreg_table_dir}/{name} {target_name}"

        # One regression on the scaled raw features ...
        feature_scores = run_logreg(scaled_data.loc[target_rows], target_data)
        feature_scores.to_csv(f"{table_prefix} Feature Scores.csv")

        # ... and one on the PCA component scores.
        component_scores = run_logreg(pca_dataset.loc[target_rows], target_data)
        component_scores.to_csv(f"{table_prefix} PCA Component Scores.csv")

    # Third value returned by run_PCA for this cohort, written once per cohort.
    component_eigenvalues.to_csv(f"{logreg_table_dir}/{name} PCA eigenvalues.csv")
#     pickle_file(f"{name}_logreg_targets.pickle", targets)

# One column of explained-variance ratios per cohort.
explained_variance_table = pd.DataFrame(explained_variance)
explained_variance_table.to_csv(f"{logreg_table_dir}/PCA explained variance.csv")

Starting Full 14:53:18.754429
(3965, 34)
Starting Surgery vs No Surgery 14:53:18.839076
False    2892
True     1073
Name: surgery_type, dtype: int64
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.5504399314371905
            Iterations: 30
            Function evaluations: 31
            Gradient evaluations: 30
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.6134102756192087
            Iterations: 16
            Function evaluations: 16
            Gradient evaluations: 16
Starting Given Surgery - Emergency vs Delayed 14:53:18.939295
True     2643
False     249
Name: surgery_type, dtype: int64
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.4752157310737036
            Iterations: 33
            Function evaluations: 33
            Gradient evaluations: 33
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.616752