<a href="https://colab.research.google.com/github/philipp-lampert/mymandible/blob/main/data_science/05_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Load the two versions of the dataset from the project's GitHub repository:
# the complete-case (CCA) frame and the imputed frame.
df_cca = pd.read_parquet(
    'https://github.com/philipp-lampert/mymandible/blob/main/data_science/data/dropped_first_cca.parquet?raw=true'
)
df_imp = pd.read_parquet(
    'https://github.com/philipp-lampert/mymandible/blob/main/data_science/data/dropped_first_imputed.parquet?raw=true'
)

## Complete case analysis (CCA)

In [2]:
# Class balance of the outcome in the complete-case data.
df_cca.loc[:, 'soft_tissue_complication'].value_counts()

True     77
False    76
Name: soft_tissue_complication, dtype: Int64

In [3]:
# Predictors are all columns positioned before 'flap_revision'
# (presumably the first outcome column -- confirm against the data dictionary).
first_outcome_var = df_cca.columns.get_loc('flap_revision')
predictors = df_cca.columns[:first_outcome_var].tolist()

# Modelling frame: predictors plus the soft-tissue-complication outcome,
# keeping only rows without any missing value.
df_cca_st_cx = (
    df_cca[predictors]
    .join(df_cca['soft_tissue_complication'])
    .dropna()
)

# Hold out 20 % of the rows as a test set; the seed is fixed so the split is
# reproducible. The outcome is passed as a plain integer (0/1) array.
x_train, x_test, y_train, y_test = train_test_split(
    df_cca_st_cx[predictors],
    df_cca_st_cx['soft_tissue_complication'].astype('int').to_numpy(),
    test_size=0.2,
    random_state=0,
)

In [9]:
# Fit a logistic regression on the training split and predict class
# probabilities for the held-out test split.
log_reg = LogisticRegression(random_state=0, max_iter=500)
log_reg.fit(x_train, y_train)
predictions = log_reg.predict_proba(x_test)

# Brier score: mean squared difference between the probability in column 1
# of predict_proba and the observed 0/1 outcome.
brier_score = np.square(predictions[:, 1] - y_test).mean()

# Reference forecast: every test case is assigned the outcome rate observed
# in the training split.
baseline_prob = y_train.mean()
predictions_baseline = np.tile([1 - baseline_prob, baseline_prob], (len(predictions), 1))
brier_score_baseline = np.square(predictions_baseline[:, 1] - y_test).mean()

# Brier skill score: relative improvement over the reference forecast
# (0 = no better than the baseline, 1 = perfect).
brier_skill_score = 1 - brier_score / brier_score_baseline

print("Brier Score:", brier_score)
print("Brier Skill Score:", brier_skill_score)

Brier Score: 0.228410432472482
Brier Skill Score: 0.08635827011007202


Let's see if we can improve those metrics by letting `LogisticRegressionCV` select the regularization strength with 10-fold cross-validation.

In [10]:
# Fit the cross-validated variant (cv=10) on the training split and predict
# class probabilities for the held-out test split.
log_reg_cv = LogisticRegressionCV(random_state=0, max_iter=500, cv=10)
log_reg_cv.fit(x_train, y_train)
predictions = log_reg_cv.predict_proba(x_test)

# Brier score: mean squared difference between the probability in column 1
# of predict_proba and the observed 0/1 outcome.
brier_score = np.square(predictions[:, 1] - y_test).mean()

# Reference forecast: every test case is assigned the outcome rate observed
# in the training split.
baseline_prob = y_train.mean()
predictions_baseline = np.tile([1 - baseline_prob, baseline_prob], (len(predictions), 1))
brier_score_baseline = np.square(predictions_baseline[:, 1] - y_test).mean()

# Brier skill score: relative improvement over the reference forecast
# (0 = no better than the baseline, 1 = perfect).
brier_skill_score = 1 - brier_score / brier_score_baseline

print("Brier Score:", brier_score)
print("Brier Skill Score:", brier_skill_score)

Brier Score: 0.2141610292966123
Brier Skill Score: 0.14335588281355083
