In [1]:
import numpy as np
import pandas as pd
from logistic_regression import LogisticRegression





In [2]:
# Load the preprocessed clinical data (tab-separated file)
data = pd.read_csv("../dat/preprocessed_brca_metabric_clinical_data.tsv", sep='\t')

# Predictor columns used by the multiple logistic regression models
features = [
    'Age at Diagnosis',
    'ER Status Binary',
    'Neoplasm Histologic Grade',
    'HER2 Status Binary',
    'Lymph nodes examined positive',
    'Mutation Count',
    'PR Status Binary',
    'Tumor Size',
    'Tumor Stage',
    'Cancer Type Detailed Encoded',
    'Integrative Cluster Encoded',
]

# Row labels of the result table. Entries alternate between the single-predictor
# model and its "... Multiple" counterpart; the two p-value tables below rely on
# that alternation when slicing this list.
interventions = [
    'Chemotherapy', 'Chemotherapy Multiple',
    'Radio Therapy', 'Radio Therapy Multiple',
    'Surgery', 'Surgery Multiple',
    'Hormone Therapy', 'Hormone Therapy Multiple',
]
measurements = ['Test Accuracy', 'F1 Score', 'Pseudo R2']

# Table collecting test accuracy, F1 score and pseudo R2 for every model
result_df = pd.DataFrame(columns=measurements, index=interventions)

# p-values of the single logistic regressions (even positions = single-predictor labels)
p_value_df_single = pd.DataFrame(columns=['Age at Diagnosis'], index=interventions[0::2])

# p-values of the multiple logistic regressions (odd positions = "... Multiple" labels)
p_value_df_multiple = pd.DataFrame(columns=features, index=interventions[1::2])

# 1. Logistic regression

## 1.1 Chemotherapy

In [3]:
# LogReg single: fit 'Chemotherapy Binary' on 'Age at Diagnosis' only
p_values, test_accuracy, f1_score, conf_matrix, prsquared = LogisticRegression(data, 'Chemotherapy Binary', ['Age at Diagnosis'])

# Save results
result_df.loc['Chemotherapy'] = test_accuracy, f1_score, prsquared
# One predictor means exactly one p-value is expected; single-element unpacking
# fails loudly if that ever stops being true (str(*values) would instead return
# '' for an empty array or raise an unrelated TypeError for several values).
(age_p_value,) = p_values.values
p_value_df_single.loc['Chemotherapy'] = str(age_p_value)

# display() renders the table and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
display(conf_matrix)

Optimization terminated successfully.
         Current function value: 0.498638
         Iterations 5


Unnamed: 0,Pred 0,Pred 1
True 0,265,0
True 1,61,0


None


In [4]:
# LogReg multiple: fit 'Chemotherapy Binary' on every column listed in `features`
p_values, test_accuracy, f1_score, conf_matrix, prsquared = LogisticRegression(data, 'Chemotherapy Binary', features)

# Save results (p-values are stored as strings, one per feature column)
result_df.loc['Chemotherapy Multiple'] = test_accuracy, f1_score, prsquared
p_value_df_multiple.loc['Chemotherapy Multiple'] = [str(value) for value in p_values.values]

# display() renders the table and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
display(conf_matrix)

Optimization terminated successfully.
         Current function value: 0.280119
         Iterations 8


Unnamed: 0,Pred 0,Pred 1
True 0,251,14
True 1,19,42


None


## 1.2 Radio Therapy

In [5]:
# LogReg single: fit 'Radio Therapy Binary' on 'Age at Diagnosis' only
p_values, test_accuracy, f1_score, conf_matrix, prsquared = LogisticRegression(data, 'Radio Therapy Binary', ['Age at Diagnosis'])

# Save results
result_df.loc['Radio Therapy'] = test_accuracy, f1_score, prsquared
# One predictor means exactly one p-value is expected; single-element unpacking
# fails loudly if that ever stops being true (str(*values) would instead return
# '' for an empty array or raise an unrelated TypeError for several values).
(age_p_value,) = p_values.values
p_value_df_single.loc['Radio Therapy'] = str(age_p_value)

# display() renders the table and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
display(conf_matrix)

Optimization terminated successfully.
         Current function value: 0.640830
         Iterations 4


Unnamed: 0,Pred 0,Pred 1
True 0,0,115
True 1,0,211


None


In [6]:
# LogReg multiple: fit 'Radio Therapy Binary' on every column listed in `features`
p_values, test_accuracy, f1_score, conf_matrix, prsquared = LogisticRegression(data, 'Radio Therapy Binary', features)

# Save results (p-values are stored as strings, one per feature column)
result_df.loc['Radio Therapy Multiple'] = test_accuracy, f1_score, prsquared
p_value_df_multiple.loc['Radio Therapy Multiple'] = [str(value) for value in p_values.values]

# display() renders the table and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
display(conf_matrix)

Optimization terminated successfully.
         Current function value: 0.600399
         Iterations 6


Unnamed: 0,Pred 0,Pred 1
True 0,13,102
True 1,24,187


None


## 1.3 Hormone Therapy

In [7]:
# LogReg single: fit 'Hormone Therapy Binary' on 'Age at Diagnosis' only
p_values, test_accuracy, f1_score, conf_matrix, prsquared = LogisticRegression(data, 'Hormone Therapy Binary', ['Age at Diagnosis'])

# Save results
result_df.loc['Hormone Therapy'] = test_accuracy, f1_score, prsquared
# One predictor means exactly one p-value is expected; single-element unpacking
# fails loudly if that ever stops being true (str(*values) would instead return
# '' for an empty array or raise an unrelated TypeError for several values).
(age_p_value,) = p_values.values
p_value_df_single.loc['Hormone Therapy'] = str(age_p_value)

# display() renders the table and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
display(conf_matrix)

Optimization terminated successfully.
         Current function value: 0.661078
         Iterations 4


Unnamed: 0,Pred 0,Pred 1
True 0,0,127
True 1,0,199


None


In [8]:
# LogReg multiple: fit 'Hormone Therapy Binary' on every column listed in `features`
p_values, test_accuracy, f1_score, conf_matrix, prsquared = LogisticRegression(data, 'Hormone Therapy Binary', features)

# Save results (p-values are stored as strings, one per feature column)
result_df.loc['Hormone Therapy Multiple'] = test_accuracy, f1_score, prsquared
p_value_df_multiple.loc['Hormone Therapy Multiple'] = [str(value) for value in p_values.values]

# display() renders the table and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
display(conf_matrix)

Optimization terminated successfully.
         Current function value: 0.544613
         Iterations 6


Unnamed: 0,Pred 0,Pred 1
True 0,55,72
True 1,25,174


None


## 1.4 Surgery

In [9]:
# LogReg single: fit 'Type of Breast Surgery Binary' on 'Age at Diagnosis' only
p_values, test_accuracy, f1_score, conf_matrix, prsquared = LogisticRegression(data, 'Type of Breast Surgery Binary', ['Age at Diagnosis'])

# Save results
result_df.loc['Surgery'] = test_accuracy, f1_score, prsquared
# One predictor means exactly one p-value is expected; single-element unpacking
# fails loudly if that ever stops being true (str(*values) would instead return
# '' for an empty array or raise an unrelated TypeError for several values).
(age_p_value,) = p_values.values
p_value_df_single.loc['Surgery'] = str(age_p_value)

# display() renders the table and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
display(conf_matrix)

Optimization terminated successfully.
         Current function value: 0.680156
         Iterations 4


Unnamed: 0,Pred 0,Pred 1
True 0,0,141
True 1,0,185


None


In [10]:
# LogReg multiple: fit 'Type of Breast Surgery Binary' on every column listed in `features`
p_values, test_accuracy, f1_score, conf_matrix, prsquared = LogisticRegression(data, 'Type of Breast Surgery Binary', features)

# Save results (p-values are stored as strings, one per feature column)
result_df.loc['Surgery Multiple'] = test_accuracy, f1_score, prsquared
p_value_df_multiple.loc['Surgery Multiple'] = [str(value) for value in p_values.values]

# display() renders the table and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
display(conf_matrix)

Optimization terminated successfully.
         Current function value: 0.633384
         Iterations 6


Unnamed: 0,Pred 0,Pred 1
True 0,66,75
True 1,52,133


None


In [15]:
# Write the metrics table to a LaTeX file, then show it as the cell output
result_df.to_latex(
    buf='../doc/results_logReg.tex',
    multicolumn=True,
)
result_df

Unnamed: 0,Test Accuracy,F1 Score,Pseudo R2
Chemotherapy,0.81,0.0,0.078
Chemotherapy Multiple,0.9,0.72,0.482
Radio Therapy,0.65,0.79,-0.015
Radio Therapy Multiple,0.61,0.75,0.049
Surgery,0.57,0.72,0.006
Surgery Multiple,0.61,0.68,0.074
Hormone Therapy,0.61,0.76,0.019
Hormone Therapy Multiple,0.7,0.78,0.192


In [12]:
# Show the stored p-values of every feature for each multiple logistic regression
p_value_df_multiple

Unnamed: 0,Age at Diagnosis,ER Status Binary,Neoplasm Histologic Grade,HER2 Status Binary,Lymph nodes examined positive,Mutation Count,PR Status Binary,Tumor Size,Tumor Stage,Cancer Type Detailed Encoded,Integrative Cluster Encoded
Chemotherapy Multiple,6.4148207137044375e-28,6.246802998005148e-09,6.283314593160211e-10,0.3794909908007046,0.0019166295749873,1.6100523545760406e-05,0.9812902528206324,0.0410446104919465,3.6731236882758643e-13,0.2420618807022153,0.4689598416560797
Radio Therapy Multiple,0.0020067832353972,0.6256235237930078,1.7587082147643826e-07,0.3676513159322203,0.0085557732491643,0.0237922722199826,0.2156503444007523,0.4277463824163282,0.0130135221565432,0.3385510361840903,0.4653256321571176
Surgery Multiple,0.0576186034765876,0.6031676092671987,0.0019254683689583,0.1335838086462859,0.000310640377013,0.3345969122163638,0.0505207635956638,5.768428269167222e-06,0.9953143861978552,0.3145394174118685,0.1473501163020759
Hormone Therapy Multiple,0.1981864792272414,1.7649020752995132e-14,0.0624329043704521,0.6145185709946649,9.304900245363282e-05,2.7917868150153e-12,0.1796203469371593,0.929774340413748,0.017882202956569,0.187608455246543,7.945613331913703e-05


In [13]:
# Show the stored 'Age at Diagnosis' p-value for each single logistic regression
p_value_df_single

Unnamed: 0,Age at Diagnosis
Chemotherapy,9.389978889382199e-66
Radio Therapy,1.1034480101173352e-22
Surgery,5.659892031625093e-07
Hormone Therapy,7.825316561842639e-15
