In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.linear_model import LogisticRegressionCV

# Choosing the PCOS dataset
- `pcosyang2024`: individual-data meta-analysis of 14 cohort studies
- `pcosli2023`: systematic review and meta-analysis of 28 studies
- `pcossolaleyva2023`: systematic review and meta-analysis of 17 gut microbe studies (sites investigated: oral cavity, blood, vagina/cervix, gut)

In [2]:
# pcosyang2024
# Read the raw workbook; in the rendered table below each row is a metadata
# field or taxon and each column is one sequencing run.

PCOSYANG2024_XLSX = "../data/raw/pcosyang2024.xlsx"

pcosyang2024 = pd.read_excel(PCOSYANG2024_XLSX, engine="openpyxl")
pcosyang2024

Unnamed: 0,sample,SRR4457864,SRR4457873,SRR4457876,SRR4457877,SRR4457878,SRR4457879,SRR4457880,SRR4457881,SRR4457882,...,SRR24874396,SRR24874397,SRR24874398,SRR24874399,SRR24874400,SRR24874401,SRR24874402,SRR24874404,SRR24874405,SRR24874406
0,group,HC,HC,HC,HC,HC,HC,HC,HC,HC,...,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS
1,region,Europe,Europe,Europe,Europe,Europe,Europe,Europe,Europe,Europe,...,Asia,Asia,Asia,Asia,Asia,Asia,Asia,Asia,Asia,Asia
2,T,HC,HC,HC,HC,HC,HC,HC,HC,HC,...,,,,,,,,,,
3,01D2Z36,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0319-6G20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,Yersinia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1128,Youngiibacter,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1129,Zavarzinia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1130,Zoogloea,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# transpose and set columns
# Promote the `sample` labels (group, region, T, taxa...) to the index first,
# so that after transposing they become the column names and every row is one
# sequencing run.

pcostest = (
    pcosyang2024
    .set_index('sample')
    .T
)
pcostest

sample,group,region,T,01D2Z36,0319-6G20,0319-7L14,27F-1492R,37-13,67-14,A2,...,WPS-2,WWH38,Xanthomonas,Xenorhabdus,Xylanibacillus,Yersinia,Youngiibacter,Zavarzinia,Zoogloea,ZOR0006
SRR4457864,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR4457873,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR4457876,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR4457877,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR4457878,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR24874401,PCOS,Asia,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR24874402,PCOS,Asia,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR24874404,PCOS,Asia,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR24874405,PCOS,Asia,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# split based on group
# One frame per value of the `group` column: 'HC' and 'PCOS'.

group_label = pcostest['group']
pcostesthc = pcostest.loc[group_label.eq('HC')]
pcostestpcos = pcostest.loc[group_label.eq('PCOS')]

In [22]:
def run_logistic_lasso(random_state, n_per_group=100):
    """Fit two L1-penalised logistic regressions on a balanced subsample and print a summary.

    Draws ``n_per_group`` rows from ``pcostesthc`` and ``n_per_group`` rows from
    ``pcostestpcos`` (both notebook-level frames), encodes ``region`` as 0 for
    'Europe' and 1 otherwise, drops the ``group`` and ``T`` columns from the
    features, and uses ``group`` (0 for 'HC', 1 otherwise) as the outcome.

    Two ``LogisticRegressionCV`` models are fitted with 5-fold CV, the L1
    penalty and the liblinear solver:

    * a "constrained" model whose candidate C values are 20 evenly spaced
      points in [0.01, 1];
    * a "default" model with ``Cs=20``, letting scikit-learn choose the grid.

    Parameters
    ----------
    random_state : int or numpy RandomState
        Seed used for both subsample draws AND for the two estimators.
        liblinear shuffles the data internally, so the estimators must be
        seeded as well for a run to be reproducible.
    n_per_group : int, default 100
        Number of rows drawn from each of the HC and PCOS frames.

    Returns
    -------
    None
        Results are printed: number of non-zero coefficients, selected C, and
        the score on the same data the model was fitted on (a training score,
        not a held-out estimate).
    """
    # Balanced subsample: the same number of HC and PCOS rows.
    subsample = pd.concat([
        pcostesthc.sample(n_per_group, random_state=random_state),
        pcostestpcos.sample(n_per_group, random_state=random_state),
    ])

    # Build the feature matrix without mutating `subsample`, so the outcome
    # below is read from an untouched frame.
    X = (
        subsample
        .drop(columns=['group', 'T'])
        .assign(region=lambda frame: frame['region'].ne('Europe').astype('int64'))
    )
    Y = subsample['group'].ne('HC').astype('int64')

    # Settings shared by both fits; only the grid of candidate C values differs.
    shared_settings = dict(
        cv=5,
        max_iter=10000,
        penalty='l1',
        solver='liblinear',
        random_state=random_state,
    )

    # smaller Cs = stronger regularization
    model = LogisticRegressionCV(Cs=np.linspace(0.01, 1, 20), **shared_settings)
    model_def = LogisticRegressionCV(Cs=20, **shared_settings)

    model.fit(X, Y)
    model_def.fit(X, Y)

    print("Constrained-Cs-below-1 model")
    print("... Number of Nonzero Covariates: ", np.count_nonzero(model.coef_))
    print("... Best C: ", model.C_)
    print("... Model Score On Full Training: ", model.score(X, Y), "\n")
    print("Default-Cs model")
    print("... Number of Nonzero Covariates: ", np.count_nonzero(model_def.coef_))
    print("... Best C: ", model_def.C_)
    print("... Model Score on Full Training: ", model_def.score(X, Y))
        

In [26]:
random.seed(1)  # seeds Python's built-in `random` module

# Repeat the experiment for three subsampling seeds, separating the printed
# reports with the same blank-line spacer between consecutive runs.
for run_index, seed in enumerate((1, 100, 345)):
    if run_index > 0:
        print("\n\n")
    run_logistic_lasso(seed)

Constrained-Cs-below-1 model
... Number of Nonzero Covariates:  29
... Best C:  [0.11421053]
... Model Score On Full Training:  0.77 

Default-Cs model
... Number of Nonzero Covariates:  12
... Best C:  [0.03359818]
... Model Score on Full Training:  0.685



Constrained-Cs-below-1 model
... Number of Nonzero Covariates:  60
... Best C:  [0.27052632]
... Model Score On Full Training:  0.82 

Default-Cs model
... Number of Nonzero Covariates:  111
... Best C:  [11.28837892]
... Model Score on Full Training:  1.0



Constrained-Cs-below-1 model
... Number of Nonzero Covariates:  45
... Best C:  [0.27052632]
... Model Score On Full Training:  0.77 

Default-Cs model
... Number of Nonzero Covariates:  44
... Best C:  [0.23357215]
... Model Score on Full Training:  0.77


# SCRATCH

In [55]:
# create a balanced subsample (100 HC + 100 PCOS) on which the logistic
# regression model w/ LASSO penalty is trained

# Fixed seed so the same 200 rows are drawn on every re-run of this cell;
# without it the downstream coefficients and scores change each time.
SCRATCH_SEED = 0

pcostesthc = pcostest[pcostest['group'] == 'HC']
pcostestpcos = pcostest[pcostest['group'] == 'PCOS']
pcostestloglasso = pd.concat([
    pcostesthc.sample(100, random_state=SCRATCH_SEED),
    pcostestpcos.sample(100, random_state=SCRATCH_SEED),
])

In [56]:
# numerize all other variables, create feature matrix
#
# Built as a new frame instead of assigning into `pcostestloglasso`: the
# in-place version re-encoded `region` on every re-run of this cell (values
# already turned into 0/1 are never equal to 'Europe', so they all became 1).

X = (
    pcostestloglasso
    .drop(columns=['group', 'T'])   # outcome column and the `T` label are not features
    .assign(region=lambda frame: frame['region'].ne('Europe').astype('int64'))  # Europe -> 0, else -> 1
)
X

sample,region,01D2Z36,0319-6G20,0319-7L14,27F-1492R,37-13,67-14,A2,Abiotrophia,Absconditabacteriales_(SR1),...,WPS-2,WWH38,Xanthomonas,Xenorhabdus,Xylanibacillus,Yersinia,Youngiibacter,Zavarzinia,Zoogloea,ZOR0006
SRR17174561,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR22346267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR4159408,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR2168049,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR24874387,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR4159412,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR22255643,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR17174510,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR2168041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# numerize outcome vector
# 'HC' -> 0, every other group label -> 1

Y = pcostestloglasso['group'].ne('HC').astype('int64')
Y

SRR17174561    0
SRR22346267    0
SRR4159408     0
ERR2168049     0
SRR24874387    0
              ..
SRR4159412     1
SRR22255643    1
SRR17174510    1
ERR2168041     1
SRR23476479    1
Name: group, Length: 200, dtype: int64

In [89]:
# run logistic regression with l1 penalty (LASSO on classification)

# settings shared by both fits; the two models differ only in their C grid
lasso_cv_settings = {
    'cv': 5,
    'max_iter': 10000,
    'penalty': 'l1',
    'solver': 'liblinear',
}

# smaller Cs = stronger regularization
# constrained model: 20 evenly spaced candidate values in [0.01, 1]
model = LogisticRegressionCV(Cs=np.linspace(0.01, 1, 20), **lasso_cv_settings)

# default model: an integer Cs leaves the choice of the 20-value grid to scikit-learn
model_def = LogisticRegressionCV(Cs=20, **lasso_cv_settings)

model.fit(X, Y)
model_def.fit(X, Y)

In [94]:
# number of nonzero covariates

def no_nonzero_covariates(coef):
    """Return how many entries of ``coef`` are not exactly zero.

    ``coef`` is any array-like of coefficients (for example a fitted model's
    ``coef_``); the count is taken over all of its elements.
    """
    is_active = np.asarray(coef) != 0
    return is_active.sum()

# Same three-line report for each fitted model: non-zero coefficient count,
# selected C, and the score on the data the model was fitted on.
for heading, score_label, fitted in (
    ("Constrained Cs below 1 model: ", "... Model Score On Full Training: ", model),
    ("Default Cs model: ", "... Model Score on Full Training: ", model_def),
):
    print(heading, no_nonzero_covariates(fitted.coef_))
    print("... Best C: ", fitted.C_)
    print(score_label, fitted.score(X, Y))

Constrained Cs below 1 model:  40
... Best C:  [0.16631579]
... Model Score On Full Training:  0.77
Default Cs model:  46
... Best C:  [0.23357215]
... Model Score on Full Training:  0.775


In [91]:
# Score of the constrained-Cs model on X, Y -- the same data passed to
# model.fit, so this is a training score rather than a held-out estimate.
model.score(X, Y)

0.77

In [92]:
# Cross-validation scores recorded while fitting. In the output below this is
# a dict keyed by the class label (1) holding a 5 x 20 array, matching cv=5
# folds (rows) and the 20 candidate C values (columns).
model.scores_

{np.int64(1): array([[0.475, 0.7  , 0.65 , 0.725, 0.7  , 0.7  , 0.675, 0.65 , 0.65 ,
         0.625, 0.6  , 0.5  , 0.475, 0.45 , 0.425, 0.375, 0.375, 0.4  ,
         0.4  , 0.4  ],
        [0.45 , 0.5  , 0.625, 0.725, 0.7  , 0.725, 0.7  , 0.65 , 0.65 ,
         0.65 , 0.6  , 0.6  , 0.5  , 0.525, 0.55 , 0.55 , 0.575, 0.55 ,
         0.55 , 0.575],
        [0.5  , 0.55 , 0.475, 0.55 , 0.55 , 0.55 , 0.575, 0.575, 0.575,
         0.575, 0.55 , 0.55 , 0.55 , 0.55 , 0.525, 0.5  , 0.5  , 0.5  ,
         0.5  , 0.5  ],
        [0.35 , 0.45 , 0.475, 0.55 , 0.575, 0.575, 0.55 , 0.55 , 0.6  ,
         0.575, 0.575, 0.55 , 0.55 , 0.55 , 0.575, 0.575, 0.575, 0.55 ,
         0.525, 0.5  ],
        [0.5  , 0.625, 0.675, 0.625, 0.6  , 0.6  , 0.6  , 0.575, 0.575,
         0.575, 0.6  , 0.65 , 0.65 , 0.675, 0.675, 0.65 , 0.65 , 0.65 ,
         0.65 , 0.65 ]])}

In [93]:
# Candidate C values searched by the constrained model; the output matches
# the np.linspace(0.01, 1, 20) grid passed as `Cs`.
model.Cs_

array([0.01      , 0.06210526, 0.11421053, 0.16631579, 0.21842105,
       0.27052632, 0.32263158, 0.37473684, 0.42684211, 0.47894737,
       0.53105263, 0.58315789, 0.63526316, 0.68736842, 0.73947368,
       0.79157895, 0.84368421, 0.89578947, 0.94789474, 1.        ])

In [83]:
# C value selected by cross-validation.
# NOTE(review): the saved output (0.23357215) is not a point on `model`'s
# linspace grid and equals the best C printed for the default-Cs model; this
# cell's execution count (In [83]) also predates the fitting cell (In [89]).
# The output is presumably stale -- re-run after fitting to confirm.
model.C_

array([0.23357215])

In [113]:
# Keep only the non-zero entries of `loglassocoefs`.
# NOTE(review): `loglassocoefs` is not defined in any cell visible here;
# presumably it holds fitted model coefficients created in a cell that was
# since removed -- define it explicitly above this line (TODO confirm source).
loglassocoefs[loglassocoefs!=0]

array([1.09167206e-03, 7.00186708e-03, 1.10214697e-02, 1.17485827e-02,
       1.29336163e-02, 1.74380774e-02, 1.91146009e-02, 2.06445442e-02,
       3.22982113e-02, 3.25036809e-02, 3.85776633e-02, 3.86565965e-02,
       4.40085667e-02, 4.74012980e-02, 5.11742948e-02, 5.74871930e-02,
       5.92653719e-02, 6.16917010e-02, 6.20675585e-02, 6.49027358e-02,
       7.04982764e-02, 7.18919936e-02, 7.21916139e-02, 7.82333495e-02,
       8.28356670e-02, 8.55529989e-02, 8.95057794e-02, 9.24180804e-02,
       9.45023599e-02, 9.50926419e-02, 1.18174301e-01, 1.32580978e-01,
       1.47692274e-01, 1.49950860e-01, 1.54476435e-01, 1.55527812e-01,
       1.59365623e-01, 1.71761267e-01, 1.71849356e-01, 1.84177022e-01,
       2.00177110e-01, 2.08945947e-01, 2.34747000e-01, 2.40843867e-01,
       2.42182590e-01, 2.50509090e-01, 2.51476018e-01, 2.53474957e-01,
       2.92994355e-01, 3.06159856e-01, 3.20157075e-01, 3.36272145e-01,
       3.47431398e-01, 3.54554554e-01, 3.76838582e-01, 4.07142458e-01,
      