In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegressionCV

# Choosing the PCOS dataset
- `pcosyang2024`: individual data meta analysis on 14 cohort studies
- `pcosli2023`: systematic review and meta analysis on 28 studies
- `pcossolaleyva2023`: systematic review and meta analysis on 17 gut microbe studies (investigated oral cavity, blood, vagina/cervix, gut sites)

In [5]:
# pcosyang2024
pcosyang2024 = pd.read_excel("../data/raw/pcosyang2024.xlsx", engine="openpyxl")
pcosyang2024

Unnamed: 0,sample,SRR4457864,SRR4457873,SRR4457876,SRR4457877,SRR4457878,SRR4457879,SRR4457880,SRR4457881,SRR4457882,...,SRR24874396,SRR24874397,SRR24874398,SRR24874399,SRR24874400,SRR24874401,SRR24874402,SRR24874404,SRR24874405,SRR24874406
0,group,HC,HC,HC,HC,HC,HC,HC,HC,HC,...,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS,PCOS
1,region,Europe,Europe,Europe,Europe,Europe,Europe,Europe,Europe,Europe,...,Asia,Asia,Asia,Asia,Asia,Asia,Asia,Asia,Asia,Asia
2,T,HC,HC,HC,HC,HC,HC,HC,HC,HC,...,,,,,,,,,,
3,01D2Z36,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0319-6G20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,Yersinia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1128,Youngiibacter,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1129,Zavarzinia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1130,Zoogloea,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
pcostest = pcosyang2024.T
pcostest.columns = pcostest.iloc[0, :]
pcostest = pcostest.iloc[1:, :]
pcostest

sample,group,region,T,01D2Z36,0319-6G20,0319-7L14,27F-1492R,37-13,67-14,A2,...,WPS-2,WWH38,Xanthomonas,Xenorhabdus,Xylanibacillus,Yersinia,Youngiibacter,Zavarzinia,Zoogloea,ZOR0006
SRR4457864,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR4457873,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR4457876,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR4457877,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR4457878,HC,Europe,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR24874401,PCOS,Asia,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR24874402,PCOS,Asia,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR24874404,PCOS,Asia,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR24874405,PCOS,Asia,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
pcostesthc = pcostest[pcostest['group'] == 'HC']
pcostestpcos = pcostest[pcostest['group'] == 'PCOS']
pcostestloglasso = pd.concat([pcostesthc.sample(100), pcostestpcos.sample(100)])

In [48]:
X = pcostestloglasso
X['region'] = X['region'].apply(lambda x: 0 if x == 'Europe' else 1)
X = X.iloc[:, 1:]
X = X.drop(columns=['T'])
X

sample,region,01D2Z36,0319-6G20,0319-7L14,27F-1492R,37-13,67-14,A2,Abiotrophia,Absconditabacteriales_(SR1),...,WPS-2,WWH38,Xanthomonas,Xenorhabdus,Xylanibacillus,Yersinia,Youngiibacter,Zavarzinia,Zoogloea,ZOR0006
SRR22346206,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR14806025,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR14806079,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR13526484,1,0,0,0,0,0,0,0,0,0,...,0,0,0.000029,0.000029,0.007356,0,0,0,0,0
SRR14806030,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR4457900,1,0,0,0,0,0,0,0,0.00342,0,...,0,0,0,0,0,0,0,0,0,0
SRR4159391,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR14805970,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR11490203,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
Y = pcostestloglasso['group'].apply(lambda x: 0 if x == 'HC' else 1)
Y

SRR22346206    0
SRR14806025    0
SRR14806079    0
SRR13526484    0
SRR14806030    0
              ..
SRR4457900     1
SRR4159391     1
SRR14805970    1
SRR11490203    1
SRR11490200    1
Name: group, Length: 200, dtype: int64

In [87]:
np.linspace(0.1, 5, 20)

array([0.1       , 0.35789474, 0.61578947, 0.87368421, 1.13157895,
       1.38947368, 1.64736842, 1.90526316, 2.16315789, 2.42105263,
       2.67894737, 2.93684211, 3.19473684, 3.45263158, 3.71052632,
       3.96842105, 4.22631579, 4.48421053, 4.74210526, 5.        ])

In [108]:
# smaller Cs = stronger regularization
model = LogisticRegressionCV(
    Cs = np.linspace(0.01, 1, 30),
    cv = 5,
    penalty='l1',
    solver='liblinear')

model.fit(X, Y)

In [109]:
loglassocoefs = np.sort(np.abs(model.coef_))
np.sum(loglassocoefs != 0)

np.int64(66)

In [110]:
model.score(X, Y)

0.9

In [111]:
model.C_

array([0.59034483])

In [113]:
loglassocoefs[loglassocoefs!=0]

array([1.09167206e-03, 7.00186708e-03, 1.10214697e-02, 1.17485827e-02,
       1.29336163e-02, 1.74380774e-02, 1.91146009e-02, 2.06445442e-02,
       3.22982113e-02, 3.25036809e-02, 3.85776633e-02, 3.86565965e-02,
       4.40085667e-02, 4.74012980e-02, 5.11742948e-02, 5.74871930e-02,
       5.92653719e-02, 6.16917010e-02, 6.20675585e-02, 6.49027358e-02,
       7.04982764e-02, 7.18919936e-02, 7.21916139e-02, 7.82333495e-02,
       8.28356670e-02, 8.55529989e-02, 8.95057794e-02, 9.24180804e-02,
       9.45023599e-02, 9.50926419e-02, 1.18174301e-01, 1.32580978e-01,
       1.47692274e-01, 1.49950860e-01, 1.54476435e-01, 1.55527812e-01,
       1.59365623e-01, 1.71761267e-01, 1.71849356e-01, 1.84177022e-01,
       2.00177110e-01, 2.08945947e-01, 2.34747000e-01, 2.40843867e-01,
       2.42182590e-01, 2.50509090e-01, 2.51476018e-01, 2.53474957e-01,
       2.92994355e-01, 3.06159856e-01, 3.20157075e-01, 3.36272145e-01,
       3.47431398e-01, 3.54554554e-01, 3.76838582e-01, 4.07142458e-01,
      