### Extracting metadata from the ARCHS4 data
The ARCHS4 authors provide a Python package (`archs4py`) for this, so we will use it.

In [1]:
import archs4py as a4
import numpy as np
import pandas as pd
import re

In [2]:
# Path to the ARCHS4 human gene-level HDF5 file (v2.3, going by the filename).
# It is passed as the first argument to the archs4py metadata and count
# extraction calls in the cells below.
file = "/rds/project/jmmh2/rds-jmmh2-public_databases/ARCHS4/human_gene_v2.3.h5"

In [10]:
# Pull the metadata of every sample matching the "macrophage" search term,
# then keep only the samples whose characteristics field mentions "sex" as a
# whole word (case-insensitive).
macrophage_data = a4.meta.meta(
    file, "macrophage", meta_fields=["series_id", "characteristics_ch1"]
)
# The pattern deliberately has no capture group: `str.contains` only needs a
# yes/no answer, and a capture group makes pandas emit a
# "has match groups" UserWarning. Case-insensitivity is requested via
# `case=False` instead of an inline flag.
pattern = r"\bsex\b"
macrophage_data = macrophage_data[
    macrophage_data["characteristics_ch1"].str.contains(
        pattern, case=False, regex=True, na=False  # na=False drops missing fields
    )
]

100%|██████████| 2/2 [00:00<00:00,  2.35it/s]
  macrophage_data["characteristics_ch1"].str.contains(pattern, na=False)


In [3]:
# Query the ARCHS4 metadata for the "macrophage" search term, requesting only
# the series ID and the characteristics fields.
# NOTE(review): this assignment replaces any previously filtered
# `macrophage_data` with the unfiltered query result.
requested_fields = ["series_id", "characteristics_ch1"]
macrophage_data = a4.meta.meta(file, "macrophage", meta_fields=requested_fields)

100%|██████████| 2/2 [00:00<00:00,  2.54it/s]


In [6]:
# Keep only the samples whose characteristics field mentions "sex" as a whole
# word (case-insensitive). The pattern has no capture group because
# `str.contains` only needs a yes/no answer; a capture group makes pandas emit
# a "has match groups" UserWarning.
pattern = r"\bsex\b"
macrophage_data = macrophage_data[
    macrophage_data["characteristics_ch1"].str.contains(
        pattern, case=False, regex=True, na=False  # na=False drops missing fields
    )
]

  macrophage_data["characteristics_ch1"].str.contains(pattern, na=False)


In [14]:
# Fetch the expression counts for exactly the samples that survived the
# metadata filtering (identified by the index labels of `macrophage_data`).
macrophage_sample_ids = macrophage_data.index.to_list()
sample_counts = a4.data.samples(file, macrophage_sample_ids)

100%|██████████| 1128/1128 [00:14<00:00, 76.19it/s]


In [15]:
# Filter the counts a bit; we will probably end up using only ~10 genes anyway.

# 1) drop rows whose label contains "ENSG000"
#    (presumably unnamed Ensembl-ID entries -- TODO confirm)
not_ensembl_id = ~sample_counts.index.str.contains("ENSG000")
sample_counts = sample_counts.loc[not_ensembl_id]

# 2) keep rows whose total across all columns exceeds 100
row_total = sample_counts.sum(axis=1)
sample_counts = sample_counts.loc[row_total > 100]

# 3) keep rows whose mean across all columns exceeds 10
row_mean = sample_counts.mean(axis=1)
sample_counts = sample_counts.loc[row_mean > 10]

In [17]:
# Preview the sex annotation parsed out of the characteristics string.
# The match is case-insensitive (like the earlier "sex" filter) and tolerates
# any amount of whitespace after the colon. The captured value runs up to the
# next comma, with trailing whitespace excluded.
pattern = r"(?i)SEX:\s*(?P<SEX>[^,]*[^,\s])"

# Apply the regex to the 'characteristics_ch1' column; the named group becomes
# the "SEX" column of the displayed frame (NaN where nothing matched).
macrophage_data["characteristics_ch1"].str.extract(pattern)

Unnamed: 0,SEX
GSM2097611,MALE
GSM2097612,FEMALE
GSM2097613,FEMALE
GSM2097614,MALE
GSM2097615,MALE
...,...
GSM7592798,M
GSM7592799,M
GSM7592801,F
GSM7592802,M


In [22]:
# Extract the actual sex from the metadata.
# The match is case-insensitive (like the earlier "sex" filter) and tolerates
# any amount of whitespace after the colon. The captured value runs up to the
# next comma, with trailing whitespace excluded.
pattern = r"(?i)SEX:\s*(?P<SEX>[^,]*[^,\s])"

# Apply the regex to the 'characteristics_ch1' column. `expand=False` returns a
# Series for the single group, so a plain column (not a one-column DataFrame)
# is assigned. Samples without a parsable value get NaN.
macrophage_data["sex"] = macrophage_data["characteristics_ch1"].str.extract(
    pattern, expand=False
)
# NOTE: this drops 'characteristics_ch1', so the cell cannot be re-run without
# first re-creating `macrophage_data`.
macrophage_data = macrophage_data.loc[:, ["series_id", "sex"]]

In [25]:
# The tables are written without their index, so copy the index labels into
# ordinary columns first; otherwise the identifiers would be lost on disk.
macrophage_data["sampleID"] = macrophage_data.index
sample_counts["gene"] = sample_counts.index

# Save both tables to disk as tab-separated files (metadata first, then counts).
for frame, destination in (
    (
        macrophage_data,
        "/rds/user/nh608/hpc-work/oxLDL/sex_inference/mdata_macrophage.tsv",
    ),
    (
        sample_counts,
        "/rds/user/nh608/hpc-work/oxLDL/sex_inference/counts_macrophage.tsv",
    ),
):
    frame.to_csv(destination, sep="\t", index=False)