### Extracting metadata from the ARCHS4 data
The ARCHS4 authors provide a convenient Python package for this (`archs4py`), so we will use it.

In [1]:
import archs4py as a4
import numpy as np
import pandas as pd
import re

In [2]:
# Path to the ARCHS4 HDF5 file (the file name indicates human gene-level data, v2.3).
# It is passed to the archs4py metadata search and to the count extraction below.
file = "/rds/project/jmmh2/rds-jmmh2-public_databases/ARCHS4/human_gene_v2.3.h5"

In [3]:
# Search the ARCHS4 metadata with the term "whole blood", keeping only the series
# ID and the free-text characteristics field for the matching samples.
# NOTE: the variable is still called `macrophage_data` because the later cells
# refer to it by that name, but the search term used here is "whole blood".
macrophage_data = a4.meta.meta(
    file, "whole blood", meta_fields=["series_id", "characteristics_ch1"]
)
# Keep only samples whose characteristics mention "sex" as a whole word
# (case-insensitive). The pattern deliberately has no capture group: pandas
# emits a "has match groups" UserWarning from `str.contains` whenever the
# regular expression contains one.
pattern = re.compile(r"(?i)\bsex\b")
# `.copy()` makes the filtered frame independent of the unfiltered one, so that
# adding columns to it in later cells cannot raise a SettingWithCopyWarning.
macrophage_data = macrophage_data[
    macrophage_data["characteristics_ch1"].str.contains(pattern, na=False)
].copy()

100%|██████████| 2/2 [00:00<00:00,  2.52it/s]
  macrophage_data["characteristics_ch1"].str.contains(pattern, na=False)


In [4]:
# Load the expression counts for exactly the samples that survived the
# metadata filter (identified by the index of the metadata frame).
sample_counts = a4.data.samples(
    file,
    macrophage_data.index.tolist(),
)

100%|██████████| 6429/6429 [00:41<00:00, 154.59it/s]


In [6]:
# Trim the count matrix a little; we will probably end up using only ~10 genes anyway.
# The filters are applied one after another, each on the result of the previous one:
#   1. drop rows whose index label contains "ENSG000"
#   2. keep rows whose total count across all samples exceeds 100
#   3. keep rows whose mean count across all samples exceeds 10
sample_counts = (
    sample_counts
    .loc[lambda counts: ~counts.index.str.contains("ENSG000")]
    .loc[lambda counts: counts.sum(axis=1) > 100, :]
    .loc[lambda counts: counts.mean(axis=1) > 10, :]
)

In [8]:
# Extract the reported sex from the free-text metadata: the value is everything
# that follows "SEX: " up to (but not including) the next comma.
pattern = r"SEX: (?P<SEX>[^,]+)"

# Apply the regex to the 'characteristics_ch1' column.
# - expand=False makes `str.extract` return a Series (one value per sample)
#   rather than a one-column DataFrame, which is what a single new column needs.
# - flags=re.IGNORECASE keeps the extraction consistent with the case-insensitive
#   "sex" filter applied when the samples were selected; it changes nothing if
#   the characteristics text is already upper-case.
# - `.assign` builds a new frame instead of writing into a filtered one, which
#   avoids a SettingWithCopyWarning.
# Samples whose characteristics do not match the pattern get NaN in `sex`.
macrophage_data = macrophage_data.assign(
    sex=macrophage_data["characteristics_ch1"].str.extract(
        pattern, flags=re.IGNORECASE, expand=False
    )
).loc[:, ["series_id", "sex"]]

In [11]:
# Both tables are written without their index (index=False), so the identifiers
# held in the index are first copied into a regular column.

# Sample metadata: one row per sample, sample ID stored in `sampleID`.
macrophage_data["sampleID"] = macrophage_data.index
macrophage_data.to_csv(
    path_or_buf="/rds/user/nh608/hpc-work/oxLDL/sex_inference/mdata_whole_blood.tsv",
    sep="\t",
    index=False,
)

# Count matrix: one row per gene, row label stored in `gene`.
sample_counts["gene"] = sample_counts.index
sample_counts.to_csv(
    path_or_buf="/rds/user/nh608/hpc-work/oxLDL/sex_inference/counts_whole_blood.tsv.gz",
    sep="\t",
    index=False,
)