In [1]:
import gzip
import pandas as pd
import re

In [2]:
# Location of the gzip-compressed SOFT file for series GSE138260; it is opened
# with gzip.open in the parsing cell. NOTE(review): this is an absolute,
# machine-specific path, so the notebook only runs where this file exists.
soft_file = "/home/mlopez/Desktop/alzheimer/data/GSE138260/GSE138260_family.soft.gz"

In [3]:
# Collect one metadata dict per sample from the SOFT file.
# Each dict may hold: Accession, Title, Sex, Age, DiseaseStatus.
samples = []
sample = {}

# Characteristic name (lower-cased) -> key stored in the sample dict
characteristic_columns = {
    "sex": "Sex",
    "age": "Age",
    "disease status": "DiseaseStatus",
}

with gzip.open(soft_file, 'rt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Split on the FIRST " = " only, so a value that itself contains
        # " = " is kept whole; a line without the separator yields "" here
        # instead of raising IndexError.
        value = line.partition(" = ")[2]
        if line.startswith("^SAMPLE"):
            # A new sample record begins: store the previous one first
            if sample:
                samples.append(sample)
            sample = {"Accession": value}
        elif line.startswith("!Sample_title"):
            # Keep only the part of the title before the first comma
            sample["Title"] = value.split(",")[0].strip()
        elif line.startswith("!Sample_characteristics_ch1"):
            # "name: content" -- split on the first colon only so content
            # that contains further colons is not truncated
            name, colon, content = value.partition(":")
            column = characteristic_columns.get(name.strip().lower())
            if colon and column is not None:
                sample[column] = content.strip()

    # The last sample has no following "^SAMPLE" line to flush it
    if sample:
        samples.append(sample)

# Build the DataFrame: one row per parsed sample
df = pd.DataFrame(samples)

# Brain region: everything before the first whole-word " AD" or " control";
# the whole title when neither label is present. The \b keeps the cut from
# landing inside a longer word that merely starts with "AD" or "control".
df["BrainRegion"] = (
    df["Title"]
    .str.extract(r"^(.*?)(?: AD\b| control\b|$)", expand=False)
    .str.strip()
)

# Group: taken from the first stand-alone "AD" / "control" token in the title.
# A plain substring test ("AD" in title) would also fire on identifiers that
# merely contain the letters "AD" (e.g. "ADC12"), mislabelling controls.
# Titles with no recognisable label fall back to CONTROL.
group_label = df["Title"].str.extract(r"(?<!\S)(AD|control)\b", expand=False)
df["Group"] = group_label.map({"AD": "AD", "control": "CONTROL"}).fillna("CONTROL")

# Sample ID: text inside the first pair of parentheses (missing when absent)
df["SampleID"] = df["Title"].str.extract(r"\((.*?)\)", expand=False)

# 'Title' is no longer needed; rebind instead of mutating in place so the
# cell behaves the same when re-run
df = df.drop(columns=["Title"])

# Save the DataFrame to a CSV file
df.to_csv("/home/mlopez/Desktop/alzheimer/results/GSE138260/GSE138260_metadata.csv", index=False)