In [2]:
import wget
import os
import pandas as pd
import GEOparse as geo

def load_file_as_df(filename):
    """Load a GSE40279 file, downloading it first if it is not on disk.

    Parameters
    ----------
    filename : str
        Name of the file to load. When the name contains ``'matrix'`` the
        whole GSE40279 series is fetched through ``GEOparse.get_GEO`` (into
        ``./``) and that object is returned as-is rather than a DataFrame.
        Any other name is treated as a gzip-compressed, tab-separated
        supplementary file of GSE40279.

    Returns
    -------
    The result of ``geo.get_GEO`` for series-matrix names; otherwise a
    ``pandas.DataFrame`` indexed by the file's first column.
    """
    wants_series = 'matrix' in filename
    if wants_series:
        return geo.get_GEO(geo='GSE40279', destdir='./')

    # Only hit the network when the supplementary file is not already present.
    already_on_disk = os.path.exists(filename)
    if not already_on_disk:
        wget.download(
            f"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE40nnn/GSE40279/suppl/{filename}"
        )

    return pd.read_csv(filename, sep="\t", compression="gzip", index_col=0)

In [3]:
# The name contains "matrix", so load_file_as_df takes its GEOparse branch
# and returns the parsed GSE40279 series instead of a DataFrame.
series_matrix_filename = "GSE40279_series_matrix.txt.gz"
matrix = load_file_as_df(series_matrix_filename)

14-Dec-2022 21:25:29 DEBUG utils - Directory ./ already exists. Skipping.
14-Dec-2022 21:25:29 INFO GEOparse - File already exist: using local version.
14-Dec-2022 21:25:29 INFO GEOparse - Parsing ./GSE40279_family.soft.gz: 
14-Dec-2022 21:25:29 DEBUG GEOparse - DATABASE: GeoMiame
14-Dec-2022 21:25:29 DEBUG GEOparse - SERIES: GSE40279
14-Dec-2022 21:25:29 DEBUG GEOparse - PLATFORM: GPL13534
  return read_csv(StringIO(data), index_col=None, sep="\t")
14-Dec-2022 21:25:35 DEBUG GEOparse - SAMPLE: GSM989827
14-Dec-2022 21:25:36 DEBUG GEOparse - SAMPLE: GSM989828
14-Dec-2022 21:25:37 DEBUG GEOparse - SAMPLE: GSM989829
14-Dec-2022 21:25:38 DEBUG GEOparse - SAMPLE: GSM989830
14-Dec-2022 21:25:38 DEBUG GEOparse - SAMPLE: GSM989831
14-Dec-2022 21:25:39 DEBUG GEOparse - SAMPLE: GSM989832
14-Dec-2022 21:25:40 DEBUG GEOparse - SAMPLE: GSM989833
14-Dec-2022 21:25:41 DEBUG GEOparse - SAMPLE: GSM989834
14-Dec-2022 21:25:42 DEBUG GEOparse - SAMPLE: GSM989835
14-Dec-2022 21:25:43 DEBUG GEOparse - SAMP

In [25]:
# Peek at the first sample only: print its metadata fields and the head of
# its data table. Nothing is printed when the series holds no samples.
first_sample = next(iter(matrix.gsms.items()), None)
if first_sample is not None:
    sample_id, sample = first_sample
    print("Name: ", sample_id)
    print("Metadata:")
    for field, entries in sample.metadata.items():
        # Each metadata value is a list of strings; show them comma-joined.
        print(" - {} : {}".format(field, ", ".join(entries)))
    print("Table data:")
    print(sample.table.head())

Name:  GSM989827
Metadata:
 - title : age 67y 1001
 - geo_accession : GSM989827
 - status : Public on Nov 21 2012
 - submission_date : Aug 21 2012
 - last_update_date : Nov 21 2012
 - type : genomic
 - channel_count : 1
 - source_name_ch1 : X1001
 - organism_ch1 : Homo sapiens
 - taxid_ch1 : 9606
 - characteristics_ch1 : age (y): 67, source: UCSD, plate: 1, gender: F, ethnicity: Caucasian - European, tissue: whole blood
 - molecule_ch1 : genomic DNA
 - extract_protocol_ch1 : genomic DNA was extracted and purified Qiagen FlexiGene DNA kit
 - label_ch1 : Cy5 and Cy3
 - label_protocol_ch1 : Standard Illumina Protocol
 - hyb_protocol : bisulphite converted DNA was amplified, fragmented and hybridised to Illumina Infinium Human Methylation450 Beadchip using standard Illumina protocol
 - scan_protocol : Arrays were imaged using BeadArray Reader using standard recommended Illumina scanner setting
 - description : whole blood
 - data_processing : BeadStudio software v3.2
 - platform_id : GPL13

In [15]:
# Build a per-sample metadata frame (name, age, gender, ethnicity) from the series.
# Every `characteristics_ch1` entry is a "label: value" string (e.g. "age (y): 67"),
# so fields are looked up by label instead of by list position, and values are
# stripped of the whitespace that follows the colon (previously gender was
# stored as " F" with a leading space).
def _characteristics_by_label(gsm):
    """Return a sample's `characteristics_ch1` entries as a {label: value} dict."""
    parsed = {}
    for entry in gsm.metadata['characteristics_ch1']:
        # partition keeps any further ":" inside the value intact.
        label, _, value = entry.partition(":")
        parsed[label.strip()] = value.strip()
    return parsed


gsm_dict = {'name': [], 'age': [], 'gender': [], 'ethnicity': []}
for gsm_name, gsm in matrix.gsms.items():
    traits = _characteristics_by_label(gsm)
    gsm_dict['name'].append(gsm.metadata['source_name_ch1'][0])
    gsm_dict['age'].append(int(traits['age (y)']))
    gsm_dict['gender'].append(traits['gender'])
    # Ethnicity is kept as its initial letter only (e.g. "C" for "Caucasian - European").
    gsm_dict['ethnicity'].append(traits['ethnicity'][:1])
df = pd.DataFrame(gsm_dict)
df.head()

Unnamed: 0,name,age,gender,ethnicity
0,X1001,67,F,C
1,X1002,89,F,C
2,X1003,66,F,C
3,X1004,64,F,C
4,X1005,62,F,C


In [17]:
# Persist the sample metadata table as CSV. The row index is written too
# (pandas' default), so the file starts with an unnamed index column.
metadata_csv_path = "hannum_meta.csv"
df.to_csv(metadata_csv_path, index=True)