In [None]:
!pip install GEOparse

In [None]:
import GEOparse

# Fetch the GEO record for a series accession (GSE12345 is a placeholder),
# passing the current directory as destdir.
gse = GEOparse.get_GEO(
    geo="GSE12345",
    destdir=".",
    how="full",
)



In [None]:
# Inspect the series-level metadata attached to the fetched record
series_metadata = gse.metadata
print(series_metadata)




In [None]:
# Show the sample accessions stored under the series' 'sample_id' metadata key
sample_ids = gse.metadata['sample_id']
print(sample_ids)

In [None]:
# Fetch a single sample record and view its metadata
gsm = GEOparse.get_GEO(
    geo='GSM6637162',
    destdir=".",
    how="full",
)
print(gsm.metadata)

In [None]:
# Read the whole saved listing into memory as a single string.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the export.
with open("liver_cancer_geo.txt", "r", encoding="utf-8") as file:
    content = file.read()

In [None]:
import re
# One match per listing entry. The nine capture groups are, in order:
# entry number, title, description, organism, type, platform(s),
# FTP download, series accession and numeric ID.
pattern = r'(\d+)\.\s(.*?)\n\(Submitter supplied\)(.*?)Organism:\s*(.*?)\nType:\s*(.*?)\nPlatform(?:s)?:\s*(.*?)\nFTP download:\s*(.*?)\nSeries\s*Accession:\s*(.*?)\s*ID:\s*(\d+)'

# re.DOTALL lets '.' cross line breaks, so the lazy groups can span several lines
matches = re.findall(pattern, content, re.DOTALL)

# Dictionary keys for capture groups 2-9, in pattern order
text_fields = (
    'title',
    'description',
    'organism',
    'type',
    'platforms',
    'ftp_download',
    'accession',
    'id',
)

# One dictionary per entry: the entry number is stored as captured,
# every other field has its surrounding whitespace stripped
datasets = [
    {'index': number, **{name: text.strip() for name, text in zip(text_fields, texts)}}
    for number, *texts in matches
]

In [None]:
def create_record(raw_geo):
    """Build one flat metadata record per sample of a GEO series.

    Args:
        raw_geo: Mapping with an 'accession' key holding the series accession.

    Returns:
        A list with one dict per entry of the series' 'sample_id' metadata.
        Each dict holds the series 'title', 'summary' and 'design' metadata
        (empty string when absent), the series accession under 'gse', the
        sample's 'geo_accession' metadata under 'gsm', and one key per
        'characteristics_ch1' entry of the sample. An empty list is returned
        when the series lists no samples.

    Raises:
        KeyError: if raw_geo has no 'accession' key or a sample record has no
            'geo_accession' metadata.
    """
    acc = raw_geo['accession']
    gse = GEOparse.get_GEO(geo=acc, destdir="geo/", how="full")

    samples = gse.metadata.get('sample_id', '')
    if not samples:
        return []

    title = gse.metadata.get('title', '')
    summary = gse.metadata.get('summary', '')
    design = gse.metadata.get('design', '')

    new_data = []
    for sample in samples:
        gsm = GEOparse.get_GEO(geo=sample, destdir="geo/"+acc+"/", how="full")
        attributes_dict = {
            'title': title,
            'summary': summary,
            'design': design,
            'gse': acc,
            'gsm': gsm.metadata['geo_accession'],
        }
        for item in gsm.metadata.get('characteristics_ch1', '') or ():
            try:
                # Split on the FIRST colon only: a value may itself contain
                # colons (e.g. "time: 12:30") and must not be discarded.
                key, value = item.split(":", 1)
            except ValueError:
                # No colon at all: keep the raw entry as a key with an empty value
                attributes_dict[item] = ''
                print(f"Skipping invalid entry: {item}")
            else:
                attributes_dict[key.strip()] = value.strip()
        new_data.append(attributes_dict)
    return new_data
       
    

In [None]:
# Try the record builder on the first parsed listing entry
first_entry = datasets[0]
ans = create_record(first_entry)

In [None]:
# For each record, print its 'tissue' value and then its 'summary' value,
# one per line (an empty string stands in for a missing key)
for record in ans:
    tissue = record.get('tissue', '')
    summary = record.get('summary', '')
    print(tissue, summary, sep='\n')

In [None]:
# Gather per-sample records from every listing entry whose title mentions
# liver cancer, stopping once more than 1000 records have been collected.
final_data = []
for d in datasets:
    # Compare case-insensitively so titles such as "Liver cancer ..." are kept
    if 'liver cancer' in d['title'].lower():
        ans = create_record(d)
        final_data.extend(ans)
        if len(final_data) > 1000:
            break
        

In [None]:
# Report how many sample records were collected
total_records = len(final_data)
print(total_records)

In [None]:
import random
# Draw up to SAMPLE_SIZE records without replacement.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at len(final_data).
# A dedicated seeded generator makes the draw repeatable for the same input
# and leaves the module-level random state untouched.
SAMPLE_SIZE = 800
SAMPLE_SEED = 42
random_samples = random.Random(SAMPLE_SEED).sample(
    final_data, min(SAMPLE_SIZE, len(final_data))
)

In [None]:
# Report how many records ended up in the random subset
sampled_count = len(random_samples)
print(sampled_count)

In [None]:
import json
# Persist the sampled records as pretty-printed JSON (4-space indentation).
# NOTE(review): the output is named 'ovarian_cancer_geo.json' although the
# records come from liver_cancer_geo.txt and are filtered on 'liver cancer'
# — confirm the intended filename.
with open('ovarian_cancer_geo.json', 'w') as json_file:
    serialized = json.dumps(random_samples, indent=4)
    json_file.write(serialized)