In [218]:
import os
import sys
import subprocess
import csv
import numpy as np
import pandas as pd
import glob

In [111]:
# Function to construct GEO URL
def construct_geo_url(geo_code):
    geo_prefix = geo_code[:6]  # First 6 characters (e.g., GSE224)
    file_name = f"{geo_code}_family.soft"
    geo_url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{geo_prefix}nnn/{geo_code}/soft/{file_name}.gz"
    return geo_url, file_name


In [165]:

def add_general_rows2csv(soft_file, head_l, row_number, samples_amount, output_csv):
    
    if head_l == "^DATABASE":
        head_n = "!Database"
    elif head_l == "^SERIES":
        head_n = "!Series"
    elif head_l == "^PLATFORM":
        head_n = "!Platform"
    else:
        head_n = None  # or some default value
        
    with open(output_csv, 'a') as cf:
        field_name = soft_file[row_number].split("=")[0].strip()
        field_name = field_name[1:]
        value = soft_file[row_number].split("=")[1].strip()
        # Create the row: first the field name, then the value repeated `samples_amount` times
        row = field_name + "\t" + "\t".join([value] * samples_amount) + "\n"
        cf.write(row)
        #print("row:\n", row)
        row_number += 1
        while row_number < len(soft_file) and soft_file[row_number].startswith(head_n):
            field_name = soft_file[row_number].split("=")[0].strip()
            field_name = field_name[1:]
            if (field_name == "Series_sample_id"):
                row_number += 1
                continue
            value = soft_file[row_number].split("=")[1].strip()  
            row = field_name + "\t" + "\t".join([value] * samples_amount)+"\n"
            #print("row:\n", row)
            cf.write(row)
            row_number += 1
            
    return row_number


In [221]:
def add_samples_rows2csv(soft_file, row_number, samples_amount, output_csv):
    
    samples_dict = {}
    while row_number < len(soft_file) and (soft_file[row_number].startswith("!Sample") or soft_file[row_number].startswith("^SAMPLE")):
    #while i < len(soft_file):
        # Split the line at the first '=' and strip any surrounding whitespace
        key, value = soft_file[row_number].split('=', 1)
        key = key.strip()[1:]
        value = value.strip()
        
        # Initialize the key in the dictionary if it doesn't exist
        if key not in samples_dict:
            samples_dict[key] = []
            
        if (len(samples_dict["SAMPLE"]) == len(samples_dict[key])) and (len(samples_dict["SAMPLE"]) > 0) and (key != "SAMPLE"):
            #print (samples_dict)
            samples_dict[key][-1] = samples_dict[key][-1] + ",, " + value
        else:
            # Add the value to the list corresponding to the key
            samples_dict[key].append(value)
        
        # Move to the next line
        row_number += 1
    
    with open(output_csv, 'a', newline='') as csvfile:
        # Create a CSV writer with tab delimiter
        csvwriter = csv.writer(csvfile, delimiter='\t')

        # Write each key and its corresponding values as a row
        for key, values in samples_dict.items():
            # Create a row with the key and its values
            row = [key] + values
            csvwriter.writerow(row)  
    return row_number

   
    

In [216]:
def transpose_csv(csv_file):
    # Read the CSV file
    table = pd.read_csv(csv_file, sep = "\t")
    #print("Table:\n",table)
    
    table_t = np.transpose(table)
    #print("Table_T:\n",table_t)
    
    # Write the transposed data to a new CSV file
    table_t.to_csv(csv_file, sep = "\t", header = True, index = False)
    
#default_input_file = "all_GSE224028_samples.csv"
#transpose_csv(default_input_file)

In [227]:
if __name__ == "__main__":
    
    if glob.glob("*.soft"):
        !rm *.soft
    
    # Set default input file path
    default_input_file = "/home/bcrlab/igguest/home/bcrlab/igguest/naama/OTS_Datasets/copy_output_ProjectsCodesAndCounts.txt"
    #input_file = sys.argv[1] if len(sys.argv) > 1 else default_input_file
    input_file = default_input_file #input()
    
    # Read the input file, skipping the header
    with open(input_file, 'r') as infile:
        next(infile)  # Skip the header line
        for line in infile:
            # Extract the GEO code from the 3rd column
            columns = line.strip().split()
            geo_code = columns[2] if len(columns) > 1 else None
            
            if (geo_code!="Null"):
                print(geo_code)
                samples_amount_inOTS = columns[3]
                # Construct the GEO URL and file name
                geo_url, file_name = construct_geo_url(geo_code)
                print(f"Downloading {geo_url}")

                # Download the file
                subprocess.run(["wget", geo_url], check=True)

                # Decompress the file
                subprocess.run(["gunzip", f"{file_name}.gz"], check=True)

                # Output CSV file
                output_csv = f"{geo_code}_collected_metadata.csv"

                # Initialize the CSV header
                with open(output_csv, 'w') as csvfile:
                    soft_file_open = open(file_name , 'r')
                    soft_file = soft_file_open.readlines()
                    
                    samples_amount =  sum(line.count('^SAMPLE') for line in soft_file)
                    print("\nSamples_amount",samples_amount, "\n______________________________")
                    row_number = 0
                    while (row_number < len(soft_file)):
                        line = soft_file[row_number]
                        head_l = line.split("=")[0].strip()
                        #print(head_l)
                        if ((head_l == "^DATABASE") | (head_l == "^SERIES") | (head_l == "^PLATFORM") ):
                            #print("IF RECOGNIZE ",head_l)
                            row_number = add_general_rows2csv(soft_file, head_l,row_number, samples_amount, output_csv)
                            
                        elif (head_l =="^SAMPLE"):
                            #print("IF RECOGNIZE ",head_l)
                            row_number = add_samples_rows2csv(soft_file,row_number, samples_amount, output_csv)

                        else:
                            print("The follow line is an unknown line in the soft file:\n", soft_file[row_number])
                            row_number+=1        
                
                    transpose_csv(output_csv)
                    os.remove(file_name)
                    
                

GSE205589
Downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE205nnn/GSE205589/soft/GSE205589_family.soft.gz

Samples_amount 23 
______________________________
GSE213486
Downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE213nnn/GSE213486/soft/GSE213486_family.soft.gz

Samples_amount 9 
______________________________
GSE114724
Downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE114nnn/GSE114724/soft/GSE114724_family.soft.gz

Samples_amount 10 
______________________________
GSE162086
Downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE162nnn/GSE162086/soft/GSE162086_family.soft.gz

Samples_amount 40 
______________________________
GSE230227
Downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE230nnn/GSE230227/soft/GSE230227_family.soft.gz

Samples_amount 33 
______________________________
GSE121637
Downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE121nnn/GSE121637/soft/GSE121637_family.soft.gz

Samples_amount 6 
______________________________
GSE176201
Downloading ht

IndexError: list index out of range