In [282]:
import numpy as np
import pandas as pd
import gzip
import csv
import io
import argparse

import subprocess
import multiprocessing
import time

In [308]:
def map_goa_to_cafa_ids(file_path, mapping_file, primary_id_column, out_path, chunk_size=100000):
    """
    Keep the rows of an annotation file whose primary ID occurs in a mapping
    file, attach the mapped CAFA4 ID, and write the result to disk.

    The input file is read in chunks so that only the rows that can be mapped
    are kept in memory. Exact duplicate rows are removed, and the result does
    not depend on where the chunk boundaries fall.

    Parameters:
    - file_path: Path to the tab-separated input file (first row is the header).
    - mapping_file: Path to the comma-separated mapping file. It must have a
      header row and exactly two columns: the primary ID, then the CAFA4 ID.
    - primary_id_column: Zero-based position of the primary ID column in the
      input file (not in the mapping file).
    - out_path: Path to the tab-separated output file. It receives the columns
      "CAFA4_ID", "GO ID" and "Aspect".
    - chunk_size: Number of rows per chunk. Defaults to 100,000 lines.

    Raises:
    - KeyError: if the input file has no "GO ID" or "Aspect" column.
    """
    # Read the mapping file and give its two columns fixed names
    mapping_df = pd.read_csv(mapping_file, sep=",", header=0)
    mapping_df.columns = ["DB Object ID", "CAFA4_ID"]

    # A set gives constant-time membership tests while filtering the chunks
    id_set = set(mapping_df["DB Object ID"])

    dfs = []
    id_column_name = None

    for chunk in pd.read_csv(file_path, chunksize=chunk_size, sep="\t"):
        if id_column_name is None:
            # Remember the header name of the ID column so that the merge below
            # joins on the same column the filter uses.
            id_column_name = chunk.columns[primary_id_column]

        # Keep only the rows whose primary ID is present in the mapping file
        filtered_chunk = chunk[chunk.iloc[:, primary_id_column].isin(id_set)]
        # Dropping duplicates per chunk keeps the memory footprint small
        dfs.append(filtered_chunk.drop_duplicates())

    # Per-chunk de-duplication misses duplicates that straddle chunk
    # boundaries, so de-duplicate once more on the concatenated frame.
    df = pd.concat(dfs, ignore_index=True).drop_duplicates()

    df_mapped = pd.merge(
        df,
        mapping_df,
        left_on=id_column_name,
        right_on="DB Object ID",
        how="inner",
    )
    print("Rows in the mapped file : ", len(df_mapped))

    df_mapped = df_mapped.loc[:, ["CAFA4_ID", "GO ID", "Aspect"]].copy()

    # Write the final dataframe to the output file
    df_mapped.to_csv(out_path, index=False, sep="\t")



In [309]:
def get_preprocess_cmd(gaf_path, out_path):
    """
    Build the argument list that runs preprocess_gaf.py on one GAF file.

    Parameters:
    - gaf_path: Path to the input GAF file, passed as the script's positional argument.
    - out_path: Path passed to the script's --out_path option.

    Returns:
    - A list of strings suitable for subprocess.run.
    """
    cmd = [
        "python3",                 # Command to execute Python 3
        "preprocess_gaf.py",       # Script to run
        gaf_path,                  # Path to input file (the argument, not a global)
        "--highTP",
        "--out_path", out_path,    # Output path parameter
        #"--evidence_codes", "EXP", "IDA",   # Evidence codes parameter
        #"--extract_col_list", "DB Object ID", "Qualifier"  # Extract column list parameter
    ]
    return cmd

def run_process(command):
    """
    Run a command and wait for it to finish.

    Parameters:
    - command: Argument list (or string) passed to subprocess.run unchanged.

    Returns:
    - The subprocess.CompletedProcess for the run, so that the caller can
      inspect returncode. A non-zero exit status does not raise.
    """
    return subprocess.run(command)
    
def make_bl_lists():
    """
    Placeholder that has not been implemented yet.

    The definition originally had no body, which is a syntax error and stops
    the whole cell from running.
    NOTE(review): the intended behaviour of "bl lists" is not shown anywhere
    in this file - confirm before implementing.

    Raises:
    - NotImplementedError: always.
    """
    raise NotImplementedError("make_bl_lists is not implemented yet")

# Driver: builds the preprocessing commands for two GAF files and maps the
# preprocessed t0 annotations to CAFA4 IDs. All paths live under work_dir.
# NOTE(review): "t0"/"t1" presumably denote two GOA snapshots - confirm.
if __name__ == "__main__":
    # Define commands and log file names
    work_dir = "/data/rashika/CAFA4/"
    
    #t0_gaf_file = work_dir + "uniprot/raw_goa/sample_t0.gz"
    t0_gaf_file = work_dir + "uniprot/raw_goa/t0/goa_uniprot_all.gaf.195.gz"
    t0_processed = work_dir + "extracted_goa/t0_preprocessed.csv"
    # log_t0 is defined but not used anywhere in this block
    log_t0 =  work_dir + "log/log_preprocess_t0.txt"
    
    t1_gaf_file = work_dir + "uniprot/raw_goa/t1/goa_uniprot_all.gaf.gz"
    t1_processed = work_dir + "extracted_goa/t1_preprocessed.csv"
    # log_t1 is defined but not used anywhere in this block
    log_t1 = work_dir + "log/log_preprocess_t1.txt"
    
    
    # The commands are only built here; they run only if the calls below are uncommented
    cmd_preprocess_t0 = get_preprocess_cmd(t0_gaf_file, t0_processed)
    cmd_preprocess_t1 = get_preprocess_cmd(t1_gaf_file, t1_processed)
    
    # Preprocess both files, UNCOMMENT when needed
    #run_process(cmd_preprocess_t0)
    #run_process(cmd_preprocess_t1)
    
    
    # Map the primary IDs of the preprocessed files to CAFA4 IDs.
    # The third argument (0) is the position of the ID column in the preprocessed file.
    # Only t0 is mapped at the moment; the t1 call is disabled.
    mapping_file = "/data/rashika/CAFA4/CAFA4_gt/Target_Entry_map.csv"
    t0_mapped = work_dir + "mapped/t0_mapped.csv"
    t1_mapped = work_dir + "mapped/t1_mapped.csv"
    map_goa_to_cafa_ids(t0_processed, mapping_file, 0, t0_mapped, chunk_size=100000)
    #map_goa_to_cafa_ids(t1_processed, mapping_file, 0, t1_mapped, chunk_size=100000)
    
    

Rows in the mapped file :  449581
