In [5]:
import numpy as np
import pandas as pd
import gzip
import csv

In [None]:
def extract_annot(goa_file, n_skip = 9,out_file="extracted.csv", col_list=(1, 4)):
    """
    Extract selected columns from a gzipped, tab-separated GOA file.

    The first ``n_skip`` lines of the input are discarded. Every remaining
    non-blank line is split on tabs and the fields at the positions listed in
    ``col_list`` are written, tab-separated and without a header row, to
    ``out_file``.

    Parameters:
    - goa_file: Path of the gzip-compressed input file.
    - n_skip: Number of leading lines to discard. Default value 9.
      If the file has fewer lines than this, the output is simply empty.
    - out_file: (Optional) The output file name. Defaults to 'extracted.csv'.
    - col_list: 0-based column positions to extract, in output order,
      e.g. [1, 5]. Defaults to (1, 4).

    Raises:
    - IndexError: if a non-blank row has too few fields for ``col_list``.
    """
    columns = list(col_list)

    with gzip.open(goa_file, 'rt') as f:
        # Discard the leading lines; stop quietly at EOF so a short file does
        # not leak a StopIteration out of this function.
        for _ in range(n_skip):
            if next(f, None) is None:
                break

        # NOTE(review): the csv default dialect treats '"' as a quote character;
        # confirm no input field starts with a double quote, or rows may merge.
        reader = csv.reader(f, delimiter='\t')

        # newline='' lets the csv writer control line endings itself, as the
        # csv module requires for file objects it writes to.
        with open(out_file, 'w', newline='') as outfile:
            writer = csv.writer(outfile, delimiter='\t')

            for row in reader:
                # csv.reader yields [] for blank lines; there is nothing to extract.
                if not row:
                    continue
                try:
                    extracted_columns = [row[i] for i in columns]
                except IndexError:
                    raise IndexError(
                        f"row {reader.line_num} (after skipping {n_skip} lines) has only "
                        f"{len(row)} fields; cannot extract columns {columns}"
                    ) from None
                writer.writerow(extracted_columns)


### Extract Entry ID, GO annotation, and ontology type from the GAF files

In [None]:
# Locations of the t0 GOA file (directory goa_2020_Jan_03) and of its extracted columns
t0_out_dir = '/data/rashika/CAFA4/uniprot/goa_2020_Jan_03/'
t0_input_file = f'{t0_out_dir}goa_uniprot_all.gaf.gz'
t0_output_file = f'{t0_out_dir}extracted_columns.tsv'

In [None]:
# 0-based column positions to pull from the t0 file; per the section heading
# these hold the Entry ID, the GO annotation, and the ontology type.
t0_col_list = [1,4, 8]
# Number of leading lines extract_annot discards for this file
# (presumably the file's comment header; confirm the count against the file).
n_skip = 8

# Extraction call kept commented out; uncomment to (re)write t0_output_file.
#extract_annot(t0_input_file, n_skip, t0_output_file, t0_col_list)

In [None]:
# Locations of the t1 GOA file (directory goa_2024-02-09) and of its extracted columns
t1_out_dir = '/data/rashika/CAFA4/uniprot/goa_2024-02-09/'
t1_input_file = f'{t1_out_dir}goa_uniprot_all.gaf.gz'
t1_output_file = f'{t1_out_dir}extracted_columns.tsv'

In [None]:
# 0-based column positions to pull from the t1 file. Further down, the five
# resulting columns are named Entry, edge, term, E_code and aspect.
t1_col_list  = [1,3,4,6,8]
# Number of leading lines extract_annot discards for this file
# (presumably the file's comment header; confirm the count against the file).
n_skip = 9
# Extraction call kept commented out; uncomment to (re)write t1_output_file.
#extract_annot(t1_input_file, n_skip, t1_output_file, t1_col_list)

In [None]:
## Annotation file used by Shawn, and where the columns extracted from it are written
shawn_t0_dir = '/data/yisupeng/sharing/cafa4/'
in_file = f'{shawn_t0_dir}goa_uniprot_all_02142020.gaf.gz'
shawn_out_file = '/data/rashika/CAFA4/uniprot/shawn_extracted_columns.tsv'

In [None]:
# 0-based column positions to pull from Shawn's file. Further down, the four
# resulting columns are named Entry, term, E_code and aspect.
col_list  = [1,4, 6, 8]
# Number of leading lines extract_annot discards for this file
# (presumably the file's comment header; confirm the count against the file).
n_skip = 8
# Extraction call kept commented out; uncomment to (re)write shawn_out_file.
#extract_annot(in_file, n_skip, shawn_out_file, col_list)

### Map the extracted annotations to the CAFA targets (by Entry ID)

In [None]:
def map_goa_to_cafa_ids(file_path, mapping_file, primary_id_column, out_path, chunk_size=100000):
    """
    Keep only the annotation rows whose primary ID appears in a mapping file.

    The tab-separated annotation file is read in chunks as header-less data
    (the format extract_annot writes). Rows whose ID is not listed in the
    mapping file are discarded, exact duplicate rows are removed, and the
    remaining rows are written tab-separated with no header and no index.

    Parameters:
    - file_path: Path to the tab-separated annotation file (no header row).
    - mapping_file: Path to the comma-separated mapping file. Its first line is
      treated as a header and it must have exactly two columns, which are
      relabelled "Entry" and "CAFA4_ID".
    - primary_id_column: 0-based position of the primary-ID column in the
      annotation file.
    - out_path: Path to the output file.
    - chunk_size: Number of rows per chunk. Defaults to 100,000.
    """
    # NOTE(review): header=0 discards the first line of mapping_file; confirm
    # that the file really starts with a header row.
    mapping_df = pd.read_csv(mapping_file, sep = ",", header = 0)
    mapping_df.columns = ["Entry", "CAFA4_ID"]

    # A set gives constant-time membership checks for isin
    id_set = set(mapping_df["Entry"])

    kept_chunks = []

    # header=None: the first line of the annotation file is data. Without it the
    # first row would bypass the ID filter and be written back as a header.
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, sep = "\t", header=None):
        is_mapped = chunk.iloc[:, primary_id_column].isin(id_set)
        # De-duplicating per chunk keeps the accumulated list small
        kept_chunks.append(chunk[is_mapped].drop_duplicates())

    # Identical rows can land in different chunks, so de-duplicate once more
    # after concatenation.
    df = pd.concat(kept_chunks, ignore_index=True).drop_duplicates()

    # No header is written, so the output can be read back with header=None
    df.to_csv(out_path, index=False, header=False, sep = "\t")

# Example usage: map_goa_to_cafa_ids(t1_output_file, Mapping_file, 0, t1_mapped_ann)


In [3]:
# Mapping file whose two columns are labelled Entry and CAFA4_ID below
Mapping_file = "/data/rashika/CAFA4/CAFA4_gt/Target_Entry_map.csv"

# Later cells display Mapping_df and merge on it, so it has to be defined here
# for the notebook to run top to bottom on a fresh kernel.
# NOTE(review): read with header=None, whereas map_goa_to_cafa_ids reads this
# same file with header=0. Confirm whether the file has a header row.
# header=None is the loss-free choice: a header line would only add one row
# that is unlikely to match any real Entry.
Mapping_df = pd.read_csv(Mapping_file,  sep = ',', header = None)
Mapping_df.columns = ["Entry", "CAFA4_ID"]

# Output paths for the ID-filtered annotation tables
t1_mapped_ann = "/data/rashika/CAFA4/CAFA4_gt/t1_ann.csv"
t0_mapped_ann = "/data/rashika/CAFA4/CAFA4_gt/t0_ann.csv"
shawn_t0_mapped_ann = "/data/rashika/CAFA4/CAFA4_gt/shawn_t0_ann.csv"

In [None]:
# The path gets its own name so this cell can be re-run: reusing
# Clara_Entry_IDs for both the path string and the loaded DataFrame would make
# a second run pass a DataFrame to read_csv.
Clara_Entry_IDs_file = "/data/rashika/CAFA4/CAFA4_gt/Entry.csv"
Clara_Entry_IDs = pd.read_csv(Clara_Entry_IDs_file,  sep = '\t', header = None)

In [None]:
# Preview only the first rows rather than rendering the whole mapping table
Mapping_df.head()

In [None]:
# Map t1 annotations
#map_goa_to_cafa_ids(t1_output_file, Mapping_file, 0, t1_mapped_ann )

In [None]:
# Map t0 annotations
#map_goa_to_cafa_ids(t0_output_file, Mapping_file, 0, t0_mapped_ann )

In [None]:
# Map Shawn's annotations
#map_goa_to_cafa_ids(shawn_out_file, Mapping_file, 0, shawn_t0_mapped_ann )

In [1]:
# Evidence codes kept when filtering annotations; see
# https://geneontology.org/docs/guide-go-evidence-codes/
# Earlier, narrower selection: ['EXP', 'IDA', 'IMP', 'IGI', 'IEP', 'TAS', 'IC']
Evidence_codes = [
    'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP',  # experimental evidence
    'TAS', 'IC',                               # author statement / curator inference
    'HTP', 'HDA', 'HMP', 'HGI', 'HEP',         # high-throughput experimental evidence
]


In [6]:
# Load the ID-filtered t1 annotations; header=None treats every line as data
t1 = pd.read_csv(t1_mapped_ann,  sep = '\t', header = None)
# Label the five columns (extracted with t1_col_list = [1,3,4,6,8])
t1.columns = ['Entry', 'edge', 'term', "E_code", "aspect"]


# TODO

# Write a function to do this (presumably the load-and-label step repeated for each annotation file)

In [7]:
# Sorted distinct evidence codes present in t1
np.unique(t1["E_code"])

array(['EXP', 'HDA', 'HEP', 'HGI', 'HMP', 'HTP', 'IBA', 'IC', 'IDA',
       'IEA', 'IEP', 'IGC', 'IGI', 'IKR', 'IMP', 'IPI', 'ISA', 'ISM',
       'ISO', 'ISS', 'NAS', 'ND', 'RCA', 'TAS'], dtype=object)

In [9]:
# Sorted distinct qualifier ("edge") values present in t1
np.unique(t1["edge"])

array(['NOT|acts_upstream_of', 'NOT|acts_upstream_of_or_within',
       'NOT|acts_upstream_of_or_within_negative_effect',
       'NOT|acts_upstream_of_or_within_positive_effect',
       'NOT|colocalizes_with', 'NOT|contributes_to', 'NOT|enables',
       'NOT|involved_in', 'NOT|is_active_in', 'NOT|located_in',
       'NOT|part_of', 'acts_upstream_of',
       'acts_upstream_of_negative_effect', 'acts_upstream_of_or_within',
       'acts_upstream_of_or_within_negative_effect',
       'acts_upstream_of_or_within_positive_effect',
       'acts_upstream_of_positive_effect', 'colocalizes_with',
       'contributes_to', 'enables', 'involved_in', 'is_active_in',
       'located_in', 'part_of'], dtype=object)

In [12]:
# Fraction of t1 rows whose qualifier contains "NOT" (literal substring match,
# computed with pandas string methods instead of a Python-level loop)
t1["edge"].str.contains("NOT", regex=False).mean()

0.0024765697327871557

In [None]:
# Keep only the rows whose evidence code is listed in Evidence_codes
t1 = t1.loc[t1["E_code"].isin(Evidence_codes)].copy()

In [None]:
# Load the ID-filtered annotations from Shawn's file (every line is data)
shawn_t0 = pd.read_csv(shawn_t0_mapped_ann, sep='\t', header=None)
# Label the four columns (extracted with col_list = [1,4, 6, 8])
shawn_t0.columns = ['Entry', 'term', 'E_code', 'aspect']
# Keep only the rows whose evidence code is listed in Evidence_codes
shawn_t0 = shawn_t0.loc[shawn_t0["E_code"].isin(Evidence_codes)].copy()

In [None]:
# Number of distinct entries left in shawn_t0 after the evidence-code filter
np.unique(shawn_t0['Entry']).size

In [None]:
# Number of distinct entries left in t1 after the evidence-code filter
np.unique(t1['Entry']).size

In [None]:
# Attach the CAFA4 ID to every t1 annotation (inner join on Entry) and keep
# only the columns that are written out
t1_mapped = t1.merge(Mapping_df, on='Entry', how='inner')[["CAFA4_ID", "term", "aspect", "edge"]]
# Tab-separated, without header or index
t1_mapped.to_csv('/data/rashika/CAFA4/CAFA4_gt/t1_mapped.csv', sep="\t", index=False, header=False)

In [None]:
# Attach the CAFA4 ID to every shawn_t0 annotation (inner join on Entry) and
# keep only the columns that are written out
t0_mapped = shawn_t0.merge(Mapping_df, on='Entry', how='inner')[["CAFA4_ID", "term", "aspect"]]
# Tab-separated, without header or index
t0_mapped.to_csv('/data/rashika/CAFA4/CAFA4_gt/t0_mapped.csv', sep="\t", index=False, header=False)