In [2]:
import pandas as pd
import ast
import csv


In [3]:
# Load the two assay exports into separate DataFrames.
df1 = pd.read_csv(filepath_or_buffer='screening.csv')      # screening assay data
df2 = pd.read_csv(filepath_or_buffer='confirmatory.csv')   # confirmatory assay data

# Stack the screening rows on top of the confirmatory rows (each frame keeps
# its own row labels), then write the result to disk without the index column.
combined_df = pd.concat(objs=[df1, df2], axis=0)
combined_df.to_csv(path_or_buf='combined.csv', index=False)

# Cleaning the data

In [13]:
# Check whether a single 'Potential Target' cell holds the text of a Python list
def is_valid_target_list(target_list_str):
    """
    Report whether the input is a string that parses to a Python list.

    Only the outermost container is checked; the elements of the list are
    not inspected.

    target_list_str: value of one 'Potential Target' cell, expected to be the
        text of a list literal

    return: a single boolean - True if the value is a string that
        ast.literal_eval turns into a list, otherwise False
    """
    # A non-string cell (for example a missing value) cannot hold a list literal.
    if not isinstance(target_list_str, str):
        return False
    try:
        target_list = ast.literal_eval(target_list_str)
    except (ValueError, SyntaxError, TypeError, RecursionError):
        # ValueError / SyntaxError: the text is not a valid Python literal.
        # TypeError: the literal cannot be built, e.g. a list used as a dict key.
        # RecursionError: the literal is nested too deeply to evaluate.
        return False
    return isinstance(target_list, list)

def transform_string(s):
    """
    Normalise a string: upper-case it and drop every space and square bracket.

    s: string to be transformed
    return: transformed string
    """
    # A single translate pass deletes ' ', '[' and ']' wherever they occur.
    unwanted = str.maketrans('', '', ' []')
    return s.upper().translate(unwanted)


In [73]:
# Find the rows of the combined screening and confirmatory data whose
# 'Potential Target' cell does NOT parse as a list. is_valid_target_list
# returns True for good cells, so the mask is negated to keep the bad ones.
is_valid_target = combined_df['Potential Target'].apply(is_valid_target_list).astype(bool)
invalid_target_rows = combined_df[~is_valid_target]

# Save the invalid target rows to a new CSV file so they can be fixed manually
invalid_target_rows.to_csv('invalid_target_rows.csv', index=False)


In [None]:
# After fixing the previously invalid rows, every target string can be cleaned.

# Load combined.csv as a list of dictionaries, one dictionary per CSV row.
with open('combined.csv', newline='') as csvfile:
    data = list(csv.DictReader(csvfile))

# Swap each row's raw 'Potential Target' text for a list of cleaned names:
# the text is cut at every comma and each piece goes through transform_string.
for row in data:
    row['Potential Target'] = [
        transform_string(piece) for piece in row['Potential Target'].split(',')
    ]

# Write the cleaned rows, together with the columns that were not changed, to
# a new CSV file. The csv module writes the list value using its str() form.
fieldnames = ['AID', 'Potential Target', 'source_name','type']
with open('edited_target_data.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

# Grouping Assays based on Target Names and Source

In [74]:

def find_common_strings(string1, string2):
    """Return the strings that appear in both input lists.

    string1: first list of strings to compare
    string2: second list of strings to compare

    return: list holding each shared string once (order is not guaranteed)
    """
    first_names = set(string1)
    second_names = set(string2)
    shared = first_names.intersection(second_names)
    return list(shared)

# Group the assay rows by their 'source_name' value
grouped_data = {}  # source name -> list of the rows that came from that source
for row in data:
    grouped_data.setdefault(row['source_name'], []).append(row)


def _target_names(row):
    """Return the non-empty target names of one assay row as a list."""
    targets = row['Potential Target']
    # The cleaning step stores a list of names in this column, while a row that
    # still holds the raw CSV text has a comma-separated string; accept both.
    if isinstance(targets, str):
        targets = targets.split(',')
    # An empty name means "no target", which is not something two assays share.
    return [name for name in targets if name]


# Within each source, compare every pair of rows and keep both rows of a pair
# as soon as they share at least one target name. A row can be appended many
# times here; the duplicates are removed in the next step.
result_data = []
for source, rows in grouped_data.items():
    # Extract each row's names once instead of once per pair.
    names_per_row = [_target_names(row) for row in rows]
    for i, row1 in enumerate(rows):
        for j in range(i + 1, len(rows)):
            common_names = find_common_strings(names_per_row[i], names_per_row[j])
            if common_names:
                result_data.append(row1)
                result_data.append(rows[j])

# Remove duplicate records: keep the first row seen for each AID, in order of
# first appearance. The dict gives constant-time lookups and keeps that order.
unique_rows_by_id = {}
for row in result_data:
    unique_rows_by_id.setdefault(row['AID'], row)
unique_result_data = list(unique_rows_by_id.values())  # rows without duplicates
seen_ids = list(unique_rows_by_id)  # AIDs present in unique_result_data

# Write the grouped and cleaned data to a new CSV file
with open('final_grouped_data_edited.csv', 'w', newline='') as csvfile:
    fieldnames = ['AID', 'Potential Target', 'source_name','type']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(unique_result_data)



# Find the data that was removed at this filtering stage

In [75]:
# Collect the AIDs of the full data set and of the grouped result; the set
# difference is every AID that the grouping step dropped.
all_ids = set(row['AID'] for row in data)
final_result_ids = set(row['AID'] for row in unique_result_data)
not_in_final_result_ids = all_ids.difference(final_result_ids)

# Keep, in their original order, the rows whose AID was dropped.
not_in_final_result_data = list(
    filter(lambda row: row['AID'] in not_in_final_result_ids, data)
)

# Write the dropped rows to their own CSV file.
fieldnames = ['AID', 'Potential Target', 'source_name','type']
with open('not_in_final_result_data2.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(not_in_final_result_data)