Author: Ningxin Kang (nik010@ucsd.edu)

Last Update: 10/13/2022

# Dataset Cleaning

In [2]:
import re

import numpy as np
import pandas as pd

In [None]:
# Each pattern skips a fixed number of leading underscore-terminated fields of
# a label and captures the field that follows. Raw strings are used so the
# regex syntax is not interpreted as (invalid) Python string escapes.
_CATEGORY_PATTERN = re.compile(r'^(?:[^_]*_){3}([^_]*)')
_MOUSE_ID_PATTERN = re.compile(r'^(?:[^_]*_){1}([^_]*)')
# The picture id capture also stops at the first '.', so anything from a dot
# onwards (e.g. a file extension) is excluded.
_PICTURE_ID_PATTERN = re.compile(r'^(?:[^_]*_){9}([^_.]*)')


def _extract_label_field(labels, pattern):
    '''Return the field captured by ``pattern`` for every label, in order.

    :param labels: Iterable of label strings
    :param pattern: Compiled regex whose group 1 is the wanted field

    :returns: List with one captured string per label
    :raises ValueError: If a label does not match ``pattern``
    '''
    fields = []
    for label in labels:
        match = pattern.search(label)
        if match is None:
            # Name the offending label instead of failing on ``None.group``.
            raise ValueError(
                "Label %r does not match the expected underscore-separated "
                "layout (pattern %r)" % (label, pattern.pattern)
            )
        fields.append(match.group(1))
    return fields


def _add_label_columns(df):
    '''Add 'category', 'mouse_id' and 'picture_id' columns parsed from 'Label'.

    The columns are appended to ``df`` in that order; ``df`` is modified in
    place and also returned.
    '''
    # Iterate over the label values themselves so the result does not depend
    # on the dataframe's index labels.
    labels = df['Label']

    # Treatment of each sample: the 4th underscore-separated field
    df['category'] = _extract_label_field(labels, _CATEGORY_PATTERN)
    # Id of the mouse: the 2nd underscore-separated field
    df['mouse_id'] = _extract_label_field(labels, _MOUSE_ID_PATTERN)
    # Id of the picture: the 10th underscore-separated field
    df['picture_id'] = _extract_label_field(labels, _PICTURE_ID_PATTERN)
    # Combine mouse id, category and picture id into one picture identifier
    df['picture_id'] = (df['mouse_id'] + '_' + df['category'] + '_'
                        + df['picture_id'])
    return df


def merging_info(dir_file_GFP, dir_file_DAPI,
                 area_filter_low, area_filter_high,
                 dir_GFP_output, dir_DAPI_output):
    '''Clean the GFP and DAPI tables and write each one to a CSV file.

    Both input tables are read as comma-separated files. Sample metadata
    (category, mouse id, picture id) is parsed from the underscore-separated
    'Label' column (called 'Slice' in the DAPI file). GFP rows are kept only
    when ``area_filter_low < Area < area_filter_high`` (both bounds
    exclusive). The DAPI output keeps the combined 'picture_id' but drops the
    'mouse_id', 'Label', 'Mode', 'IntDen' and 'category' columns.

    :param dir_file_GFP: Path of the GFP input file
    :param dir_file_DAPI: Path of the DAPI input file
    :param area_filter_low: Low threshold of Area (exclusive)
    :param area_filter_high: High threshold of Area (exclusive)
    :param dir_GFP_output: Path of the GFP dataframe output (CSV)
    :param dir_DAPI_output: Path of the DAPI dataframe output (CSV)

    :returns: The message "Finish Cleaning!!"
    :rtype: str
    :raises ValueError: If a label does not have the expected layout
    '''

    #########################
    # Importing the dataset #
    #########################
    df_GFP = pd.read_csv(dir_file_GFP)
    df_DAPI = pd.read_csv(dir_file_DAPI, sep=",")

    # Rename the column so that the two dataframes match
    df_DAPI = df_DAPI.rename(columns={'Slice': 'Label'})

    #########################
    # Formatting the df_GFP #
    #########################
    df_GFP = _add_label_columns(df_GFP)

    # Filter the aggregates by area (strictly between the two thresholds)
    area = df_GFP['Area']
    df_GFP = df_GFP[(area > area_filter_low) & (area < area_filter_high)]

    ##########################
    # Formatting the df_DAPI #
    ##########################
    df_DAPI = _add_label_columns(df_DAPI)
    df_DAPI = df_DAPI.drop(
        columns=['mouse_id', "Label", "Mode", "IntDen", "category"])

    #############
    # Exporting #
    #############
    df_GFP.to_csv(dir_GFP_output, sep=",", index=False)
    df_DAPI.to_csv(dir_DAPI_output, sep=",", index=False)

    return "Finish Cleaning!!"

In [86]:
# Area thresholds passed to merging_info for the GFP aggregates
area_filter_low = 0.416
area_filter_high = 150

# Input tables
dir_file_GFP = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/BHBDiet_allfiles/filepath.xls"
dir_file_DAPI = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/DAPI/filepath.xls"

# Destinations of the cleaned tables
dir_GFP_output = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/GFP.csv"
dir_DAPI_output = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/DAPI.csv"

# Keep the call as the last expression so the notebook echoes its return value
merging_info(
    dir_file_GFP=dir_file_GFP,
    dir_file_DAPI=dir_file_DAPI,
    area_filter_low=area_filter_low,
    area_filter_high=area_filter_high,
    dir_GFP_output=dir_GFP_output,
    dir_DAPI_output=dir_DAPI_output,
)



'Finish Cleaning!!'

In [None]:
# ``inf`` is taken from ``math`` (real-valued floats); ``cmath`` is the
# complex-number module and only happens to expose the same float constant.
from math import inf


# Input tables
dir_file_GFP = "/Users/sylvia618/Desktop/Chen Lab/HEK+RAB/input/individual_aggregates.csv"
dir_file_DAPI = "/Users/sylvia618/Desktop/Chen Lab/HEK+RAB/input/summary_DAPI.csv"

# Destinations of the cleaned tables
dir_GFP_output = "/Users/sylvia618/Desktop/Chen Lab/HEK+RAB/input/unfiltered_GFP.csv"
dir_DAPI_output = "/Users/sylvia618/Desktop/Chen Lab/HEK+RAB/input/unfiltered_DAPI.csv"

# Infinite thresholds: no finite Area value is excluded by the area filter
area_filter_low = -inf
area_filter_high = inf

merging_info(
    dir_file_GFP=dir_file_GFP,
    dir_file_DAPI=dir_file_DAPI,
    area_filter_low=area_filter_low,
    area_filter_high=area_filter_high,
    dir_GFP_output=dir_GFP_output,
    dir_DAPI_output=dir_DAPI_output,
)