Author: Ningxin Kang (nik010@ucsd.edu)

Last Update: 10/11/2022

# Feature extraction and formatting

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import xlrd
import re

In [42]:
# Locates the mouse id inside a picture id: the match starts at "LG", must
# contain at least two underscores, and ends on a "B" or "L" character.
_MOUSE_ID_PATTERN = re.compile(r'LG.*_.*_.*[BL]')

# Aggregation modes understood by info_extraction.
_VALID_MODES = ("count", "avg", "total")


def _mouse_id_from_picture_id(picture_id):
    """Return the mouse id embedded in ``picture_id``.

    Raises:
        ValueError: if ``picture_id`` does not match the expected pattern.
    """
    match = _MOUSE_ID_PATTERN.search(str(picture_id))
    if match is None:
        raise ValueError(
            "Cannot extract a mouse id from picture_id: " + repr(picture_id)
        )
    return match.group()


def _category_from_mouse_id(mouse_id):
    """Return the category, i.e. the text after the last underscore."""
    return mouse_id.split('_')[-1]


def _aggregate_per_picture(df_GFP, mode, feature):
    """Aggregate the GFP table to one row per picture.

    Returns a dataframe with the columns ``picture_id`` and ``feature``.
    """
    grouped = df_GFP.groupby('picture_id')
    # Only the column that is actually needed is aggregated, so text columns
    # in the input never reach mean()/sum().
    if mode == "count":
        # The number of non-null 'sample_index' entries is reported under
        # the caller-chosen feature name.
        values = grouped['sample_index'].count().rename(feature)
    elif mode == "avg":
        values = grouped[feature].mean()
    else:  # mode == "total"
        values = grouped[feature].sum()
    return values.reset_index()


def _mean_per_mouse(per_pic, value_column):
    """Average ``value_column`` over the pictures of each mouse.

    Returns a dataframe with the columns ``mouse_id``, ``value_column`` and
    ``category`` (in that order).
    """
    # Selecting the value column first keeps the string columns (category,
    # picture_id) out of mean(), which raises on them in pandas >= 2.0.
    per_mouse = per_pic.groupby('mouse_id')[value_column].mean().reset_index()
    per_mouse['category'] = [_category_from_mouse_id(mouse_id)
                             for mouse_id in per_mouse['mouse_id']]
    return per_mouse


def info_extraction(mode,feature,
                    GFP_dir_input, DAPI_dir_input,
                    per_pic_dir_output, per_mouse_dir_output,
                    norm_pic_dir_output,norm_mouse_dir_output):
    """Summarise one GFP feature per picture and per mouse, and write CSVs.

    Args:
        mode: "count" (number of 'sample_index' entries per picture),
            "avg" (mean of ``feature`` per picture) or
            "total" (sum of ``feature`` per picture).
        feature: column of the GFP table to aggregate; in "count" mode it is
            the name given to the count column.
        GFP_dir_input: path of the GFP CSV. It must contain 'picture_id' and
            either ``feature`` or, in "count" mode, 'sample_index'.
        DAPI_dir_input: path of the DAPI CSV with the columns 'picture_id'
            and 'Count'. Only read in "count" and "total" mode.
        per_pic_dir_output: path of the per-picture CSV to write.
        per_mouse_dir_output: path of the per-mouse CSV to write
            (mean of the per-picture values).
        norm_pic_dir_output: path of the per-picture CSV holding
            ``feature`` divided by the DAPI 'Count'; unused in "avg" mode.
        norm_mouse_dir_output: path of the per-mouse CSV holding the mean of
            the normalised values; unused in "avg" mode.

    Returns:
        A short status message naming the feature and the mode.

    Raises:
        ValueError: if ``mode`` is not one of the supported modes, or a
            picture id does not contain a recognisable mouse id.
    """
    # Fail early with a clear message instead of an unbound-variable error.
    if mode not in _VALID_MODES:
        raise ValueError(
            "Unknown mode: " + repr(mode)
            + "; expected one of " + ", ".join(_VALID_MODES)
        )

    df_GFP = pd.read_csv(GFP_dir_input, sep=',')

    per_pic = _aggregate_per_picture(df_GFP, mode, feature)

    # Derive the mouse id from the picture id and the category from the mouse id
    per_pic['mouse_id'] = [_mouse_id_from_picture_id(picture_id)
                           for picture_id in per_pic['picture_id']]
    per_pic['category'] = [_category_from_mouse_id(mouse_id)
                           for mouse_id in per_pic['mouse_id']]

    per_pic = per_pic.loc[:, ["category", "mouse_id", "picture_id", feature]]
    per_pic.to_csv(per_pic_dir_output, sep=",", index=False)

    # Mean of the per-picture values for each mouse
    per_mouse = _mean_per_mouse(per_pic, feature)
    per_mouse.to_csv(per_mouse_dir_output, sep=",", index=False)

    if mode == "total" or mode == "count":
        norm_feature = feature+'_vs_DAPI'

        # The DAPI table is only needed for the normalised outputs.
        df_DAPI = pd.read_csv(DAPI_dir_input, sep=',')

        # Merge on picture id. Only 'Count' is taken from the DAPI table so
        # that a DAPI column sharing the feature's name cannot trigger
        # pandas' _x/_y suffixing and hide the feature column.
        merge_per_pic = pd.merge(per_pic, df_DAPI[["picture_id", "Count"]],
                                 how="inner", on="picture_id")

        # Feature value per DAPI count
        merge_per_pic[norm_feature] = merge_per_pic[feature]/merge_per_pic['Count']
        merge_per_pic = merge_per_pic.loc[:, ["category", "mouse_id", "picture_id", norm_feature]]
        merge_per_pic.to_csv(norm_pic_dir_output, sep=",", index=False)

        # Mean of the normalised per-picture values for each mouse
        merge_per_mouse = _mean_per_mouse(merge_per_pic, norm_feature)
        merge_per_mouse.to_csv(norm_mouse_dir_output, sep=",", index=False)

    return("Finish generating dataframes regarding: " + feature +" with Mode: " + mode)




In [48]:
# ---------------------------------------------------------------
# 1. Total IntDen per picture / per mouse, plus total IntDen / DAPI
# ---------------------------------------------------------------
# Supported modes: "count", "avg", "total"
mode, feature = "total", "IntDen"

# Folder holding the input CSVs and folder receiving the results
dir_input = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/"
dir_output = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/v2/"

# Input tables
GFP_dir_input = f"{dir_input}GFP.csv"
DAPI_dir_input = f"{dir_input}DAPI.csv"

# Result tables: raw totals first, DAPI-normalised values second
per_pic_dir_output = f"{dir_output}total_IntDen_per_pic.csv"
per_mouse_dir_output = f"{dir_output}total_IntDen_per_mouse.csv"
norm_pic_dir_output = f"{dir_output}IntDen_vs_DAPI_per_pic.csv"
norm_mouse_dir_output = f"{dir_output}IntDen_vs_DAPI_per_mouse.csv"

info_extraction(
    mode=mode,
    feature=feature,
    GFP_dir_input=GFP_dir_input,
    DAPI_dir_input=DAPI_dir_input,
    per_pic_dir_output=per_pic_dir_output,
    per_mouse_dir_output=per_mouse_dir_output,
    norm_pic_dir_output=norm_pic_dir_output,
    norm_mouse_dir_output=norm_mouse_dir_output,
)

'Finish generating dataframes regarding: IntDen with Mode: total'

In [49]:
# -----------------------------------------------------------
# 2. Total area per picture / per mouse, plus total area / DAPI
# -----------------------------------------------------------
# Supported modes: "count", "avg", "total"
mode, feature = "total", "Area"

# Folder holding the input CSVs and folder receiving the results
dir_input = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/"
dir_output = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/v2/"

# Input tables
GFP_dir_input = f"{dir_input}GFP.csv"
DAPI_dir_input = f"{dir_input}DAPI.csv"

# Result tables: raw totals first, DAPI-normalised values second
per_pic_dir_output = f"{dir_output}total_area_per_pic.csv"
per_mouse_dir_output = f"{dir_output}total_area_per_mouse.csv"
norm_pic_dir_output = f"{dir_output}area_vs_DAPI_per_pic.csv"
norm_mouse_dir_output = f"{dir_output}area_vs_DAPI_per_mouse.csv"

info_extraction(
    mode=mode,
    feature=feature,
    GFP_dir_input=GFP_dir_input,
    DAPI_dir_input=DAPI_dir_input,
    per_pic_dir_output=per_pic_dir_output,
    per_mouse_dir_output=per_mouse_dir_output,
    norm_pic_dir_output=norm_pic_dir_output,
    norm_mouse_dir_output=norm_mouse_dir_output,
)

'Finish generating dataframes regarding: Area with Mode: total'

In [50]:
# ----------------------------------------------------------------
# 3. Number of aggregates per picture / per mouse, plus count / DAPI
# ----------------------------------------------------------------
# Supported modes: "count", "avg", "total"
mode, feature = "count", "num_aggregates"

# Folder holding the input CSVs and folder receiving the results
dir_input = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/"
dir_output = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/v2/"

# Input tables
GFP_dir_input = f"{dir_input}GFP.csv"
DAPI_dir_input = f"{dir_input}DAPI.csv"

# Result tables: raw counts first, DAPI-normalised values second
per_pic_dir_output = f"{dir_output}agg_per_pic.csv"
per_mouse_dir_output = f"{dir_output}agg_per_mouse.csv"
norm_pic_dir_output = f"{dir_output}agg_vs_DAPI_per_pic.csv"
norm_mouse_dir_output = f"{dir_output}agg_vs_DAPI_per_mouse.csv"

info_extraction(
    mode=mode,
    feature=feature,
    GFP_dir_input=GFP_dir_input,
    DAPI_dir_input=DAPI_dir_input,
    per_pic_dir_output=per_pic_dir_output,
    per_mouse_dir_output=per_mouse_dir_output,
    norm_pic_dir_output=norm_pic_dir_output,
    norm_mouse_dir_output=norm_mouse_dir_output,
)

'Finish generating dataframes regarding: num_aggregates with Mode: count'

In [51]:
# ------------------------------------------
# 4. Average area per picture / per mouse
# ------------------------------------------
# Supported modes: "count", "avg", "total"
mode, feature = "avg", "Area"

# Folder holding the input CSVs and folder receiving the results
dir_input = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/"
dir_output = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/v2/"

# Input tables
GFP_dir_input = f"{dir_input}GFP.csv"
DAPI_dir_input = f"{dir_input}DAPI.csv"

# Result tables; no DAPI-normalised files are requested for this run
per_pic_dir_output = f"{dir_output}avg_area_per_pic.csv"
per_mouse_dir_output = f"{dir_output}avg_area_per_mouse.csv"
norm_pic_dir_output = norm_mouse_dir_output = None

info_extraction(
    mode=mode,
    feature=feature,
    GFP_dir_input=GFP_dir_input,
    DAPI_dir_input=DAPI_dir_input,
    per_pic_dir_output=per_pic_dir_output,
    per_mouse_dir_output=per_mouse_dir_output,
    norm_pic_dir_output=norm_pic_dir_output,
    norm_mouse_dir_output=norm_mouse_dir_output,
)

'Finish generating dataframes regarding: Area with Mode: avg'

In [52]:
# --------------------------------------------
# 5. Average IntDen per picture / per mouse
# --------------------------------------------
# Supported modes: "count", "avg", "total"
mode, feature = "avg", "IntDen"

# Folder holding the input CSVs and folder receiving the results
dir_input = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/"
dir_output = "/Users/sylvia618/Desktop/Chen Lab/hek cells/input/v2/"

# Input tables
GFP_dir_input = f"{dir_input}GFP.csv"
DAPI_dir_input = f"{dir_input}DAPI.csv"

# Result tables; no DAPI-normalised files are requested for this run
per_pic_dir_output = f"{dir_output}avg_IntDen_per_pic.csv"
per_mouse_dir_output = f"{dir_output}avg_IntDen_per_mouse.csv"
norm_pic_dir_output = norm_mouse_dir_output = None

info_extraction(
    mode=mode,
    feature=feature,
    GFP_dir_input=GFP_dir_input,
    DAPI_dir_input=DAPI_dir_input,
    per_pic_dir_output=per_pic_dir_output,
    per_mouse_dir_output=per_mouse_dir_output,
    norm_pic_dir_output=norm_pic_dir_output,
    norm_mouse_dir_output=norm_mouse_dir_output,
)

'Finish generating dataframes regarding: IntDen with Mode: avg'