# Imputation Code

Here, we walk through the imputation code to show how cell morphology can be leveraged to impute protein biomarker expression.

In [None]:
# pandas is imported in this cell as well because the cell runs before the
# notebook's import cell; without it, a fresh-kernel "Run All" fails here with
# NameError on `pd`.
import pandas as pd

# Read the raw morphological table into a DataFrame.
# Replace the placeholder string with the real path to the CSV file.
data=pd.read_csv('\\path to raw morphological csv\\')

In [None]:
#import libraries                                                                             
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

# Construct PCs from the morphological dataset

In [None]:
# Load the names of the morphological parameters used to build the PCs.
# NOTE(review): the original cell left these two lines commented out and read
# the names from an undefined `communalities_sort`; this assumes the filtered
# parameter sheet itself carries a 'Parameter' column holding column names of
# `data` -- confirm against the actual file.
Parameter_Excel=pd.read_excel('\\path to filtered morphological parameters csv\\')
morpho_params=Parameter_Excel['Parameter'].to_list()

n_pcs=10 # start with ten PCs
pc_columns=[f'PC{i}' for i in range(1,n_pcs+1)] # 'PC1' ... 'PC10', always in step with n_pcs

# Keep only the morphological parameters of interest; these columns are the PCA input.
sorted_dataframe=data[morpho_params]

pca=PCA(n_components=n_pcs, random_state=42)
pca.fit(sorted_dataframe) # fit the model to the selected parameters
pca_values=pca.transform(sorted_dataframe) # project every row into PC space

# Fraction of variance explained by each PC, reused later as per-PC weights.
# The aim is to keep PCs that cumulatively explain 95% of the variance; this
# cell does not truncate, so check np.cumsum(weights) when choosing n_pcs.
weights=np.array(pca.explained_variance_ratio_)

# Build the PC table on the same index as `data` so the column-wise concat aligns row for row.
PC_DF=pd.DataFrame(pca_values, columns=pc_columns, index=data.index)

# Drop PC columns left over from an earlier run of this cell so that re-running
# it does not append duplicate 'PC*' columns.
data=pd.concat([data.drop(columns=pc_columns, errors='ignore'),PC_DF],axis=1)

# Imputation Functions

Below is a collection of functions that impute the expression of a biomarker on a per-cell basis.

In [None]:
def weighted_averaging_biomarker_values(distances,biomarkers):
    """Inverse-distance weighted average of neighbor biomarker values.

    Each neighbor is weighted proportionally to ``1 / distance`` and the
    weights are normalized to sum to one, so closer neighbors contribute more.

    Parameters
    ----------
    distances : array-like of float
        Distance from the cell of interest to each neighbor.
    biomarkers : array-like of float
        Biomarker value of each neighbor, in the same order as ``distances``.

    Returns
    -------
    float
        The weighted average. If one or more distances are exactly zero, the
        plain mean of the biomarker values at those zero-distance neighbors is
        returned (the limit of inverse-distance weighting); the unguarded
        formula would divide by zero and yield NaN.
    """
    distances=np.asarray(distances,dtype=float)
    biomarkers=np.asarray(biomarkers,dtype=float)

    # A zero distance would get an infinite weight; resolve it explicitly.
    coincident=distances==0
    if coincident.any():
        return np.mean(biomarkers[coincident])

    inverse_distances=1.0/distances
    biomarker_weights=inverse_distances/np.sum(inverse_distances)
    return np.sum(biomarker_weights*biomarkers)

In [None]:
def custom_distance(x,y, weights):
    """Weighted Euclidean distance between two points.

    Computes ``sqrt(sum(weights * (x - y)**2))``, i.e. every coordinate's
    squared difference is scaled by its own weight before summing.

    Parameters
    ----------
    x, y : numpy.ndarray
        The two points to compare (PC coordinates, per the notebook's usage).
    weights : numpy.ndarray
        One weight per coordinate.

    Returns
    -------
    float
        The weighted distance between ``x`` and ``y``.
    """
    difference=x-y
    weighted_squares=weights*difference*difference
    return np.sqrt(weighted_squares.sum())

In [None]:
def biomarker_imputer(overall_dataframe, cell_ID, biomarker,batch_corrected_value,weights,average=True,n_neighbors=21):
    """Impute one biomarker value for a single cell from its nearest neighbors.

    The reference set is narrowed in three steps: rows whose
    'Cell Treatment Condition' contains the cell's line prefix (the text before
    the first '_'), rows with the same 'KMEANS' value as the cell, and rows
    whose 'Primary Biomarker Stained' equals ``biomarker``. The cell itself is
    then appended to that set, ``knnearest_indices`` is run on it, and the
    ``batch_corrected_value`` column of the returned neighbors is averaged.

    Parameters
    ----------
    overall_dataframe : pandas.DataFrame
        Table indexed by cell ID containing the columns named above plus
        ``batch_corrected_value``.
    cell_ID : hashable
        Index label of the cell to impute.
    biomarker : str
        Biomarker to impute; matched against 'Primary Biomarker Stained'.
    batch_corrected_value : str
        Name of the column holding the batch-corrected biomarker values.
    weights : array-like
        Passed unchanged to ``knnearest_indices``.
    average : bool, default True
        True -> plain mean of the neighbor values; False -> inverse-distance
        weighted mean via ``weighted_averaging_biomarker_values``.
    n_neighbors : int, default 21
        Neighbor count passed to ``knnearest_indices``.

    Returns
    -------
    float
        The imputed biomarker value.

    Raises
    ------
    ValueError
        If no reference cell remains after filtering.
    """
    # Row describing the cell we want to impute.
    cell_row=overall_dataframe.loc[cell_ID]

    # Line prefix of the treatment label, e.g. 'Young_ATV' -> 'Young'.
    cell_line=cell_row['Cell Treatment Condition'].split('_')[0]

    # 1) keep cells from the same line
    same_line=overall_dataframe[overall_dataframe['Cell Treatment Condition'].str.contains(cell_line)]

    # 2) keep cells sharing the cell's KMEANS value; this circumscribes the neighbor search
    same_cluster=same_line[same_line['KMEANS']==cell_row['KMEANS']]

    # 3) keep cells that were actually stained for the requested biomarker
    reference_cells=same_cluster[same_cluster['Primary Biomarker Stained']==biomarker]

    # If the cell itself was stained for this biomarker, remove it so it is not
    # duplicated when it is appended below.
    if cell_ID in reference_cells.index:
        reference_cells=reference_cells.drop([cell_ID])

    if reference_cells.empty:
        raise ValueError(
            f"No reference cells stained for {biomarker!r} share the line and "
            f"KMEANS value of cell {cell_ID!r}; cannot impute."
        )

    # Append the cell of interest as the last row. pd.concat replaces
    # DataFrame.append (removed in pandas 2.0); infer_objects restores column
    # dtypes that the Series -> one-row-frame conversion turns into object.
    query_row=cell_row.to_frame().T.infer_objects().rename_axis(reference_cells.index.names)
    Final_KMEANS_DF=pd.concat([reference_cells,query_row])

    # Positional index of the cell of interest inside the combined frame.
    index_position=Final_KMEANS_DF.index.get_indexer([cell_ID])[0]

    # Neighbor positions and distances for every row; keep only the query row's.
    # NOTE(review): knnearest_indices is defined outside this view. If its
    # neighbor list for a row includes the row itself (distance 0), the query
    # cell's own value takes part in the average -- confirm.
    nearest_neighbors,distances=knnearest_indices(Final_KMEANS_DF, weights,n_neighbors)
    nearest_neighbors=nearest_neighbors[index_position]
    distances=distances[index_position]

    # Batch-corrected biomarker values of the selected neighbors.
    neighbor_values=Final_KMEANS_DF.iloc[nearest_neighbors][batch_corrected_value].values

    # Equality test kept from the original so numpy booleans behave the same.
    if average==False:
        return weighted_averaging_biomarker_values(distances,neighbor_values)
    return np.average(neighbor_values)
    
    


In [None]:

# Example: pick one cell and impute its P16 expression.

cell_ID = 'Bio1_0'
Biomarker = 'P16'

# Fraction of variance explained by each PC, handed to the imputer as weights.
weights = np.array(pca.explained_variance_ratio_)

imputed_biomarker = biomarker_imputer(
    overall_dataframe=data,
    cell_ID=cell_ID,
    biomarker=Biomarker,
    batch_corrected_value='Primary Biomarker Batch Corrected',
    weights=weights,
    average=False,  # False selects the distance-weighted average
)