In [1]:
import anndata
import scanpy as sc
import numpy as np
import pickle
import os
import fnmatch

In [2]:
'''FUNCTIONS FOR CALCULATIONS IN FIGURE 2'''

def search_files(folder_path, keyword):
    """Recursively find files under ``folder_path`` whose name contains ``keyword``.

    Parameters
    ----------
    folder_path : str
        Directory to walk (all sub-directories are visited).
    keyword : str
        Text that must appear in the file name. It is spliced into the
        fnmatch pattern ``*keyword*``, so ``*``, ``?`` and ``[...]`` inside
        the keyword act as wildcards rather than literal characters.

    Returns
    -------
    list of str
        Full paths of the matching files, sorted. An empty list is returned
        when nothing matches.
    """
    matches = []
    for root, _dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            if fnmatch.fnmatch(filename, f'*{keyword}*'):
                matches.append(os.path.join(root, filename))
    # os.walk reports entries in whatever order the filesystem returns them.
    # Callers index the result positionally ([0], [1]), so sort to make the
    # order reproducible across machines and runs.
    matches.sort()
    return matches

def save_list_to_file(lst, file_path):
    """Write the items of ``lst`` to a text file, one per line.

    Each item is converted with ``str()`` and followed by a newline. The file
    at ``file_path`` is created or overwritten.
    """
    with open(file_path, 'w') as handle:
        handle.writelines(str(entry) + '\n' for entry in lst)

def sortClust(clusterStrLst):
    """Return cluster labels sorted by numeric value, as strings.

    Every label is parsed with ``int()``, the integers are sorted in
    increasing order, and each is converted back with ``str()``. The result
    is therefore in canonical integer form (e.g. ``'03'`` comes back as ``'3'``).
    """
    orderedNumbers = sorted(int(label) for label in clusterStrLst)
    return [str(number) for number in orderedNumbers]

def calcCellsPerClust(sample, sampleHalfLst, sample1_name, sample2_name, groupby, conditionStr,
                      clusterKey='leiden_0.3', halfKey='Half'):
    """Compute, per cluster, the percentage of each sample's nuclei found in each sample half.

    For every cluster (in numeric order) and every name in ``sampleHalfLst``,
    the number of nuclei in that cluster carrying that half label is divided
    by the TOTAL number of nuclei of the sample the half belongs to, then
    multiplied by 100. A half is attributed to sample 1 when ``sample1_name``
    is a substring of the half name, otherwise to sample 2.

    Parameters
    ----------
    sample : object with a pandas ``obs`` table (an AnnData object in this notebook)
    sampleHalfLst : list of str
        Half labels to count, as they appear in ``obs[halfKey]``.
    sample1_name, sample2_name : str
        Sample labels as they appear in ``obs[groupby]``.
    groupby : str
        ``obs`` column used to count the total nuclei of each sample.
    conditionStr : str
        Stored unchanged under the ``'condition'`` key.
    clusterKey : str, optional
        ``obs`` column holding the cluster labels (integer-like strings).
    halfKey : str, optional
        ``obs`` column holding the sample-half label of each nucleus.

    Returns
    -------
    dict
        ``'condition'`` -> ``conditionStr``, ``'sampleHalfNames'`` ->
        ``sampleHalfLst``, and one key per half name mapping to a list of
        percentages with one entry per cluster.

    Raises
    ------
    ZeroDivisionError
        If a counted half belongs to a sample that has no nuclei in
        ``obs[groupby]``.
    """
    obs = sample.obs
    countDict = {}

    # Cluster labels, ordered numerically rather than lexically
    sortedSampleClustLst = sortClust(list(np.unique(obs[clusterKey])))

    # Total nuclei per sample across the whole object (the denominators)
    fullSampleLst = list(obs[groupby])
    sample1_Count = fullSampleLst.count(sample1_name)
    sample2_Count = fullSampleLst.count(sample2_name)

    # Identifying information travels with the counts
    countDict['condition'] = conditionStr
    countDict['sampleHalfNames'] = sampleHalfLst

    # Which total each half is normalised by does not depend on the cluster,
    # so work it out once instead of once per cluster.
    totalPerHalf = {
        sampleHalfName: (sample1_Count if sample1_name in sampleHalfName else sample2_Count)
        for sampleHalfName in sampleHalfLst
    }

    clusterLabels = obs[clusterKey]
    halfLabels = obs[halfKey]
    for clust in sortedSampleClustLst:
        # Only the half labels of this cluster are needed, so mask the obs
        # column directly rather than slicing the whole object per cluster.
        inCluster = clusterLabels.isin([clust]).to_numpy()
        halvesInCluster = list(halfLabels[inCluster])

        for sampleHalfName in sampleHalfLst:
            percentSampleHalfCount = (
                halvesInCluster.count(sampleHalfName) / totalPerHalf[sampleHalfName]
            ) * 100
            countDict.setdefault(sampleHalfName, []).append(percentSampleHalfCount)

    return countDict

def savingCountDict(sampleComparLst, folderPath, savePath):
    """Build and pickle a countDict for every entry of ``sampleComparLst``.

    Each entry is ``[condition, sample]`` (within-sample: the two halves of
    one sample are compared) or ``[condition, sample1, sample2]``
    (between-sample).

    * Within-sample: the first two files under ``folderPath`` matching the
      sample name are read, labelled ``'<sample> Half 1'`` / ``'<sample> Half 2'``
      in ``obs['Half']``, concatenated and re-clustered (PCA, neighbors, UMAP,
      Leiden at resolution 0.3) before counting.
    * Between-sample: the first file matching ``'<sample1>_<sample2>'`` is read
      and counted as-is, using its existing ``obs['Half']`` labels.

    The result is written to ``<savePath>countDict_<tag>.pkl``. The path is
    built by plain string concatenation, so ``savePath`` needs its trailing
    separator.
    """
    for sampleLst in sampleComparLst:
        condition = sampleLst[0]

        if len(sampleLst) == 2:
            # Within-sample comparison: assemble the object from its two halves
            sample_name = sampleLst[1]
            sample1_name = f'{sample_name} Half 1'
            sample2_name = f'{sample_name} Half 2'

            halfPaths = search_files(folderPath, sample_name)
            half1_obj = sc.read(halfPaths[0])
            half2_obj = sc.read(halfPaths[1])
            half1_obj.obs['Half'] = sample1_name
            half2_obj.obs['Half'] = sample2_name
            sampleObj = anndata.concat([half1_obj, half2_obj], join="outer")

            # Re-cluster the combined object
            sc.tl.pca(sampleObj, svd_solver='arpack')
            sc.pp.neighbors(sampleObj, n_neighbors=50, n_pcs=50, use_rep="Scanorama")
            sc.tl.umap(sampleObj)
            sc.tl.leiden(sampleObj, resolution=0.3, key_added="leiden_0.3")

            groupby = 'Half'
            sampleHalfLst = [sample1_name, sample2_name]
            outputTag = sample_name
        else:
            # Between-sample comparison: the combined file already exists
            sample1_name = sampleLst[1]
            sample2_name = sampleLst[2]
            outputTag = f'{sample1_name}_{sample2_name}'

            pairPaths = search_files(folderPath, outputTag)
            sampleObj = sc.read(pairPaths[0])

            groupby = 'sample'
            sampleHalfLst = list(np.unique(sampleObj.obs['Half']))

        # Counting and saving are the same for both kinds of comparison
        countDict = calcCellsPerClust(sampleObj, sampleHalfLst, sample1_name, sample2_name, groupby, condition)

        with open(f'{savePath}countDict_{outputTag}.pkl', 'wb') as file:
            pickle.dump(countDict, file)

def calcSampleDiff(countDict, sample1_name, sample2_name):
    """Compute the per-cluster percent difference between two samples.

    Parameters
    ----------
    countDict : dict
        Output of ``calcCellsPerClust``: ``'sampleHalfNames'`` lists the half
        labels, and each half label maps to a list of percentages with one
        entry per cluster.
    sample1_name, sample2_name : str
        With exactly four half names (between-sample mode) a half is added to
        sample 1 when ``sample1_name`` is a substring of its label, otherwise
        to sample 2. In any other case (within-sample mode) the two names are
        used directly as keys of ``countDict``.

    Returns
    -------
    percentDiffDict : dict
        Cluster index -> ``(1 - smaller / larger) * 100``. When both samples
        contribute 0 % to a cluster the difference is reported as ``0.0``.
    colorList : list of str
        ``'firebrick'`` where sample 1 is strictly larger, else ``'cadetblue'``.
    clustNums : list of int
        The cluster indices ``0 .. n-1``.
    sampleOrder : list of str
        ``[sample1_name, sample2_name]``.
    """
    percentDiffDict = {}
    colorList = []  # bar colour per cluster, chosen by which sample is larger

    sampleHalfLst = countDict['sampleHalfNames']
    clustNums = list(range(len(countDict[sampleHalfLst[0]])))

    # Four half labels means two samples with two halves each
    isBetweenSamples = len(sampleHalfLst) == 4

    for i in clustNums:
        if isBetweenSamples:
            # Sum the two halves belonging to each sample
            sample1_perCount = 0
            sample2_perCount = 0
            for sampleHalf in sampleHalfLst:
                if sample1_name in sampleHalf:
                    sample1_perCount += countDict[sampleHalf][i]
                else:
                    sample2_perCount += countDict[sampleHalf][i]
        else:
            sample1_perCount = countDict[sample1_name][i]
            sample2_perCount = countDict[sample2_name][i]

        if sample1_perCount > sample2_perCount:
            percentDiff = (1 - (sample2_perCount / sample1_perCount)) * 100
            colorList.append('firebrick')
        else:
            if sample2_perCount == 0:
                # Neither sample has nuclei in this cluster: there is no
                # difference to report, and 0 / 0 would raise ZeroDivisionError.
                percentDiff = 0.0
            else:
                percentDiff = (1 - (sample1_perCount / sample2_perCount)) * 100
            colorList.append('cadetblue')

        # Keyed by cluster index
        percentDiffDict[i] = percentDiff

    sampleOrder = [sample1_name, sample2_name]

    return percentDiffDict, colorList, clustNums, sampleOrder

def savingSampleDiff(sampleComparLst, folderPath, savePath):
    """Compute and save the per-cluster sample difference for every comparison.

    Each entry of ``sampleComparLst`` is ``[condition, sample]`` (within-sample,
    halves ``'<sample> Half 1'`` vs ``'<sample> Half 2'``) or
    ``[condition, sample1, sample2]`` (between-sample). The first file under
    ``folderPath`` whose name contains the comparison tag (``<sample>`` or
    ``<sample1>_<sample2>``) is unpickled as the countDict and passed to
    ``calcSampleDiff``.

    Four files are written per comparison, with paths built by plain string
    concatenation onto ``savePath``:
    ``sampleDiff_<tag>.pkl``, ``colorList_<tag>.txt``, ``clustNum_<tag>.txt``
    and ``sampleOrder_<tag>.txt``.
    """
    for sampleLst in sampleComparLst:
        if len(sampleLst) == 2:
            # Within-sample comparison
            outputTag = sampleLst[1]
            sample1_name = f'{outputTag} Half 1'
            sample2_name = f'{outputTag} Half 2'
        else:
            # Between-sample comparison
            sample1_name = sampleLst[1]
            sample2_name = sampleLst[2]
            outputTag = f'{sample1_name}_{sample2_name}'

        countDictPaths = search_files(folderPath, outputTag)

        # pickle.load can execute code embedded in the file, so folderPath
        # should only ever point at trusted, locally generated output.
        with open((countDictPaths[0]), 'rb') as file:
            countDict = pickle.load(file)

        sampleDiff, colorList, clustNum, sampleOrder = calcSampleDiff(countDict, sample1_name, sample2_name)

        with open(f'{savePath}sampleDiff_{outputTag}.pkl', 'wb') as file:
            pickle.dump(sampleDiff, file)

        textOutputs = (
            ('colorList', colorList),
            ('clustNum', clustNum),
            ('sampleOrder', sampleOrder),
        )
        for prefix, values in textOutputs:
            save_list_to_file(values, f'{savePath}{prefix}_{outputTag}.txt')

In [3]:
'''SAMPLE COMPARISON LISTS'''

# Every individual sample as [condition, sample ID]; used for the
# within-sample (Half 1 vs Half 2) calculations.
sampleNameLst = [
    ['MD Contra. Hem. (L)', '1_VC20L'],
    ['MD Contra. Hem. (L)', '2_AC20L'],
    ['MD Ipsi. Hem. (R)', '3_VC20R'],
    ['MD Ipsi. Hem. (R)', '4_AC20R'],
    ['Deafened', '5_VC22L'],
    ['Deafened', '6_AC22L'],
    ['Control', '7_VC24L'],
    ['Control', '8_AC24L'],
    ['Control', '9_VC23L'],
    ['Deafened', '10_VC25L'],
    ['Deafened', '13_AC25L'],
    ['Control', '14_AC23L'],
    ['MD Contra. Hem. (L)', '16_VC19L'],
    ['MD Ipsi. Hem. (R)', '17_VC19R'],
    ['MD Contra. Hem. (L)', '19_AC19L'],
    ['MD Ipsi. Hem. (R)', '20_AC19R'],
]

# Pairs of samples sharing a condition, as [condition, sample 1, sample 2].
bioRepComparisonLst = [
    ['Control', '7_VC24L', '9_VC23L'],
    ['Control', '8_AC24L', '14_AC23L'],
    ['Deafened', '5_VC22L', '10_VC25L'],
    ['Deafened', '6_AC22L', '13_AC25L'],
    ['MD Contra. Hem. (L)', '16_VC19L', '1_VC20L'],
    ['MD Contra. Hem. (L)', '19_AC19L', '2_AC20L'],
    ['MD Ipsi. Hem. (R)', '17_VC19R', '3_VC20R'],
    ['MD Ipsi. Hem. (R)', '20_AC19R', '4_AC20R'],
]

# Treated-vs-control pairs, as [comparison label, sample 1, sample 2].
plastComparLst = [
    # Deaf VC vs. Control VC
    ['Deaf VC vs. Control VC', '5_VC22L', '7_VC24L'],
    ['Deaf VC vs. Control VC', '5_VC22L', '9_VC23L'],
    ['Deaf VC vs. Control VC', '10_VC25L', '7_VC24L'],
    ['Deaf VC vs. Control VC', '10_VC25L', '9_VC23L'],
    # Deaf AC vs. Control AC
    ['Deaf AC vs. Control AC', '6_AC22L', '8_AC24L'],
    ['Deaf AC vs. Control AC', '6_AC22L', '14_AC23L'],
    ['Deaf AC vs. Control AC', '13_AC25L', '8_AC24L'],
    ['Deaf AC vs. Control AC', '13_AC25L', '14_AC23L'],
    # MD VC vs. Control VC
    ['MD VC vs. Control VC', '16_VC19L', '7_VC24L'],
    ['MD VC vs. Control VC', '16_VC19L', '9_VC23L'],
    ['MD VC vs. Control VC', '1_VC20L', '7_VC24L'],
    ['MD VC vs. Control VC', '1_VC20L', '9_VC23L'],
    # MD AC vs. Control AC
    ['MD AC vs. Control AC', '19_AC19L', '8_AC24L'],
    ['MD AC vs. Control AC', '19_AC19L', '14_AC23L'],
    ['MD AC vs. Control AC', '2_AC20L', '8_AC24L'],
    ['MD AC vs. Control AC', '2_AC20L', '14_AC23L'],
]

In [72]:
'''COMBINING INDIVIDUAL SAMPLE HALVES SO THAT THEY ARE LABELED AND SAVING THE RECLUSTERED ANNDATA OBJECT'''

# For every sample: read its two half files, tag each nucleus with the half it
# came from, concatenate the halves, re-cluster the combined object and write
# it to the CombinedSampleHalves folder as <sample_name>.h5ad.
for sampleLst in sampleNameLst:
    sample_name = sampleLst[1]  # entries are [condition, sample ID]
    
    folderPath = 'F:/SampleData/SampleHalves/'
    sampleFilePaths = search_files(folderPath, sample_name)
    
    # Progress indicator: one line per sample
    print(sample_name)
    
    # assumes the first match is the "Half 1" file and the second the
    # "Half 2" file — TODO confirm against the file names in SampleHalves
    sample1_obj = sc.read(sampleFilePaths[0])
    sample2_obj = sc.read(sampleFilePaths[1])
    
    # Record the half of origin on every nucleus before merging
    sample1_obj.obs['Half'] = f'{sample_name} Half 1'
    sample2_obj.obs['Half'] = f'{sample_name} Half 2'
    
    # Outer join keeps variables present in either half
    sampleObj = anndata.concat([sample1_obj, sample2_obj], join="outer")
    
    sc.pp.calculate_qc_metrics(sampleObj, percent_top=None, log1p=False, inplace=True)
    
    # Re-Clustering 
    # PCA
    sc.tl.pca(sampleObj, svd_solver='arpack')

    # Neighbors and UMAP Computations
    # NOTE(review): neighbors is built from use_rep="Scanorama", so the PCA
    # computed above does not appear to feed into it — confirm the PCA step is
    # still needed.
    sc.pp.neighbors(sampleObj, n_neighbors=50, n_pcs =50, use_rep = "Scanorama")
    sc.tl.umap(sampleObj) 

    # Re-calculating cluster numbers 
    sc.tl.leiden(sampleObj, resolution = 0.3, key_added = "leiden_0.3")
    
    # Path is built by string concatenation, so savePath keeps its trailing slash
    savePath = 'F:/SampleData/CombinedSampleHalves/'
    sampleObj.write(f'{savePath}{sample_name}.h5ad')

1_VC20L
2_AC20L
3_VC20R
4_AC20R
5_VC22L
6_AC22L
7_VC24L
8_AC24L
9_VC23L
10_VC25L
13_AC25L
14_AC23L
16_VC19L
17_VC19R
19_AC19L
20_AC19R


In [73]:
'''USING CONCAT SAMPLES AS A REFERENCE TO LABEL WHICH HALF EACH NUCLEI IS IN'''

# For every sample: copy the 'Half' label of each nucleus from the combined
# (reference) object onto the individual-sample (target) object, then write
# the target back to the IndividualSamples folder.
for sampleLst in sampleNameLst:
    sample_name = sampleLst[1]  # entries are [condition, sample ID]
    
    # Load reference AnnData object (first file matching the sample name)
    folderPath = 'F:/SampleData/CombinedSampleHalves/'
    refSampleFilePaths = search_files(folderPath, sample_name)
    ref_data =  sc.read(refSampleFilePaths[0])
   
    # Load target AnnData object (first file matching the sample name)
    folderPath = 'F:/SampleData/IndividualSamples/'
    targetSampleFilePaths = search_files(folderPath, sample_name)
    target_data = sc.read(targetSampleFilePaths[0])

    # Lookup table: reference obs['barcode'] -> reference obs['Half'].
    # assumes the values in ref_data.obs['barcode'] use the same format as
    # target_data.obs_names — TODO confirm
    ref_categories = dict(zip(ref_data.obs['barcode'], ref_data.obs['Half']))

    # Label each target nucleus by its obs name; names missing from the
    # reference are labelled 'Unknown'
    target_data.obs['Half'] = [ref_categories.get(barcode, 'Unknown') for barcode in target_data.obs_names]

    # Save the modified target AnnData object.
    # NOTE(review): this writes into the same folder the target was read from
    # and may overwrite the input file — confirm that is intended.
    target_data.write(f'F:/SampleData/IndividualSamples/{sample_name}.h5ad')

In [4]:
# Within-sample percent counts: one countDict pickle per individual sample,
# built from that sample's two half files.
folderPath = 'F:/SampleData/SampleHalves/'
savePath = 'F:/FigureCalculations/Figure2/WithinSample/PercentCount/'
sampleComparLst = sampleNameLst

savingCountDict(sampleComparLst, folderPath=folderPath, savePath=savePath)

In [5]:
# Within-sample differences: read the countDict pickles written to the
# WithinSample/PercentCount folder and save the Half 1 vs Half 2 differences.
folderPath = 'F:/FigureCalculations/Figure2/WithinSample/PercentCount/'
savePath = 'F:/FigureCalculations/Figure2/WithinSample/CountDifference/'
sampleComparLst = sampleNameLst

savingSampleDiff(sampleComparLst, folderPath=folderPath, savePath=savePath)

In [6]:
# Biological-replica percent counts: one countDict pickle per replica pair,
# read from the pre-combined files in BiologicalReplicaSamples.
folderPath = 'F:/SampleData/BiologicalReplicaSamples/'
savePath = 'F:/FigureCalculations/Figure2/BiologicalReplica/PercentCount/'
sampleComparLst = bioRepComparisonLst

savingCountDict(sampleComparLst, folderPath=folderPath, savePath=savePath)

Only considering the two last: ['. (L)_16_VC19L_1_VC20L', '.h5ad'].
Only considering the two last: ['. (L)_16_VC19L_1_VC20L', '.h5ad'].
Only considering the two last: ['. (L)_19_AC19L_2_AC20L', '.h5ad'].
Only considering the two last: ['. (L)_19_AC19L_2_AC20L', '.h5ad'].
Only considering the two last: ['. (R)_17_VC19R_3_VC20R', '.h5ad'].
Only considering the two last: ['. (R)_17_VC19R_3_VC20R', '.h5ad'].
Only considering the two last: ['. (R)_20_AC19R_4_AC20R', '.h5ad'].
Only considering the two last: ['. (R)_20_AC19R_4_AC20R', '.h5ad'].


In [7]:
# Biological-replica differences: read the countDict pickles written to the
# BiologicalReplica/PercentCount folder and save the per-cluster differences.
folderPath = 'F:/FigureCalculations/Figure2/BiologicalReplica/PercentCount/'
savePath = 'F:/FigureCalculations/Figure2/BiologicalReplica/CountDifference/'
sampleComparLst = bioRepComparisonLst

savingSampleDiff(sampleComparLst, folderPath=folderPath, savePath=savePath)

In [8]:
# Plasticity-comparison percent counts: one countDict pickle per
# treated-vs-control pair, read from the files in PlasticityComparisons.
folderPath = 'F:/SampleData/PlasticityComparisons/'
savePath = 'F:/FigureCalculations/Figure2/PlasticityComparisons/PercentCount/'
sampleComparLst = plastComparLst

savingCountDict(sampleComparLst, folderPath=folderPath, savePath=savePath)

In [9]:
# Plasticity-comparison differences: read the countDict pickles written to the
# PlasticityComparisons/PercentCount folder and save the per-cluster differences.
folderPath = 'F:/FigureCalculations/Figure2/PlasticityComparisons/PercentCount/'
savePath = 'F:/FigureCalculations/Figure2/PlasticityComparisons/CountDifference/'
sampleComparLst = plastComparLst

savingSampleDiff(sampleComparLst, folderPath=folderPath, savePath=savePath)