In [1]:
import anndata, pickle, os, fnmatch, math, random
import scanpy as sc
import numpy as np
import pandas as pd

# Lists Used in All Files 

In [1]:
# Sex of each mouse, stored as [mouse ID, sex] pairs
mouseSexLst = [
    ['19', 'male'],
    ['20', 'male'],
    ['22', 'male'],
    ['23', 'female'],
    ['24', 'male'],
    ['25', 'female'],
]

In [2]:
# Names of all samples, four per row
sampleNameLst = [
    '1_VC20L', '2_AC20L', '3_VC20R', '4_AC20R',
    '5_VC22L', '6_AC22L', '7_VC24L', '8_AC24L',
    '9_VC23L', '10_VC25L', '13_AC25L', '14_AC23L',
    '16_VC19L', '17_VC19R', '19_AC19L', '20_AC19R',
]

In [3]:
# Experimental condition of each sample, stored as [condition, sample name] pairs
sampleNameConditionLst = [
    ['MD Contra. Hem. (L)', '1_VC20L'],
    ['MD Contra. Hem. (L)', '2_AC20L'],
    ['MD Ipsi. Hem. (R)', '3_VC20R'],
    ['MD Ipsi. Hem. (R)', '4_AC20R'],
    ['Deafened', '5_VC22L'],
    ['Deafened', '6_AC22L'],
    ['Control', '7_VC24L'],
    ['Control', '8_AC24L'],
    ['Control', '9_VC23L'],
    ['Deafened', '10_VC25L'],
    ['Deafened', '13_AC25L'],
    ['Control', '14_AC23L'],
    ['MD Contra. Hem. (L)', '16_VC19L'],
    ['MD Ipsi. Hem. (R)', '17_VC19R'],
    ['MD Contra. Hem. (L)', '19_AC19L'],
    ['MD Ipsi. Hem. (R)', '20_AC19R'],
]

In [4]:
# Biological replicate pairs to compare, stored as [condition, sample A, sample B]
bioRepComparisonLst = [
    ['Control', '7_VC24L', '9_VC23L'],
    ['Control', '8_AC24L', '14_AC23L'],
    ['Deafened', '5_VC22L', '10_VC25L'],
    ['Deafened', '6_AC22L', '13_AC25L'],
    ['MD Contra. Hem. (L)', '16_VC19L', '1_VC20L'],
    ['MD Contra. Hem. (L)', '19_AC19L', '2_AC20L'],
    ['MD Ipsi. Hem. (R)', '17_VC19R', '3_VC20R'],
    ['MD Ipsi. Hem. (R)', '20_AC19R', '4_AC20R'],
]

In [5]:
# Plasticity comparisons, stored as [comparison label, sample A, sample B]
plastComparLst = [
    ['VC Deaf vs. Control', '5_VC22L', '7_VC24L'],
    ['VC Deaf vs. Control', '5_VC22L', '9_VC23L'],
    ['VC Deaf vs. Control', '10_VC25L', '7_VC24L'],
    ['VC Deaf vs. Control', '10_VC25L', '9_VC23L'],
    ['AC Deaf vs. Control', '6_AC22L', '8_AC24L'],
    ['AC Deaf vs. Control', '6_AC22L', '14_AC23L'],
    ['AC Deaf vs. Control', '13_AC25L', '8_AC24L'],
    ['AC Deaf vs. Control', '13_AC25L', '14_AC23L'],
    ['VC MD vs. Control', '16_VC19L', '7_VC24L'],
    ['VC MD vs. Control', '16_VC19L', '9_VC23L'],
    ['VC MD vs. Control', '1_VC20L', '7_VC24L'],
    ['VC MD vs. Control', '1_VC20L', '9_VC23L'],
    ['AC MD vs. Control', '19_AC19L', '8_AC24L'],
    ['AC MD vs. Control', '19_AC19L', '14_AC23L'],
    ['AC MD vs. Control', '2_AC20L', '8_AC24L'],
    ['AC MD vs. Control', '2_AC20L', '14_AC23L'],
]

In [6]:
# Hemisphere comparisons, stored as [comparison label, left-hemisphere sample, right-hemisphere sample]
hemComparLst = [
    ['VC Contra (L) vs. Ipsi Hem (R)', '16_VC19L', '17_VC19R'],
    ['AC Contra (L) vs. Ipsi Hem (R)', '19_AC19L', '20_AC19R'],
    ['VC Contra (L) vs. Ipsi Hem (R)', '1_VC20L', '3_VC20R'],
    ['AC Contra (L) vs. Ipsi Hem (R)', '2_AC20L', '4_AC20R'],
]

# General Calculation Functions

In [2]:
def readDict(filePath):
    """Load and return the object stored in a pickle (.pkl) file.

    Parameters
    ----------
    filePath : path of the pickle file to open (read in binary mode).

    Returns
    -------
    Whatever object was pickled into the file (a dictionary in this project's usage).
    """
    # NOTE: unpickling can run arbitrary code, so only load files from a trusted source.
    with open(filePath, mode='rb') as pklHandle:
        return pickle.load(pklHandle)

In [None]:
def sortClust(clusterStrLst):
    """Return the cluster labels sorted in increasing numeric order.

    Parameters
    ----------
    clusterStrLst : iterable of strings, each parseable by ``int``.

    Returns
    -------
    A new list of strings. Each label is converted to an integer, sorted, and
    converted back, so leading zeros are dropped (e.g. '03' becomes '3').
    """
    # Sort numerically rather than lexicographically, so '10' comes after '2'
    orderedNums = sorted(int(label) for label in clusterStrLst)
    return list(map(str, orderedNums))

In [None]:
def search_files(folder_path, keyword):
    """Recursively collect the paths of files whose name contains ``keyword``.

    (Originally generated with ChatGPT.)

    Parameters
    ----------
    folder_path : directory to walk, including all of its sub-directories.
    keyword     : text to look for in each file name; it is embedded in the
                  fnmatch pattern ``*keyword*``, so fnmatch wildcards inside it
                  are interpreted as wildcards.

    Returns
    -------
    List of matching file paths (directory joined with file name), in
    ``os.walk`` order.
    """
    pattern = f'*{keyword}*'
    return [
        os.path.join(dir_path, file_name)
        for dir_path, _subdirs, file_names in os.walk(folder_path)
        for file_name in file_names
        if fnmatch.fnmatch(file_name, pattern)
    ]

In [None]:
def mouseSex(sampleName):
    """Look up the sex of the mouse that a sample came from.

    Parameters
    ----------
    sampleName : sample name such as '20_AC19R'. Judging by the sample lists in
                 this file, the number before the underscore is a sample index
                 and the number after the region code ('19' here) is the mouse ID.

    Returns
    -------
    The sex string recorded in ``mouseSexLst`` (e.g. 'male'), or ``None`` when
    no mouse ID from ``mouseSexLst`` occurs in the name.
    """
    # Search the part after the last underscore first. A name like '20_AC19R'
    # contains both '19' and '20', so matching against the whole name only
    # gives the right mouse when '19' happens to be listed before '20'.
    suffix = sampleName.rsplit('_', 1)[-1]

    # Fall back to the whole name so inputs that do not follow the
    # '<index>_<region><mouse><hemisphere>' layout resolve exactly as before.
    for candidate in (suffix, sampleName):
        for mouseNum, sex in mouseSexLst:
            if mouseNum in candidate:
                return sex

    return None

In [1]:
def comparingSex(sample1_name, sample2_name):
    """Tell whether two samples come from mice of the same sex.

    Parameters
    ----------
    sample1_name, sample2_name : sample names understood by ``mouseSex``.

    Returns
    -------
    True when the sex strings of both samples start with the same character,
    False otherwise.
    """
    sexes = [mouseSex(name) for name in (sample1_name, sample2_name)]

    # Only the first letter of each sex string is compared
    return sexes[0][0] == sexes[1][0]

In [None]:
def check_file_exists(folder_path, file_name):
    """Return True if ``file_name`` exists inside ``folder_path``, else False.

    The two arguments are joined with ``os.path.join`` and the resulting path
    is tested with ``os.path.exists``, so an existing directory of that name
    also counts.
    """
    return os.path.exists(os.path.join(folder_path, file_name))

In [None]:
def save_list_to_file(lst, file_path):
    """Write every item of ``lst`` to a text file, one item per line.

    (Originally generated with ChatGPT.)

    Parameters
    ----------
    lst       : iterable of items; each is converted with ``str`` before writing.
    file_path : destination file. It is opened in 'w' mode, so existing content
                is overwritten.
    """
    with open(file_path, 'w') as out:
        out.writelines(str(entry) + '\n' for entry in lst)

In [None]:
def load_list_from_file(file_path):
    """Read a text file into a list with one entry per line.

    (Originally generated with ChatGPT.)

    Parameters
    ----------
    file_path : text file to read.

    Returns
    -------
    List of strings, one per line, with leading and trailing whitespace
    (including the newline) stripped. Entries are always strings.
    """
    with open(file_path, 'r') as src:
        return list(map(str.strip, src))

In [None]:
def addingDEscore(sampleObj, divideby, cat1, cat2):
    """Rank genes between groups and store each group's DE score per gene.

    Runs ``sc.tl.rank_genes_groups`` (Wilcoxon) on ``sampleObj`` grouped by
    ``divideby``, then writes the resulting scores into ``sampleObj.var`` as
    the columns ``'<cat1>_DE_Score'`` and ``'<cat2>_DE_Score'``, aligned to
    ``sampleObj.var_names``.

    Parameters
    ----------
    sampleObj : AnnData object to add the DE scores to (modified in place).
    divideby  : name of the ``sampleObj.obs`` column used to group the cells.
    cat1      : one of the categories found in the ``divideby`` column.
    cat2      : another category found in the ``divideby`` column.

    Returns
    -------
    The same ``sampleObj``, with ``uns['rank_genes_groups']`` populated and the
    two score columns added to ``var``.
    """
    # The original code set uns['log1p'] = {'base': None} because
    # rank_genes_groups needs the 'base' entry. Only add it when it is
    # missing, so a log base that was already recorded is not discarded.
    log1pInfo = dict(sampleObj.uns.get('log1p') or {})
    if 'base' not in log1pInfo:
        log1pInfo['base'] = None
        sampleObj.uns['log1p'] = log1pInfo

    # Calculating DE scores
    sc.tl.rank_genes_groups(sampleObj, divideby, method='wilcoxon')

    # One column per group; row i of both frames describes the same ranked gene
    ranked = sampleObj.uns['rank_genes_groups']
    geneName_df = pd.DataFrame(ranked['names'])
    score_df = pd.DataFrame(ranked['scores'])

    for cat in (cat1, cat2):
        # Index the group's scores by gene name, then reorder them to follow
        # the gene order of the AnnData object before storing them in var
        catScores = pd.Series(score_df[cat].to_numpy(), index=geneName_df[cat].to_numpy())
        sampleObj.var[f'{cat}_DE_Score'] = catScores.reindex(sampleObj.var_names)

    return sampleObj

In [None]:
def halvingData(data, seed):
    """Randomly split a sample into two halves.

    Parameters
    ----------
    data : AnnData-like object exposing ``n_obs`` and row indexing with an
           integer index array.
    seed : ``None`` to draw a fresh seed (an integer from 1 to 99), or a seed
           used previously to reproduce that split. The seed may be given as an
           integer or as the string this function returned earlier.

    Returns
    -------
    ``(data_half1, data_half2, seedUsed)``, where the first half holds
    ``n_obs // 2`` rows, the second half holds the rest, and ``seedUsed`` is the
    seed as a string.

    Notes
    -----
    The seed is applied with ``np.random.seed``, so NumPy's global random state
    is reset as a side effect of calling this function.
    """
    if seed is None:
        seedUsed = np.random.randint(1, 100)
    elif isinstance(seed, str):
        # The seed is handed back as a string; accept it in that form too so a
        # returned seed can be passed straight back in.
        seedUsed = int(seed)
    else:
        seedUsed = seed

    # Set the random seed for reproducibility
    np.random.seed(seedUsed)

    # Create a random permutation of the cell indices
    n_cells = data.n_obs
    permuted_indices = np.random.permutation(n_cells)

    # Split the indices in half (the second half gets the extra cell when odd)
    halfway_point = n_cells // 2
    indices_half1 = permuted_indices[:halfway_point]
    indices_half2 = permuted_indices[halfway_point:]

    # Create the two halves using the split indices
    data_half1 = data[indices_half1]
    data_half2 = data[indices_half2]

    return (data_half1, data_half2, str(seedUsed))

# General Plotting Functions

In [None]:
def deGraph(sampleObj, condition1, condition2, plotTitle_fontSize, ax):
    """Draw the top-ranked genes of one group as labelled points on ``ax``.

    Parameters
    ----------
    sampleObj          : AnnData-like object whose ``uns['rank_genes_groups']``
                         already holds 'names' and 'scores'.
    condition1         : group whose ranked genes are plotted.
    condition2         : group named as the comparison partner in the plot
                         title; it is not used to select any data.
    plotTitle_fontSize : font size of the plot title.
    ax                 : matplotlib-style axes to draw on.

    Nothing is returned; the plot is drawn on ``ax``.
    """
    ranking = sampleObj.uns['rank_genes_groups']

    # Keep only the first 20 entries of the ranking for the chosen group
    topGenes = ranking['names'][condition1][:20]
    topScores = ranking['scores'][condition1][:20]

    # The markers are drawn in white; the rotated gene names written at each
    # point are what the reader sees
    positions = np.arange(len(topGenes))
    ax.scatter(positions, topScores, c='white')
    # Leave headroom above the highest score for the vertical gene labels
    ax.set_ylim(top=(max(topScores) + 20))

    for idx, geneLabel in enumerate(topGenes):
        ax.text(idx, topScores[idx], geneLabel, ha='center', va='bottom', rotation=90)

    # Axis ticks and labels
    ax.set_xticks(positions)
    ax.set_xlabel('Genes')
    ax.set_ylabel('Score')

    ax.set_title(f'Rank Genes Groups: {condition1} vs. {condition2}', fontsize=plotTitle_fontSize)

In [None]:
def labeledClusteredUmap(sampleObj, color, label, plotTitle_fontSize, ax, clusterKey='leiden_0.3'):
    """Draw a UMAP coloured by ``color`` and write each cluster's number on it.

    Parameters
    ----------
    sampleObj          : AnnData object with UMAP coordinates in
                         ``obsm['X_umap']`` and a categorical cluster column in
                         ``obs[clusterKey]``.
    color              : key passed to ``sc.pl.umap`` as ``color``; also used in
                         the plot title.
    label              : 'on margin' puts scanpy's legend in the right margin;
                         'sex' adds a fixed male/female legend in the upper
                         left; any other value uses scanpy's default legend.
    plotTitle_fontSize : font size of the plot title.
    ax                 : matplotlib axes to draw on.
    clusterKey         : ``obs`` column holding the cluster labels
                         (default 'leiden_0.3').

    Nothing is returned; the plot is drawn on ``ax``.
    """
    umapKwargs = {'color': color, 'show': False, 'ax': ax}
    if label == 'on margin':
        umapKwargs['legend_loc'] = 'right margin'
    sc.pl.umap(sampleObj, **umapKwargs)

    # Write the cluster numbers on the graph (they are not displayed by scanpy
    # when the colour is something other than the cluster column)
    clusterLabels = sampleObj.obs[clusterKey]
    umapCoords = sampleObj.obsm['X_umap']
    for cluster in clusterLabels.cat.categories:
        inCluster = (clusterLabels == cluster).to_numpy()
        # A category without any cells has no position to label
        if not inCluster.any():
            continue
        # Place the label at the median UMAP position of the cluster's cells.
        # np.median is used because numpy is already imported by this file.
        x, y = np.median(umapCoords[inCluster, :2], axis=0)
        ax.text(x, y, cluster, fontsize=10, ha='center', va='center', fontweight='heavy')
    ax.set_title(f'Clustered UMAP Divided by {color}', fontsize=plotTitle_fontSize)

    # Adding a fixed legend to the plot
    if label == 'sex':
        # Imported here because the file's import cell does not import matplotlib
        import matplotlib.patches as mpatches

        # NOTE(review): the colours assume scanpy drew male in tab:blue and
        # female in tab:orange - confirm against the palette actually used.
        patchA = mpatches.Patch(color='tab:blue', label='Male Samples')
        patchB = mpatches.Patch(color='tab:orange', label='Female Samples')
        ax.legend(handles=[patchA, patchB], loc='upper left')

    axisLabel_fontSize = 15

    ax.set_xlabel('UMAP1', fontsize=axisLabel_fontSize)
    ax.set_ylabel('UMAP2', fontsize=axisLabel_fontSize)