In [1]:
import anndata, random, os, fnmatch
import scanpy as sc
import numpy as np

In [2]:
'''READING THE 'scanpy_mouse_FINAL_DATA.h5ad' FILE FROM YOUR COMPUTER AND KEEPING IT IN THE THE 'adata' VARIABLE'''

directory = 'F:/scanpy_mouse_FINAL_DATA.h5ad'
adata = sc.read_h5ad(directory)

In [3]:
'''THIS CELL ALLOWS YOU TO EASILY CHANGE THE DATA YOU ARE ANALYZING'''

# Set 'data' equal to the data set you want to process throughout the rest of the code (MUST BE ANNDATA OBJECT!)
data = adata 

In [4]:
'''ADDING MISSING IDENTIFYING FIELDS TO THE ANNDATA OBJECT'''

# Adds missing field to 'data.uns' that is necessary for running the ranked_sum_genes function
data.uns['log1p'] = {'base': None}

# Corrects name of sample '8_AC24L' which was missing the batch number 
data.obs['sample'] = ['8_AC24L' if sampleName == '8_AC24' else sampleName for sampleName in data.obs['sample']]

In [14]:
'''IMPORTING FUNCTIONS FROM FUNCTIONS FILE'''

%run GeneralFunctions.ipynb
%run DataObjectCreation_Functions.ipynb

# Individual Samples

In [8]:
'''CREATING INDIVIDUAL SAMPLE FILES'''

for sampleName in sampleNameLst:
    
    # Getting Sample Specific Condition and Sex Information
    condition = mouseCondition(sampleName)
    sex = mouseSex(sampleName)
    
    # Create Sample AnnData Object 
    sampleObj = data[data.obs['sample'].isin([sampleName]),:]
    
    # Adding Sex and Condition Information to AnnData Object 
    sampleObj.obs['sex'] = sex
    sampleObj.obs['condition'] = condition
    
    # Re-Clustering 
    # PCA
    sc.tl.pca(sampleObj, svd_solver='arpack')

    # Neighbors and UMAP Computations
    sc.pp.neighbors(sampleObj, n_neighbors=50, n_pcs =50, use_rep = "Scanorama")
    sc.tl.umap(sampleObj) 

    # Re-calculating cluster numbers 
    sc.tl.leiden(sampleObj, resolution = 0.3, key_added = "leiden_0.3")
 
    # Calculates nuclei and gene information specific to the sample object 
    sc.pp.calculate_qc_metrics(sampleObj, percent_top=None, log1p=False, inplace=True)
    
    # Saving Sample AnnData Object
    sampleObj.write_h5ad(f'F:/SampleData/IndividualSamples/{sampleName}.h5ad')

  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex
  next(self.gen)
  sampleObj.obs['sex'] = sex


  next(self.gen)


# Sample Halves

In [9]:
'''DIVIDING THE EACH SAMPLE IN THE DATA INTO HALVES - SPECIFIED RANDOMIZATION SEEDS (REPLICATION OF GRAPHS)''' 

for sampleLst in sampleSeedList:
    # Getting sample information 
    sampleName = sampleLst[0]
    specifiedSeed = sampleLst[1]

    # Reading the AnnData object file for the sample 
    sampleObj = anndata.read_h5ad(f'F:/SampleData/IndividualSamples/{sampleName}.h5ad')
    
    # Randomly halves the sample data 
    sampleHalf1, sampleHalf2, seedUsed = halvingData(sampleObj, specifiedSeed)

    # Re-Clustering Sample 1
    # PCA
    sc.tl.pca(sampleHalf1, svd_solver='arpack')

    # Neighbors and UMAP Computations
    sc.pp.neighbors(sampleHalf1, n_neighbors=50, n_pcs =50, use_rep = "Scanorama")
    sc.tl.umap(sampleHalf1) 

    # Re-calculating cluster numbers 
    sc.tl.leiden(sampleHalf1, resolution = 0.3, key_added = "leiden_0.3")
    
    # Calculates nuclei and gene information specific to the sample object 
    sc.pp.calculate_qc_metrics(sampleHalf1, percent_top=None, log1p=False, inplace=True)
    
    # Re-Clustering Sample 2 
    # PCA
    sc.tl.pca(sampleHalf2, svd_solver='arpack')

    # Neighbors and UMAP Computations
    sc.pp.neighbors(sampleHalf2, n_neighbors=50, n_pcs =50, use_rep = "Scanorama")
    sc.tl.umap(sampleHalf2) 

    # Re-calculating cluster numbers 
    sc.tl.leiden(sampleHalf2, resolution = 0.3, key_added = "leiden_0.3")
    
    # Calculates nuclei and gene information specific to the sample object 
    sc.pp.calculate_qc_metrics(sampleHalf2, percent_top=None, log1p=False, inplace=True)
    
    # Saving each sample half as an individual AnnData Object 
    sampleHalf1.write_h5ad(f'F:/SampleData/SampleHalves/{sampleName}_Half1_Seed{seedUsed}.h5ad')
    sampleHalf2.write_h5ad(f'F:/SampleData/SampleHalves/{sampleName}_Half2_Seed{seedUsed}.h5ad')

# Conatenated Samples

In [10]:
'''CREATING ANNDATA OBJECT FILES FOR CONCATENATED BIOLOGICAL REPLICATE SAMPLES'''

creatingConcatSampleFiles(bioRepComparisonLst, 'BiologicalReplicaSamples')

In [15]:
'''CREATING ANNDATA OBJECT FILES FOR CONCATENATED PLASTICITY COMPARISONS'''

creatingConcatSampleFiles(plastComparLst, 'PlasticityComparisons')

In [12]:
'''CREATING ANNDATA OBJECT FILES FOR CONCATENATED HEMISPHERIC COMPARISONS'''

creatingConcatSampleFiles(hemComparLst, 'HemisphericComparisons')