# Assembling an anndata object
**This notebook assembles a new AnnData object that will be used in the velocity analysis.**

# A: loading packages

In [1]:
import os
from os import path
import scanpy as sc
import pandas as pd
from scipy import sparse
from scipy.io import mmread
import anndata
from anndata import AnnData

# B: set up

In [2]:
# ----------------------------- Test identifier -----------------------------
# Used to name the results sub-directory and the output files.
test = "Test1"

# --------------------------------- Inputs ----------------------------------
# Directory where the preprocessed data is located.
path_imput = "/mnt/workspace/gvalent/5_SC_automatation/data1/raw_data"

# Sample descriptions pasted from 10X.yml. Each entry is split on ":" further
# down: field 0 is the sample directory name, field 1 the condition column
# name and field 2 the condition value of that sample.
the_10X_yml = [
    "sample1:condition:room_air",
    "sample2:condition:room_air",
    "sample3:condition:smooke_exposure",
    "sample4:condition:smooke_exposure",
]

# Which Solo dataset to use: 'raw' or 'filtered'.
data_type = "filtered"

# C: checking and creating output directory

In [3]:
def error_message(DIRE):
    """Print a notice saying that the path ``DIRE`` is wrong or was not found.

    Parameters
    ----------
    DIRE : str
        The path to mention in the notice.
    """
    notice = "".join(("The ", DIRE, "\nis wrong or not found.\n"))
    print(notice)

def dir_creator(DIREC):
    """Announce and then create the directory ``DIREC``.

    Only the last path component is created (``os.mkdir``), so the parent
    directory must already exist and ``DIREC`` itself must not.

    Parameters
    ----------
    DIREC : str
        Path of the directory to create.
    """
    announcement = "Creating the " + DIREC + " directory."
    print(announcement)
    os.mkdir(DIREC)
##################################

#Check that the input directory is set correctly and that the folders needed to assemble the anndata object for the velocity analysis exist
def checker_path(SAMPLE, GENVELO):
    """Tell whether the Solo output folder of one sample exists.

    The folder checked is
    ``<path_imput>/quant/<SAMPLE>/solo/<GENVELO>/<data_type>``, where
    ``path_imput`` and ``data_type`` are read from the module-level settings.

    Parameters
    ----------
    SAMPLE : str
        Sample directory name.
    GENVELO : str
        Solo sub-directory name (called with "Gene" and "Velocyto").

    Returns
    -------
    bool
        Result of ``path.exists`` on the assembled folder path.
    """
    sample_root = path_imput + "/quant/" + SAMPLE
    solo_folder = sample_root + "/solo/" + GENVELO + "/" + data_type
    return path.exists(solo_folder)

def listing(DIR):
    """Add the entries of directory ``DIR`` to the module-level ``list_words``.

    Entries already present in ``list_words`` are skipped, so the list never
    gains duplicates. The list is modified in place; nothing is returned.

    Parameters
    ----------
    DIR : str
        Directory whose entry names are collected.
    """
    for entry in os.listdir(DIR):
        if entry in list_words:
            continue
        list_words.append(entry)

#Files that must be present across the Gene and Velocyto directories of every sample
list_data=["barcodes.tsv", "genes.tsv", "matrix.mtx", "spliced.mtx", "unspliced.mtx", "ambiguous.mtx"]
if path.exists(path_imput): #Check if the main directory exists
    for a in the_10X_yml: #Checking the presence of all sample directories
        list_words=[] #Reset for each sample; filled in place by listing()
        list_data_not_found=[]
        sample=a.split(":")[0]
        gene_dir=path_imput + "/quant/" + sample + "/solo/Gene/" + data_type
        velocyto_dir=path_imput + "/quant/" + sample + "/solo/Velocyto/" + data_type
        if checker_path(sample, "Gene") and checker_path(sample, "Velocyto"): #Both quant/sampleX/solo/Gene and .../Velocyto must exist
            listing(gene_dir)
            listing(velocyto_dir)
            #Checking if Gene and Velocyto data_type have the proper data (.tsv and .mtx)
            list_data_not_found=[c for c in list_data if c not in list_words]
            if list_data_not_found:
                print("These " + str(list_data_not_found) + " data is missing in Gene or Velocyto directories.")
            else:
                #Only set the final path once the files were really found.
                #Previously a sample with a missing directory also reached this
                #assignment, because its empty "not found" list looked like a pass.
                path_data_vel=path_imput + "/quant" #Final path where sampleX/solo is located
        else:
            error_message(gene_dir)
            print("OR\n")
            error_message(velocyto_dir)
else:
    error_message(path_imput)

#Checking whether the results dir exist
path1=path_imput + "/results"
path2=path_imput + "/results/" + test
if path.exists(path1): #Check if result dir exist
    if path.exists(path2):  #Check if result/Test* dir exist
        print("The " + path2 + " directory is read.")
        path_results=path2
    else:
        dir_creator(path2)
else:
    dir_creator(path1)
    if path.exists(path2):  #Check if result/Test* dir exist
        print("The " + path2 + " directory is read.")
        path_results=path2
    else:
        dir_creator(path2)

Creating the /mnt/workspace/gvalent/5_SC_automatation/data1/raw_data/results directory.
Creating the /mnt/workspace/gvalent/5_SC_automatation/data1/raw_data/results/Test1 directory.


# D: assembling anndata

In [4]:
#Create (or only load) the anndata object for the velocity analysis.
#The loader below was provided by Philipp. See also: https://github.com/alexdobin/STAR/issues/774
#buildvdata() loads the data of a single sample.
print("Creating the anndata object for velocyte analysis.")
def buildvdata(SAMPLE):
    """Load the Solo output of one sample into an AnnData object.

    Files are read from ``<path_data_vel>/<SAMPLE>/solo``:

    * ``Gene/<data_type>/matrix.mtx`` is read with ``sc.read_mtx`` and its
      transposed matrix becomes ``X``.
    * ``Velocyto/<data_type>/barcodes.tsv`` and ``genes.tsv`` become ``obs``
      and ``var``.
    * ``Velocyto/<data_type>/{spliced,unspliced,ambiguous}.mtx`` are read,
      converted to CSR, transposed and stored as layers of the same names.

    Parameters
    ----------
    SAMPLE : str
        Sample directory name.

    Returns
    -------
    AnnData
        A copy of the assembled object, after ``var_names_make_unique()``.
    """
    global conditions_name
    velocyto_dir = path_data_vel + "/" + SAMPLE + "/solo/Velocyto/" + data_type
    gene_dir = path_data_vel + "/" + SAMPLE + "/solo/Gene/" + data_type

    def _read_layer(file_name):
        # Matrix Market file -> CSR matrix, transposed the same way as X.
        return sparse.csr_matrix(mmread(velocyto_dir + file_name)).transpose()

    print("\t\tLoading matrix to compose the X object: " + SAMPLE)
    counts = sc.read_mtx(gene_dir + '/matrix.mtx')
    X = counts.X.transpose()

    print("\t\tLoading genes and cells identifiers to make the obs object: " + SAMPLE)
    obs = pd.read_csv(velocyto_dir + '/barcodes.tsv', header = None, index_col = 0)
    obs.index.name = None #Remove index column name to make it compliant with the anndata format
    # NOTE(review): this appends the suffix to the columns of obs, not to its
    # index; if barcodes.tsv has a single column there is nothing to modify.
    # Confirm the intent.
    condition_suffix = str(''.join(conditions_name))
    obs = obs + '-' + condition_suffix

    print("\t\tLoading the gene features to make the var object: " + SAMPLE)
    var = pd.read_csv(velocyto_dir + '/genes.tsv', sep='\t', names = ('gene_ids', 'feature_types'), index_col = 1)

    print("\t\tLoading spliced, unspliced and ambigous matrix to compose the X object: " + SAMPLE)
    velocity_layers = {
        'spliced': _read_layer('/spliced.mtx'),
        'unspliced': _read_layer('/unspliced.mtx'),
        'ambiguous': _read_layer('/ambiguous.mtx'),
    }

    print("\t\tCreating partial anndata object: " + SAMPLE)
    sample_adata = anndata.AnnData(X = X, obs = obs, var = var, layers = velocity_layers)
    sample_adata.var_names_make_unique()
    return sample_adata.copy()

#Run buildvdata() on every sample listed in the_10X_yml
adata_list=[]
conditions_name=[]      #Distinct condition column names, in order of first appearance
dict_rename_samples={}  #Sample position (as a string) -> condition value
timer=0                 #Number of samples processed so far
for a in the_10X_yml:
    progress="Runing sample number " + str(timer + 1) + " out of " + str(len(the_10X_yml)) + " samples."
    print(progress)
    fields=a.split(":")
    sample=fields[0]
    condition=fields[1]
    condition_description=fields[2]
    dict_rename_samples[str(timer)]=condition_description
    if condition not in conditions_name:
        conditions_name.append(condition)
    adata_list.append(buildvdata(sample))
    timer+=1

#Creating the final anndata and saving
print("Creating the final anndata object.")
print("\t\tConcatenating objects.")
adata = adata_list[0].concatenate(adata_list[1:])
print("\t\tRenaming batches.")
#Assign the renamed column back instead of calling replace(inplace=True) on the
#selected column: under pandas Copy-on-Write an in-place change on
#adata.obs["batch"] is not propagated to adata.obs.
#The labels are converted to plain strings first, because several batches can
#map to the same condition value.
adata.obs["batch"] = adata.obs["batch"].astype(str).replace(dict_rename_samples)
adata.obs = adata.obs.rename(columns = {"batch": ''.join(conditions_name)})
print("\t\tSaving and loading.")
adata_output= path2 + "/anndata_1_" + test +".h5ad"
adata.write(filename=adata_output) #Saving the adata file with the spliced, unspliced and ambiguous counts

#Loading adata file and printing num cells and num genes
print("Loading the anndata for velocity and storing as an adata variable.")
adata = sc.read_h5ad(filename=adata_output)
display(adata)

Creating the anndata object for velocyte analysis.
Runing sample number 1 out of 4 samples.
		Loading matrix to compose the X object: sample1
		Loading genes and cells identifiers to make the obs object: sample1
		Loading the gene features to make the var object: sample1
		Loading spliced, unspliced and ambigous matrix to compose the X object: sample1


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


		Creating partial anndata object: sample1
Runing sample number 2 out of 4 samples.
		Loading matrix to compose the X object: sample2
		Loading genes and cells identifiers to make the obs object: sample2
		Loading the gene features to make the var object: sample2
		Loading spliced, unspliced and ambigous matrix to compose the X object: sample2


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


		Creating partial anndata object: sample2
Runing sample number 3 out of 4 samples.
		Loading matrix to compose the X object: sample3
		Loading genes and cells identifiers to make the obs object: sample3
		Loading the gene features to make the var object: sample3
		Loading spliced, unspliced and ambigous matrix to compose the X object: sample3


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


		Creating partial anndata object: sample3
Runing sample number 4 out of 4 samples.
		Loading matrix to compose the X object: sample4
		Loading genes and cells identifiers to make the obs object: sample4
		Loading the gene features to make the var object: sample4
		Loading spliced, unspliced and ambigous matrix to compose the X object: sample4


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


		Creating partial anndata object: sample4
Creating the final anndata object.
		Concatenating objects.


... storing 'condition' as categorical
... storing 'feature_types' as categorical


		Renaming batches.
		Saving and loading.
Loading the anndata for velocity and storing as an adata variable.


AnnData object with n_obs × n_vars = 40377 × 55359
    obs: 'condition'
    var: 'gene_ids', 'feature_types'
    layers: 'ambiguous', 'spliced', 'unspliced'

# E: saving information

In [5]:
#Write a summary of this step to <path2>/information.txt.
#The original branches opened the file with "a" when it was missing and "w"
#when it existed. Both leave a file holding only this summary, so one "w"
#write is equivalent. The with-block also closes the file handle, which the
#original never did.
displayoutput=str(adata)
information=(
    "Test:" + test
    + "\nPath:" + path2
    + "\n##########\n"
    + "1_assembling_anndata\n" "\nSample_description:\n"
    + '\n'.join(the_10X_yml)
    + "\nOutput_1:anndata_1_" + test + ".h5ad"
    + "\nAnndata:\n" + displayoutput
)
with open(path2 + "/information.txt", "w") as information_file:
    print(information, file=information_file)