# CONCATENATING THE DATA |   Apr 9th 2024

In [79]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
import glob

# 1. Creating dataframe with file and folder names

In [80]:

# 1. Glob patterns locating the raw 10x folders and the cleaned .h5ad files
folders_path = "/group/testa/Project/CBOProgPools/data/*"
files_path = "/group/testa/Project/CBOProgPools/h5ad/*.h5ad"


def list_basenames(pattern):
    """Return the sorted base names of every path matching the glob `pattern`.

    Returns an empty list when nothing matches.
    """
    return sorted(os.path.basename(path) for path in glob.glob(pattern))


def build_sample_table(folder_names, file_names):
    """Pair raw-data folder names with cleaned-file names in one lookup table.

    Parameters
    ----------
    folder_names : list of str
        Base names of the raw-data folders.
    file_names : list of str
        Base names of the cleaned .h5ad files.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'Folder' (data1, data2, ...), 'File'
        (file1.h5ad, file2.h5ad, ...), 'Old_Folder' and 'Old_File'
        (the names as passed in).

    Raises
    ------
    ValueError
        If the two lists differ in length, since rows could not be paired.

    Notes
    -----
    Row k pairs folder_names[k] with file_names[k]; nothing checks that the
    two names belong to the same sample.
    NOTE(review): confirm that folders and files sort into the same order.
    """
    if len(folder_names) != len(file_names):
        raise ValueError(
            f"Cannot pair {len(folder_names)} folders with "
            f"{len(file_names)} files: the counts must match."
        )

    return pd.DataFrame({
        'Folder': [f"data{n}" for n in range(1, len(folder_names) + 1)],
        'File': [f"file{n}.h5ad" for n in range(1, len(file_names) + 1)],
        'Old_Folder': folder_names,
        'Old_File': file_names,
    })


# 2. Names found on disk (sorted)
folders = list_basenames(folders_path)
files = list_basenames(files_path)

# 3. Lookup table used by the loading cells below
df = build_sample_table(folders, files)
df


Unnamed: 0,Folder,File,Old_Folder,Old_File
0,data1,file1.h5ad,CS14_3_cortex,CS14_01_20240208_v1.h5ad
1,data2,file2.h5ad,CS14_cortex,CS14_02_20240208_v1.h5ad
2,data3,file3.h5ad,CS15_2cortex,CS15_01_20240208_v1.h5ad
3,data4,file4.h5ad,CS19_cortex,CS19_01_20240208_v1.h5ad
4,data5,file5.h5ad,CS22_2_PFC,CS22_01_20240208_v1.h5ad
5,data6,file6.h5ad,CS22_PFC,CS22_02_20240208_v1.h5ad
6,data7,file7.h5ad,GW15_PFC_CP,GW15_01_20240208_v1.h5ad
7,data8,file8.h5ad,GW15_PFC_VZ,GW15_02_20240208_v1.h5ad
8,data9,file9.h5ad,GW15_wholePFC,GW15_03_20240208_v1.h5ad
9,data10,file10.h5ad,GW16_PFC,GW16_01_20240208_v1.h5ad


In [3]:
# 4. Cleaning the memory: force a full garbage-collection pass.
import gc  # garbage collector
# gc.collect() returns the number of unreachable objects it found; that
# number is what the cell displays.
gc.collect()

0

# 2. Dataframes CLEANED DATA 

In [27]:
# Loaded AnnData objects, one per row of the lookup table, in table order.
data_frames = []

# `label` is the row's index label in `df`; `h5ad_name` is its 'Old_File' value.
for label, h5ad_name in df['Old_File'].items():
    h5ad_path = os.path.join("/group/testa/Project/CBOProgPools/h5ad/", h5ad_name)
    adata = sc.read_h5ad(h5ad_path)

    # Expose the object under a generated global name (df1, df2, ...) and
    # also keep it in the list; both refer to the same object.
    globals()[f"df{label+1}"] = adata
    data_frames.append(adata)

    # Removes only the loop-local name; the object itself stays referenced
    # by the global variable and the list above.
    del adata

# Show the summary of the first loaded dataset.
print(data_frames[0])


AnnData object with n_obs × n_vars = 7927 × 16708
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'phase', 'doublet_score', 'predicted_doublet', 'leiden', 'celltype', 'celltype2', 'stage'
    var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable'
    uns: 'celltype2_colors', 'celltype_colors', 'dendrogram_leiden', 'diffmap_evals', 'leiden', 'leiden_colors', 'leiden_sizes', 'neighbors', 'paga', 'pca', 'phase_colors', 'rank_genes_groups', 'scrublet', 'triku_params', 'umap'
    obsm: 'X_diffmap', 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'Raw', 'logNorm', 'normalized'
    obsp: 'connectivities', 'distances'


# 3. Dataframes RAW DATA 

In [28]:
# Raw AnnData objects, one per row of the lookup table, in table order.
raw_dataframes = []

# `label` is the row's index label in `df`; `folder_name` is its 'Old_Folder' value.
for label, folder_name in df['Old_Folder'].items():
    # Read the raw data folder for this sample.
    raw_dir = os.path.join("/group/testa/Project/CBOProgPools/data/", folder_name)
    adata_raw = sc.read_10x_mtx(raw_dir)

    # Expose the object under a generated global name (df1_raw, df2_raw, ...)
    # and also keep it in the list; both refer to the same object.
    globals()[f"df{label+1}_raw"] = adata_raw
    raw_dataframes.append(adata_raw)

    # Removes only the loop-local name; the object stays referenced above.
    del adata_raw




AnnData object with n_obs × n_vars = 7927 × 33694
    var: 'gene_ids'


In [29]:
# Print the summary of every object stored in `raw_dataframes`.
print(raw_dataframes)

[AnnData object with n_obs × n_vars = 7927 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 1419 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 9945 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 1349 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 621 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 1945 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 68902 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 68344 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 69512 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 12445 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 12429 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 11968 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 15131 × 33694
    var: 'gene_ids', AnnData object with n_obs × n_vars = 14041 × 33694
    var: 'gene_ids'

# 4. Extract the full `.obs` table of each of the 31 datasets (df1, df2, ... df31) into tmp_1, tmp_2, ... tmp_31

In [7]:
# Expose the `.obs` table of every loaded dataset as tmp_1, tmp_2, ...
#
# The loop walks `data_frames` (the same objects that are bound to df1, df2, ...)
# instead of a hard-coded range(1, 32), so it follows however many datasets
# were actually loaded. The loop variable is deliberately not called `df`:
# that name holds the folder/file lookup table and must not be rebound or deleted.
for i, adata in enumerate(data_frames, start=1):
    # `.obs` is referenced, not copied: tmp_{i} and the AnnData object share it.
    globals()[f"tmp_{i}"] = adata.obs

#print(tmp_1)


# 5. Keeping only 3 obs columns (phase, stage, celltype2) and saving them to one .csv per dataset

In [11]:

# Rebuild the folder/file lookup table so that this section does not depend
# on whatever `df` was left holding by earlier cells.

# Glob patterns for the raw-data folders and the cleaned .h5ad files.
folders_path = "/group/testa/Project/CBOProgPools/data/*"
files_path = "/group/testa/Project/CBOProgPools/h5ad/*.h5ad"

# Sorted base names of everything the two patterns match.
folders = sorted(map(os.path.basename, glob.glob(folders_path)))
files = sorted(map(os.path.basename, glob.glob(files_path)))

# Original names go to Old_Folder / Old_File; Folder / File are then replaced
# by sequential aliases (data1, data2, ... / file1.h5ad, file2.h5ad, ...).
df = pd.DataFrame({'Folder': folders, 'File': files})
df = df.assign(Old_Folder=df['Folder'], Old_File=df['File'])
df['Folder'] = [f"data{n}" for n in range(1, len(folders) + 1)]
df['File'] = [f"file{n}.h5ad" for n in range(1, len(files) + 1)]



In [12]:

# Write one CSV per dataset holding three `.obs` columns.
#
# Output name: <sample prefix>_<date>_Obs.csv, written to the current
# working directory.

# Date tag embedded in every output file name (constant, so set once).
date = '20240416'

for i, row in df.iterrows():
    # Sample prefix = file name minus its last two "_"-separated parts,
    # e.g. "CS14_01_20240208_v1.h5ad" -> "CS14_01". Cutting from the right
    # avoids truncating the prefix when the sample index itself starts
    # with "2" (which splitting on "_2" would do).
    file_name = row['Old_File'].rsplit('_', 2)[0]
    base_file_name = f"{file_name}_{date}_Obs"
    csv_file_name = f"{base_file_name}.csv"

    # Fetch the AnnData object previously stored as df1, df2, ...
    # It is bound to `adata`, not `df`, so the lookup table being iterated
    # is neither rebound nor deleted.
    adata = globals()[f"df{i+1}"]

    # Keep only the three observation columns of interest.
    obs_data = adata.obs[['phase', 'stage', 'celltype2']]

    # The obs index is written as a column named 'index'.
    obs_data.to_csv(csv_file_name, index_label='index')


In [23]:
# Reload one exported CSV and preview its first 10 rows.
# The read is done here (not left commented out) so the cell also works on a
# freshly started kernel, where `example` would otherwise be undefined.
example = pd.read_csv("CS14_01_20240416_Obs.csv")
example.head(10)

Unnamed: 0,index,phase,stage,celltype2
0,AAACCTGAGACCGGAT-1,G1,CS14,MigExc
1,AAACCTGAGATAGCAT-1,G1,CS14,MatExc
2,AAACCTGAGTAGCGGT-1,G1,CS14,oRG
3,AAACCTGAGTCACGCC-1,G1,CS14,MigExc
4,AAACCTGAGTCCCACG-1,G1,CS14,MigExc
5,AAACCTGCAACGATCT-1,G2M,CS14,MigExc
6,AAACCTGCACACTGCG-1,S,CS14,vRG
7,AAACCTGCACTTGGAT-1,S,CS14,vRG
8,AAACCTGCAGACTCGC-1,G2M,CS14,oRG
9,AAACCTGGTAAGAGAG-1,S,CS14,MigExc


# Inspecting one cleaned / raw dataset pair

In [31]:
# Display the second entry (list index 1) of `data_frames`.
data_frames[1]

AnnData object with n_obs × n_vars = 1376 × 11494
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'phase', 'doublet_score', 'predicted_doublet', 'leiden', 'celltype', 'celltype2', 'stage'
    var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable'
    uns: 'celltype2_colors', 'celltype_colors', 'dendrogram_leiden', 'diffmap_evals', 'leiden', 'leiden_colors', 'leiden_sizes', 'neighbors', 'paga', 'pca', 'phase_colors', 'rank_genes_groups', 'scrublet', 'triku_params', 'umap'
    obsm: 'X_diffmap', 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'Raw', 'logNorm', 'normalized'
    obsp: 'connectivities', 'distances'

In [32]:
# Display the second entry (list index 1) of `raw_dataframes`.
raw_dataframes[1]

AnnData object with n_obs × n_vars = 1419 × 33694
    var: 'gene_ids'

In [None]:
# NOTE(review): `all_obs` is not defined anywhere in the visible notebook and
# this cell has no execution count. Define `all_obs` before this cell or
# remove the cell; as written it presumably raises NameError on a fresh kernel.
all_obs[1]

# SINGLE LOOP TO DO EVERYTHING!!!

In [81]:
# Folder/file lookup table built from the directory listing.
# NOTE: the following cell reassigns `df` from FolderAnnData.csv, so the
# table built here is superseded before the processing loop runs.

# Glob patterns for the raw-data folders and the cleaned .h5ad files.
folders_path = "/group/testa/Project/CBOProgPools/data/*"
files_path = "/group/testa/Project/CBOProgPools/h5ad/*.h5ad"

# Base names of the matches, sorted in place.
folders = [os.path.basename(path) for path in glob.glob(folders_path)]
folders.sort()
files = [os.path.basename(path) for path in glob.glob(files_path)]
files.sort()

# Keep the original names in Old_Folder / Old_File, then overwrite
# Folder / File with sequential aliases.
df = pd.DataFrame({'Folder': folders, 'File': files})
df['Old_Folder'] = df['Folder']
df['Old_File'] = df['File']
df['Folder'] = ['data%d' % n for n in range(1, len(folders) + 1)]
df['File'] = ['file%d.h5ad' % n for n in range(1, len(files) + 1)]


In [82]:
# Load the folder/file pairing from a CSV that has no header row. With
# header=None the columns are labelled 0, 1, ...; the loop below reads
# column 0 as the raw-data folder name and column 1 as the .h5ad file name.
df = pd.read_csv("/group/testa/Project/CBOProgPools/FolderAnnData.csv",header=None)

In [83]:
# Disabled: uncomment to restrict the run to the rows at positions 4-9.
#df = df[4:10]

In [84]:
# For every sample: load the cleaned .h5ad and the raw 10x matrix, keep only
# the raw cells that are present in the cleaned object, attach three obs
# columns from the cleaned object, and write the result as
# <sample prefix>_<date>_raw_filtered.h5ad in the current working directory.

# Date tag embedded in every output file name (constant, so set once).
date = '20240416'

# Observation columns copied from the cleaned object onto the raw cells.
obs_columns = ['phase', 'stage', 'celltype2']

for i, row in df.iterrows():
    # Column 1 of the table holds the cleaned .h5ad file name.
    file_path = os.path.join("/group/testa/Project/CBOProgPools/h5ad/", row[1])

    # Read the .h5ad file
    df_temp = sc.read_h5ad(file_path)

    # Column 0 of the table holds the raw-data folder name.
    folder_path = os.path.join("/group/testa/Project/CBOProgPools/data/", row[0])
    print(row[1])  # progress: name of the file being processed

    # Read the raw data folder
    df_temp_raw = sc.read_10x_mtx(folder_path)

    # Keep only the raw cells whose barcode also appears in the cleaned object.
    keep = df_temp_raw.obs_names.isin(df_temp.obs_names)
    df_temp_raw_filtered = df_temp_raw[keep].copy()

    # Every cleaned cell must have been found in the raw matrix; otherwise
    # the metadata cannot be attached one-to-one.
    if df_temp_raw_filtered.n_obs != df_temp.n_obs:
        raise ValueError(
            f"{row[1]}: {df_temp.n_obs} cells in the cleaned object but "
            f"{df_temp_raw_filtered.n_obs} matching cells in raw folder {row[0]}"
        )

    # Select the metadata rows BY BARCODE, in the row order of the filtered
    # raw object. Assigning `.obs` does not align on the index, so passing
    # the cleaned table as-is would attach metadata to the wrong cells
    # whenever the two objects list their cells in a different order.
    df_temp_raw_filtered.obs = df_temp.obs.loc[
        df_temp_raw_filtered.obs_names, obs_columns
    ].copy()

    # Sample prefix = file name minus its last two "_"-separated parts,
    # e.g. "CS14_01_20240208_v1.h5ad" -> "CS14_01". Cutting from the right
    # avoids truncating the prefix when the sample index itself starts
    # with "2" (which splitting on "_2" would do).
    file_name = row[1].rsplit('_', 2)[0]

    # Create a base file name
    base_file_name = f"{file_name}_{date}_raw_filtered"

    h5ad_file_name = f"{base_file_name}.h5ad"
    df_temp_raw_filtered.write_h5ad(h5ad_file_name)
    
    

CS14_01_20240208_v1.h5ad
CS14_02_20240208_v1.h5ad
CS15_01_20240208_v1.h5ad
CS19_01_20240208_v1.h5ad
CS22_01_20240208_v1.h5ad
CS22_02_20240208_v1.h5ad
GW15_01_20240208_v1.h5ad
GW15_02_20240208_v1.h5ad
GW15_03_20240208_v1.h5ad
GW16_01_20240208_v1.h5ad
GW16_02_20240209_v1.h5ad
GW17_01_20240209_v1.h5ad
GW18_01_20240209_v1.h5ad
GW19_01_20240209_v1.h5ad
GW19_02_20240209_v1.h5ad
GW19_03_20240209_v1.h5ad
GW20_01_20240209_v1.h5ad
GW20_02_20240209_v1.h5ad
GW20_03_20240209_v1.h5ad
GW20_04_20240215_v1.h5ad
GW22_01_20240215_v1.h5ad
GW22_02_20240215_v1.h5ad
GW22_03_20240215_v1.h5ad
GW22_04_20240215_v1.h5ad
GW22_05_20240215_v1.h5ad
GW22_06_20240215_v1.h5ad
GW25_01_20240216_v1.h5ad
GW25_02_20240216_v1.h5ad
GW25_03_20240216_v1.h5ad
GW25_04_20240216_v1.h5ad
GW25_05_20240216_v1.h5ad
