In [10]:
#Author: Neda R. Mehdiabadi
#Date: 06.01.2024
#Convert h5 to filtered loom
import h5py
import os 
import numpy as np
import loompy as lp
from scipy import sparse
import scanpy as sc
import collections
import pandas as pd

In [11]:
#read metadata csv files
current_dir = os.getcwd()
print(f"The current working directory is: {current_dir}")
loom_col_attr_name = pd.read_csv('../DCM-Foundation-Model-data/Endo_inference/loom_col_attr_name.csv')
loom_col_attr_name = loom_col_attr_name.fillna("NaN") #Replace NaN values with the string 'NaN' for all columns
loom_col_attr_name_dict = loom_col_attr_name.to_dict(orient='records')
print(loom_col_attr_name_dict[2])
print(len(loom_col_attr_name_dict))  #165
datasets_metadata = pd.read_csv('../DCM-Foundation-Model-data/Endo_inference/Endo_metadata_pediatricPlusAdult_Geneformer.csv',dtype={7: str})
print(datasets_metadata.head())
print(len(datasets_metadata)) #361913
ann = pd.read_csv('../DCM-Foundation-Model-data/Endo_inference/ann.csv')  #ann.shape: (21684, 5)
print(ann.head())

The current working directory is: /data/mcri_heartv2/mcri_nedar/DCM-Foundation-Model
{'file_name': 'Seidman_BO_H39_LVW_Adult_DCM', 'individual': 'BO_H39_LVW', 'disease': 'DCM', 'sex': 'Male', 'age': 60, 'dataset': 'Seidman', 'group': 'Adult', 'genotype': 'TTN', 'cell_type': 'Endo', 'dir': '/data/mcri_heartv2/mcri_nedar/HeartV2_files/cellRanger_cellBender/Seidman/Seidman_BO_H39_LVW_Adult_DCM/cellbender_output_filtered.h5'}
61
        endo_barcodes                  sample dataset condition  \
0  CGGCTAGCAAGGTTCT-1  Lavine_H_ZC-LVAD-1_DCM  Lavine       DCM   
1  AAGACCTTCCTATTCA-1  Lavine_H_ZC-LVAD-1_DCM  Lavine       DCM   
2  CCTTCGACAGATAATG-1  Lavine_H_ZC-LVAD-1_DCM  Lavine       DCM   
3  CAAGGCCTCCAAGCCG-1  Lavine_H_ZC-LVAD-1_DCM  Lavine       DCM   
4  GTCAAGTGTTGTTTGG-1  Lavine_H_ZC-LVAD-1_DCM  Lavine       DCM   

   harmony_clusters    cell_type            patient               file_name  
0                 6  Endothelial  Lavine_H_ZCAD_DCM  Lavine_H_ZC-LVAD-1_DCM  
1           

In [12]:
# Remove non-informative and X|Y genes from ann
mito_indices = ann[ann['name'].str.contains("mitochondrial", case=False, na=False)].index.tolist()
print("Number of mitochondrial entries:", len(mito_indices))
ribo_indices = ann[ann['name'].str.contains("ribosomal", case=False, na=False)].index.tolist()
print("Number of ribosomal entries:", len(ribo_indices))
missing_ezid_indices = ann[ann['entrez_id'].isna()].index.tolist()
print("Number of missing ENTREZID entries:", len(missing_ezid_indices))
missing_ensemblid_indices = ann[ann['ensemblid'].isna()].index.tolist()
print("Number of missing ensembl entries:", len(missing_ensemblid_indices))
MALAT1_indice = ann[ann['symbol'].str.contains("MALAT1", case=False, na=False)].index.tolist()
print("Number of MALAT1 entries:", len(MALAT1_indice))
xy_indices = ann[ann['chr'].str.contains("X|Y", case=False, na=False)].index.tolist()
print("Number of X|Y entries:", len(missing_ezid_indices))
combined_indices = set(mito_indices + ribo_indices + missing_ezid_indices + missing_ensemblid_indices + MALAT1_indice + xy_indices)
print("Number of non-informative and X|Y combined entries:", len(combined_indices))
'''
Number of mitochondrial entries: 223
Number of ribosomal entries: 196
Number of missing ENTREZID entries: 5276
Number of missing ENSEMBL entries: 5830
Number of MALAT1 entries: 1
Number of X|Y entries: 5276
Number of non-informative and X|Y combined entries: 6757
'''
print(ann.shape) #(22616, 5)
ann_filtered = ann.drop(index=combined_indices)
print(ann_filtered.shape) #(15859, 5)
ann_filtered_reset = ann_filtered.reset_index(drop=True)
ensemblid_dict = ann_filtered_reset['symbol'].to_dict()

Number of mitochondrial entries: 223
Number of ribosomal entries: 196
Number of missing ENTREZID entries: 5276
Number of missing ensembl entries: 5830
Number of MALAT1 entries: 1
Number of X|Y entries: 5276
Number of non-informative and X|Y combined entries: 6757
(22616, 5)
(15859, 5)


In [13]:
print(loom_col_attr_name_dict[0])
print(loom_col_attr_name_dict[0]["individual"])
print(loom_col_attr_name_dict[0]["dir"])
print(datasets_metadata["file_name"])

{'file_name': 'Seidman_HCAHeart8287124_Adult_Control', 'individual': 'HCAHeart8287124', 'disease': 'Control', 'sex': 'NaN', 'age': 50, 'dataset': 'Seidman', 'group': 'Adult', 'genotype': 'NaN', 'cell_type': 'Endo', 'dir': '/data/mcri_heartv2/mcri_nedar/HeartV2_files/cellRanger_cellBender/Seidman/Seidman_HCAHeart8287124_Adult_Control/cellbender_output_filtered.h5'}
HCAHeart8287124
/data/mcri_heartv2/mcri_nedar/HeartV2_files/cellRanger_cellBender/Seidman/Seidman_HCAHeart8287124_Adult_Control/cellbender_output_filtered.h5
0               Lavine_H_ZC-LVAD-1_DCM
1               Lavine_H_ZC-LVAD-1_DCM
2               Lavine_H_ZC-LVAD-1_DCM
3               Lavine_H_ZC-LVAD-1_DCM
4               Lavine_H_ZC-LVAD-1_DCM
                      ...             
203841    Seidman_IC_H02_LV0_Young_DCM
203842    Seidman_IC_H02_LV0_Young_DCM
203843    Seidman_IC_H02_LV0_Young_DCM
203844    Seidman_IC_H02_LV0_Young_DCM
203845    Seidman_IC_H02_LV0_Young_DCM
Name: file_name, Length: 203846, dtype: object

In [14]:
FeatureBCMatrix = collections.namedtuple('FeatureBCMatrix', ['ensembl_id', 'feature_names', 'feature_type','feature_genome', 'feature_entrezid', 'feature_desc', 'feature_chr', 'barcodes', 'group', 'individual', 'sex', 'age', 'dataset', 'disease', 'genotype', 'cell_type', 'n_counts', 'genes_per_cell', 'matrix'])

In [15]:
def get_matrix_from_h5(filename, metadata):
    with h5py.File(filename) as f:
        ensembl_id = [x.decode('ascii', 'ignore') for x in f['matrix']['features']['id']]
        feature_names = [x.decode('ascii', 'ignore') for x in f['matrix']['features']['name']]        
        feature_type = [x.decode('ascii', 'ignore') for x in f['matrix']['features']['feature_type']]
        feature_genome = [x.decode('ascii', 'ignore') for x in f['matrix']['features']['genome']]
        feature_entrezid = [''] * f['matrix']['shape'][0]
        feature_desc = [''] * f['matrix']['shape'][0]
        feature_chr = [''] * f['matrix']['shape'][0]
        byte_strings_barcodes = f['matrix']['barcodes']
        barcodes = [bs.decode('utf-8') for bs in byte_strings_barcodes] #removes 'b' which represents byte
        matrix = sparse.csc_matrix((f['matrix']['data'], f['matrix']['indices'], f['matrix']['indptr']), shape=f['matrix']['shape'])
        print(matrix.shape)
        group = [metadata.get('group')] * f['matrix']['shape'][1]
        individual = [metadata.get('individual')] * f['matrix']['shape'][1]
        sex = [metadata.get('sex')] * f['matrix']['shape'][1]
        age = [metadata.get('age')] * f['matrix']['shape'][1]
        dataset = [metadata.get('dataset')] * f['matrix']['shape'][1]
        disease = [metadata.get('disease')] * f['matrix']['shape'][1]
        genotype = [metadata.get('genotype')] * f['matrix']['shape'][1]
        cell_type = [''] * f['matrix']['shape'][1]
        n_counts = np.zeros(f['matrix']['shape'][1]).tolist()
        genes_per_cell = np.zeros(f['matrix']['shape'][1]).tolist()
        
        return FeatureBCMatrix(ensembl_id, feature_names, feature_type, feature_genome, feature_entrezid, feature_desc, feature_chr, barcodes, group, individual, sex, age, dataset, disease, genotype , cell_type, n_counts, genes_per_cell, matrix)
        
def get_expression(fbm, gene_name, barcode):
    try:
        gene_index = fbm.feature_names.index(gene_name)
        barcode_index = fbm.barcodes.index(barcode)
    except ValueError:
        raise Exception("%s was not found in list of gene names." % gene_name)
    #return fbm.matrix[gene_index, barcode_index].toarray().squeeze()
    return fbm.matrix[gene_index, barcode_index].squeeze()

In [16]:
def get_expression_np(fbm, gene_name, barcode):
        gene_index = np.where(fbm.feature_names == gene_name)[0][0]  # Use np.where for NumPy arrays
        print(gene_index)
        barcode_index = fbm.barcodes.index(barcode)
        print(barcode_index)
        expression_value = fbm.matrix[gene_index, barcode_index]
        return expression_value

In [17]:
def get_expression_np2(fbm, gene_name, barcode):
    try:
        gene_index = np.where(fbm.feature_names == gene_name)[0][0]
        print(f"Gene index for {gene_name}: {gene_index}")
        barcode_index = np.where(fbm.barcodes == barcode)[0][0]
        print(f"Barcode index for {barcode}: {barcode_index}")
        expression_value = fbm.matrix[gene_index, barcode_index]
        print(f"Expression value for gene '{gene_name}' and barcode '{barcode}': {expression_value}")
        return expression_value
    except IndexError:
        print(f"Gene '{gene_name}' or barcode '{barcode}' not found in the dataset.")
        return None

In [18]:
for entry in loom_col_attr_name_dict:
    print(entry["dir"])
    feature_bc_matrix = get_matrix_from_h5(entry["dir"],entry)
    #print(feature_bc_matrix.barcodes)
    exp_fbm = get_expression(feature_bc_matrix,"TNNT2",datasets_metadata[datasets_metadata["file_name"] == entry["file_name"]]["endo_barcodes"].iloc[17])
    print(exp_fbm) #14
    
    #Remove non-informative and X|Y genes from feature_bc_matrix
    feature_bc_matrix_symbol = pd.Series(feature_bc_matrix.feature_names)
    def find_index(val):
        return next((i for i, (key, value) in enumerate(feature_bc_matrix_symbol.items()) if value == val), None)
    index = ann_filtered_reset['symbol'].map(find_index)
    valid_indices = index.dropna().astype(int)
    print(len(valid_indices)) #15859
    print(valid_indices[50:61])
    
    #Checking values are matching 
    selected_matrix = feature_bc_matrix.matrix[np.array(valid_indices),:]
    selected_symbols = np.array(feature_bc_matrix.feature_names)[np.array(valid_indices)]
    print(feature_bc_matrix_symbol[valid_indices[valid_indices.index[50]]]) #PLEKHG5
    print(selected_symbols[50]) #PLEKHG5
    print(selected_matrix.shape) #(15859, 13187)
    print(feature_bc_matrix.matrix[[valid_indices[valid_indices.index[50]]],:10].toarray())
    print(selected_matrix[50,:10].toarray())
    print(ann_filtered_reset.iloc[50,])
    
    #Generate updated fbm w/ non-informative and X|Y genes removed
    selected_matrix = feature_bc_matrix.matrix[np.array(valid_indices),:]
    selected_ensembl_ids = np.array(feature_bc_matrix.ensembl_id)[np.array(valid_indices)]
    selected_feature_names = np.array(feature_bc_matrix.feature_names)[np.array(valid_indices)]
    selected_feature_type = np.array(feature_bc_matrix.feature_type)[np.array(valid_indices)]
    selected_feature_genome = np.array(feature_bc_matrix.feature_genome)[np.array(valid_indices)]
    selected_feature_entrezid = ann_filtered_reset.loc[index.dropna().index, 'entrez_id']
    selected_feature_desc = ann_filtered_reset.loc[index.dropna().index, 'name']
    selected_feature_chr = ann_filtered_reset.loc[index.dropna().index, 'chr']

    selected_feature_bc_matrix = FeatureBCMatrix(ensembl_id = selected_ensembl_ids, feature_names = selected_feature_names, feature_type = selected_feature_type, feature_genome = selected_feature_genome, feature_entrezid = selected_feature_entrezid, feature_desc = selected_feature_desc, feature_chr = selected_feature_chr, barcodes = feature_bc_matrix.barcodes, group = feature_bc_matrix.group, individual = feature_bc_matrix.individual, sex = feature_bc_matrix.sex, age = feature_bc_matrix.age, dataset = feature_bc_matrix.dataset, disease = feature_bc_matrix.disease, genotype = feature_bc_matrix.genotype, cell_type = feature_bc_matrix.cell_type, n_counts = feature_bc_matrix.n_counts, genes_per_cell= feature_bc_matrix.genes_per_cell, matrix = selected_matrix)
    
    exp_selectedfbm = get_expression_np(selected_feature_bc_matrix,'TNNT2',datasets_metadata[datasets_metadata["file_name"] == entry["file_name"]]["endo_barcodes"].iloc[17])
    print(exp_selectedfbm)
    print(selected_feature_bc_matrix.matrix[50,:10].toarray()) #[[2 2 0 0 0 0 1 0 0 0]]
    print(selected_feature_bc_matrix.feature_entrezid.iloc[50]) #57449.0
    print(selected_feature_bc_matrix.matrix.shape) #(15859, 5389)
    print(selected_feature_bc_matrix.genotype[1])
    print(selected_feature_bc_matrix.individual[1])
    
    #Assign cells to the individual
    index = datasets_metadata[datasets_metadata['file_name'] == entry["file_name"]].index
    print(len(index))
    print(index)
    print(datasets_metadata.iloc[index]['sample'].value_counts())
    print(datasets_metadata.iloc[index]['cell_type'].value_counts())
    print(datasets_metadata.iloc[index]['endo_barcodes'])
    feature_bc_matrix_bc = pd.Series(selected_feature_bc_matrix.barcodes)
    def find_index1(val):
        return next((i for i, (key, value) in enumerate(feature_bc_matrix_bc.items()) if value == val), None)
    index1 = datasets_metadata.iloc[index]['endo_barcodes'].map(find_index1)
    print(f'len index1 {len(index1.dropna())}')
    valid = index1.dropna().astype(int)
    #valid_reset = valid.reset_index(drop=True)
    print(valid[50:61])
    
    #Checking values are matching 
    up_matrix = selected_feature_bc_matrix.matrix[:,np.array(valid)]
    up_bc = np.array(selected_feature_bc_matrix.barcodes)[np.array(valid)]
    #print(feature_bc_matrix_bc[valid[valid.index[50]]]) #TGTTGGACACTGCACG-1
    #print(up_bc[50]) #TGTTGGACACTGCACG-1
    #print(up_matrix.shape) #(15466, 859)
    #print(selected_feature_bc_matrix.matrix[:10,[valid[valid.index[50]]]].toarray())
    #print(up_matrix[:10,50].toarray())
    
    #assign cells and cell type to the sample
    updated_matrix = selected_feature_bc_matrix.matrix[:,np.array(valid)]
    updated_barcodes = np.array(selected_feature_bc_matrix.barcodes)[np.array(valid)]
    updated_cell_type = datasets_metadata.loc[index, 'cell_type'] 

    #Calculate UMIs and genes per cell
    updated_n_counts = np.asarray(updated_matrix.sum(axis=0)).flatten()
    updated_genes_per_cell = np.asarray((updated_matrix > 0).sum(axis=0)).flatten()

    #Generate updated fbm w/ assigned cells and cell types
    updated_feature_bc_matrix = FeatureBCMatrix(
        ensembl_id=selected_feature_bc_matrix.ensembl_id, 
        feature_names=selected_feature_bc_matrix.feature_names,
        feature_type=selected_feature_bc_matrix.feature_type,
        feature_genome=selected_feature_bc_matrix.feature_genome,
        feature_entrezid=selected_feature_bc_matrix.feature_entrezid,
        feature_desc=selected_feature_bc_matrix.feature_desc,
        feature_chr=selected_feature_bc_matrix.feature_chr,
        barcodes=updated_barcodes,
        group=[entry['group']] * updated_matrix.shape[1],
        individual=[entry['individual']] * updated_matrix.shape[1],
        sex=[entry['sex']] * updated_matrix.shape[1],
        age=[entry['age']] * updated_matrix.shape[1],
        dataset=[entry['dataset']] * updated_matrix.shape[1],
        disease=[entry['disease']] * updated_matrix.shape[1],
        genotype=[entry['genotype']] * updated_matrix.shape[1],
        cell_type=updated_cell_type,
        n_counts=updated_n_counts,
        genes_per_cell=updated_genes_per_cell,
        matrix=updated_matrix
    )
    
    #Checking values are matching
    updated_feature_bc_matrix.matrix.shape #(15466, 859)
    #print([loom_col_attr_name_dict[155]['group']] * updated_matrix.shape[1])
    barcode_index = np.where(updated_feature_bc_matrix.barcodes == datasets_metadata[datasets_metadata["file_name"] == entry["file_name"]]["endo_barcodes"].iloc[17])[0][0]
    print(datasets_metadata[datasets_metadata["file_name"] == entry["file_name"]]["endo_barcodes"].iloc[17])
    gene_updatedfbm = get_expression_np2(updated_feature_bc_matrix,'TNNT2',updated_feature_bc_matrix.barcodes[barcode_index])
    print(updated_feature_bc_matrix.genotype[1])
    print(updated_feature_bc_matrix.individual[1])
    
    #Generate loom file and save it.
    ensembl_id = np.array(updated_feature_bc_matrix.ensembl_id)
    feature_names = np.array(updated_feature_bc_matrix.feature_names)
    feature_type = np.array(updated_feature_bc_matrix.feature_type)
    feature_genome = np.array(updated_feature_bc_matrix.feature_genome)
    feature_entrezid = np.array(updated_feature_bc_matrix.feature_entrezid)
    feature_desc = np.array(updated_feature_bc_matrix.feature_desc)
    feature_chr = np.array(updated_feature_bc_matrix.feature_chr)

    barcodes = np.array(updated_feature_bc_matrix.barcodes)
    group = np.array(updated_feature_bc_matrix.group)
    individual = np.array(updated_feature_bc_matrix.individual)
    sex = np.array(updated_feature_bc_matrix.sex)
    age = np.array(updated_feature_bc_matrix.age)
    dataset = np.array(updated_feature_bc_matrix.dataset)
    disease = np.array(updated_feature_bc_matrix.disease)
    genotype = np.array(updated_feature_bc_matrix.genotype)
    cell_type = np.array(updated_feature_bc_matrix.cell_type)
    n_counts = np.array(updated_feature_bc_matrix.n_counts)
    genes_per_cell = np.array(updated_feature_bc_matrix.genes_per_cell)

     # Now create the loom file

    loom_file_path = f'/data/mcri_heartv2/mcri_nedar/DCM-Foundation-Model-data/loom_file_path_Endo/{list(entry.values())[0]}.loom'
    print(f'/data/mcri_heartv2/mcri_nedar/DCM-Foundation-Model-data/loom_file_path_Endo/{list(entry.values())[0]}.loom')
    lp.create(loom_file_path, updated_feature_bc_matrix.matrix,
              row_attrs={"ensembl_id": ensembl_id, "symbol": feature_names, "type": feature_type,
                         "genome": feature_genome, "entrez_id": feature_entrezid, "description": feature_desc,
                         "chr": feature_chr},
              col_attrs={"barcodes": barcodes, "group": group, "individual": individual, "sex": sex, "age": age,
                         "dataset": dataset, "disease": disease, "genotype": genotype, "cell_type": cell_type,
                         "n_counts": n_counts, "genes_per_cell": genes_per_cell})

/data/mcri_heartv2/mcri_nedar/HeartV2_files/cellRanger_cellBender/Seidman/Seidman_HCAHeart8287124_Adult_Control/cellbender_output_filtered.h5
(36601, 365)
0
15859
50    146
51    147
52    157
53    167
54    168
55    172
56    173
57    176
58    177
59    182
60    183
Name: symbol, dtype: int64
DFFB
DFFB
(15859, 365)
[[0 0 0 0 0 0 0 0 0 0]]
[[0 0 0 0 0 0 0 0 0 0]]
symbol                                        DFFB
entrez_id                                   1677.0
ensemblid                          ENSG00000169598
name         DNA fragmentation factor subunit beta
chr                                              1
Name: 50, dtype: object
1156
295
0
[[0 0 0 0 0 0 0 0 0 0]]
1677.0
(15859, 365)
NaN
HCAHeart8287124
30
Index([59745, 59746, 59747, 59748, 59749, 59750, 59751, 59752, 59753, 59754,
       59755, 59756, 59757, 59758, 59759, 59760, 59761, 59762, 59763, 59764,
       59765, 59766, 59767, 59768, 59769, 59770, 59771, 59772, 59773, 59774],
      dtype='int64')
sample
Seidman_p12_

In [23]:
loom_file_path = f'/data/mcri_heartv2/mcri_nedar/DCM-Foundation-Model-data/loom_file_path/Patrick_LV_1547_1_nf_LV_Control.loom'

with lp.connect(str(loom_file_path)) as data:
    #ds = lp.connect(str(loom_file_path))
    print(data.ra.keys()) #['chr', 'description', 'ensembl_id', 'entrez_id', 'genome', 'symbol', 'type']
    print(data.ra['ensembl_id'].shape) #(15859,)
    print(data.ra['ensembl_id'][:]) #['ENSG00000237491' 'ENSG00000228794' 'ENSG00000223764' ... 'ENSG00000206177' 'ENSG00000169877' 'ENSG00000227674']
    print(data.ca.keys()) #['age', 'barcodes', 'cell_type', 'dataset', 'disease', 'genes_per_cell', 'genotype', 'group', 'individual', 'n_counts', 'sex']
    print(data.ca['barcodes'].shape) #(859,)
    #print(data.ca['barcodes'][:]) #['CCTTCAGGTTAGTCGT-1' 'GACTTCCGTTTACCAG-1' 'TCTAACTCATGTTCAG-1' ... 'TAGACCACAAATGAAC-1' 'TAGGAGGAGTTGTCGT-1' 'GCAACATCACCCAATA-1']
    print(data.shape) #(15466, 859)
    print(data.ra['symbol']) #['LINC01409' 'LINC01128' 'LINC02593' ... 'HBM' 'AHSP' 'LINC00355']
    print(data[data.ra['symbol'] == "TNNT2", :]) #[[ 48 106  70 ...   0   0   0]]
    print(data[data.ra['symbol'] == "LINC00623", :])
    print(data.ca['group'][:])

['chr', 'description', 'ensembl_id', 'entrez_id', 'genome', 'symbol', 'type']
(15859,)
['ENSG00000237491' 'ENSG00000228794' 'ENSG00000223764' ...
 'ENSG00000075673' 'ENSG00000253103' 'ENSG00000281560']
['age', 'barcodes', 'cell_type', 'dataset', 'disease', 'genes_per_cell', 'genotype', 'group', 'individual', 'n_counts', 'sex']
(2175,)
(15859, 2175)
['LINC01409' 'LINC01128' 'LINC02593' ... 'ATP12A' 'LINC01609' 'LSINCT5']
[[88 59 51 ... 10  5  4]]
[]
['Adult' 'Adult' 'Adult' ... 'Adult' 'Adult' 'Adult']


In [24]:
import numpy as np
import pandas as pd
import loompy as lp

# Connect to the loom file (assuming you've already done this)
with lp.connect(str(loom_file_path)) as data:
    # Extract genotype column (assuming it's stored in column attributes)
    print(data.ca['individual'])
    print(loom_file_path)
    genotype_values = data.ca['genotype']
    
    # Check the type of the value at index 292
    value_at_1 = genotype_values[1]
    print(f"Value at index 1: {value_at_1}, Type: {type(value_at_1)}")

    # Initialize list to store indices of invalid values
    invalid_values = []

    # Check for invalid values (non-string and NaN)
    for i, val in enumerate(genotype_values):
        # Print the type for debugging
        #print(f"Index: {i}, Value: {val}, Type: {type(val)}")
        
        # Check if value is not a string (assuming genotypes should be strings)
        if not isinstance(val, str):
            invalid_values.append((i, val, "Not a string"))
        # Check if value is NaN
        elif isinstance(val, (float)) and pd.isna(val):
            invalid_values.append((i, val, "NaN"))
    
    # Report invalid values if any
    if invalid_values:
        print(f"Invalid values found in 'genotype' column:")
        for idx, value, reason in invalid_values:
            print(f"Index: {idx}, Value: {value}, Reason: {reason}")
    else:
        print("All values in 'genotype' are valid strings.")


['1472_1' '1472_1' '1472_1' ... '1472_1' '1472_1' '1472_1']
/data/mcri_heartv2/mcri_nedar/DCM-Foundation-Model-data/loom_file_path/Patrick_LV_1547_1_nf_LV_Control.loom
Value at index 1: TTN, Type: <class 'numpy.str_'>
All values in 'genotype' are valid strings.


In [3]:
loom_file_path = f'/data/mcri_heartv2/mcri_nedar/DCM-Foundation-Model-data/loom_file_path/Chin_GSM4923401_Adult_Control.loom'

with lp.connect(str(loom_file_path)) as data:
    #ds = lp.connect(str(loom_file_path))
    print(data.ra.keys()) #['chr', 'description', 'ensembl_id', 'entrez_id', 'genome', 'symbol', 'type']
    print(data.ra['ensembl_id'].shape) #(15859,)
    print(data.ra['ensembl_id'][:]) #['ENSG00000237491' 'ENSG00000228794' 'ENSG00000223764' ... 'ENSG00000206177' 'ENSG00000169877' 'ENSG00000227674']
    print(data.ca.keys()) #['age', 'barcodes', 'cell_type', 'dataset', 'disease', 'genes_per_cell', 'genotype', 'group', 'individual', 'n_counts', 'sex']
    #print(data.ca['barcodes'].shape) #(859,)
    #print(data.ca['barcodes'][:]) #['CCTTCAGGTTAGTCGT-1' 'GACTTCCGTTTACCAG-1' 'TCTAACTCATGTTCAG-1' ... 'TAGACCACAAATGAAC-1' 'TAGGAGGAGTTGTCGT-1' 'GCAACATCACCCAATA-1']
    print(data.shape) #(15466, 859)
    print(data[data.ra['ensembl_id'] == "ENSG00000118194", :]) #[[ 48 106  70 ...   0   0   0]]
    print(data.ca['disease'] [:])

['chr', 'description', 'ensembl_id', 'entrez_id', 'genome', 'symbol', 'type']
(15859,)
['ENSG00000237491' 'ENSG00000228794' 'ENSG00000223764' ...
 'ENSG00000075673' 'ENSG00000253103' 'ENSG00000281560']
['age', 'barcodes', 'cell_type', 'dataset', 'disease', 'genes_per_cell', 'genotype', 'group', 'individual', 'n_counts', 'sex']
(15859, 816)
[[22 28 12 19 14 19 22 19  5  9 24 12 14 10 15 12  8 11 14 14 16 10  4 10
   9 11 14 17 17 16  8 17  7  9  6  8  4  8  2  2  7  2 12 10  9  3  4 14
  15 15 10 10 18 11  2 11  5 17 17  6  4  7  8  8  8  4  9 16  7  8  5 15
   5  6 11 16  4 12  9 10  5  3 10  4  4  8  9  8  3 13 15  7  3 12 10  8
   8  9  9 14  5  5 11  3 16  7 13  5  9 13  4  7  3 18  7  6  7  7 14 25
  10  2  7  5  5  2 13  9  8  9 12 14  6 21  8  8  5  8 14  9  7 13  5 19
   6  8  8 13 12 11 11  6  4  8  4 12  9  2  3  2  5  6  9 12  4  7  5  6
   8  5 10  6  7  4 11  7 15  2  7 16  1  9  3  4 12  7  8  3  6  8 10  8
  10 15  5  6  9  8 17  3 10  5  1  5  8  4 15  6 11  2  2  3  9  

In [3]:
import loompy

# Path to your Loom file
loom_file_path = f'/data/mcri_heartv2/mcri_nedar/DCM-Foundation-Model-data/siRNA/loom_ipsc/patient_ZNF736.loom'

# Open the Loom file using loompy.connect
with loompy.connect(loom_file_path) as data:
    # Get a list of column attributes
    column_attributes = data.ca.keys()  # All column attributes
    print("Column attributes:", column_attributes)
    
    # Inspect the data types of each column attribute
    for attribute in column_attributes:
        attr_data = data.ca[attribute]
        print(f"Column attribute: {attribute}")
        print(f"Data type: {attr_data.dtype}")
        print(f"Shape: {attr_data.shape}")
        print(f"First few values: {attr_data[:5]}")
        print("-" * 50)
        
    print(data.ra.keys()) #['chr', 'description', 'ensembl_id', 'entrez_id', 'genome', 'symbol', 'type']
    print(data.ra['ensembl_id'].shape) #(15859,)
    print(data.ra['ensembl_id'][:]) #['ENSG00000237491' 'ENSG00000228794' 'ENSG00000223764' ... 'ENSG00000206177' 'ENSG00000169877' 'ENSG00000227674']
    print(data.ca.keys()) #['age', 'barcodes', 'cell_type', 'dataset', 'disease', 'genes_per_cell', 'genotype', 'group', 'individual', 'n_counts', 'sex']
    #print(data.ca['barcodes'].shape) #(859,)
    #print(data.ca['barcodes'][:]) #['CCTTCAGGTTAGTCGT-1' 'GACTTCCGTTTACCAG-1' 'TCTAACTCATGTTCAG-1' ... 'TAGACCACAAATGAAC-1' 'TAGGAGGAGTTGTCGT-1' 'GCAACATCACCCAATA-1']
    print(data.shape) #(15466, 859)
    print(data[data.ra['ensembl_id'] == "ENSG00000118194", :]) #[[ 48 106  70 ...   0   0   0]]
    print(data.ca['disease'] [:])


Column attributes: ['CellID', 'cell_type', 'disease', 'n_counts', 'orig.ident', 'replicate', 'treatment', 'unique_cell_id']
Column attribute: CellID
Data type: object
Shape: (8873,)
First few values: ['A07_A07_RT_BC_100_Lig_BC_143' 'A07_A07_RT_BC_100_Lig_BC_160'
 'A07_A07_RT_BC_100_Lig_BC_166' 'A07_A07_RT_BC_101_Lig_BC_10'
 'A07_A07_RT_BC_101_Lig_BC_152']
--------------------------------------------------
Column attribute: cell_type
Data type: object
Shape: (8873,)
First few values: ['CM' 'CM' 'CM' 'CM' 'CM']
--------------------------------------------------
Column attribute: disease
Data type: object
Shape: (8873,)
First few values: ['DCM' 'DCM' 'DCM' 'DCM' 'DCM']
--------------------------------------------------
Column attribute: n_counts
Data type: int32
Shape: (8873,)
First few values: [350 885 233 530 402]
--------------------------------------------------
Column attribute: orig.ident
Data type: object
Shape: (8873,)
First few values: ['patient' 'patient' 'patient' 'patient' 'pa