# RUN FIRST (Imports & Functions)

In [None]:
import numpy as np
import pandas as pd
from copy import deepcopy
import warnings
import re
import glob

import scanpy as sc
from sklearn.decomposition import PCA
from umap import UMAP
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.io import mmread
import mygene

# Set display options: allow up to 100 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 100)

# Suppress warnings: installs an 'ignore' filter for every warning category,
# so deprecation and runtime warnings are hidden as well
warnings.filterwarnings('ignore')

In [None]:
def plot_pca(data, meta, color=None, figsize=(7, 6)):
    """Draw 2-component PCA scatter plots of ``data``, one per colouring.

    The PCA is always fitted; figures are only created when ``color`` names
    at least one column. Nothing is returned.

    Parameters
    ----------
    data : array-like
        Matrix passed to ``PCA.fit_transform``.
    meta : mapping or pandas.DataFrame
        ``meta[category]`` provides the hue values for each plot.
    color : str or iterable of str, optional
        Name(s) of the ``meta`` entries to colour by. A single string is
        treated as one name instead of being iterated character by character.
    figsize : tuple, optional
        Size of each figure; the default matches the previously fixed (7, 6).
    """
    if isinstance(color, str):
        # Iterating a bare string would look up meta['m'], meta['e'], ...
        color = [color]

    pca = PCA(n_components=2)
    reduced = pca.fit_transform(data)

    if color:
        for category in color:
            plt.figure(figsize=figsize)
            sns.scatterplot(x=reduced[:, 0], y=reduced[:, 1], hue=meta[category])

def plot_umap(data, meta, color=None, partial_fit=1.0, figsize=(7, 6)):
    """Draw 2-component UMAP scatter plots of ``data``, one per colouring.

    The embedding is always computed on the full ``data``; figures are only
    created when ``color`` names at least one column. Nothing is returned.

    Parameters
    ----------
    data : array-like
        Matrix passed to ``UMAP.fit_transform``.
    meta : mapping or pandas.DataFrame
        ``meta[category]`` provides the hue values for each plot.
    color : str or iterable of str, optional
        Name(s) of the ``meta`` entries to colour by. A single string is
        treated as one name instead of being iterated character by character.
    partial_fit : float, optional
        Currently unused; kept so existing calls keep working.
    figsize : tuple, optional
        Size of each figure.
    """
    if isinstance(color, str):
        # Iterating a bare string would look up meta['m'], meta['e'], ...
        color = [color]

    reducer = UMAP(n_components=2)
    reduced = reducer.fit_transform(data)

    if color:
        for category in color:
            plt.figure(figsize=figsize)
            sns.scatterplot(x=reduced[:, 0], y=reduced[:, 1], hue=meta[category])

def prepare_raw_from_h5ad(data, debug=False):
    """Turn an AnnData-like object into a dense DataFrame plus its metadata.

    Parameters
    ----------
    data : object with ``X``, ``obs`` and ``var`` attributes
        ``X`` may be a sparse matrix (anything exposing ``toarray``) or an
        already dense array.
    debug : bool, optional
        Print the shape of the resulting matrix.

    Returns
    -------
    tuple
        ``(matrix, metadata)`` where ``matrix`` has ``data.obs.index`` as rows
        and ``data.var.index`` as columns, and ``metadata`` is ``data.obs``
        itself (not a copy).
    """
    raw = data.X
    # Sparse containers need densifying; a dense X has no todense()/toarray().
    dense = raw.toarray() if hasattr(raw, 'toarray') else np.asarray(raw)

    matrix = pd.DataFrame(dense)
    matrix.columns = data.var.index
    matrix.index = data.obs.index
    metadata = data.obs
    if debug:
        print(f'Prepared matrix with shape {matrix.shape}')
    return (matrix, metadata)

def process_h5ad(folder, tissue, threshold=0.2, only_filtered=False):
    """Normalise one ``.h5ad`` file and export it as ``.npy`` / ``.csv`` files.

    Reads ``{folder}/{tissue}.h5ad``, applies
    ``sc.pp.normalize_total(target_sum=1e4)`` followed by
    ``sc.pp.log1p(base=10)`` and writes into ``folder``:

    * ``{tissue}_full_genes.npy`` / ``{tissue}_full.npy`` -- every gene
      (skipped when ``only_filtered`` is true);
    * ``{tissue}_meta.csv`` -- ``obs`` with an added integer ``age`` column;
    * ``{tissue}_filtered_genes.npy`` / ``{tissue}_filtered.npy`` -- genes with
      a non-zero value in more than ``threshold`` (a fraction) of the cells.

    Raises
    ------
    KeyError
        If ``obs['donor']`` contains a donor without an entry in the age table.
    """
    h5raw = sc.read_h5ad(f'{folder}/{tissue}.h5ad')
    sc.pp.normalize_total(h5raw, target_sum=1e4)
    sc.pp.log1p(h5raw, base=10)

    # Use the module-level helper rather than a private copy of it.
    # ``metadata`` is h5raw.obs itself, so the age column is added in place.
    h5_data, metadata = prepare_raw_from_h5ad(h5raw)

    if not only_filtered:
        np.save(f'{folder}/{tissue}_full_genes.npy', list(h5_data.columns))
        np.save(f'{folder}/{tissue}_full.npy', h5_data)

    # Add age info into metadata: entry i of the list belongs to donor TSP{i+1}
    ages = [59, 61, 57, 38, 40, 67, 69, 56, 37, 33, 42, 74, 22, 59, 46]
    donor2age = {f'TSP{i}': age for i, age in enumerate(ages, start=1)}

    # Vectorised lookup; unknown donors still fail loudly instead of becoming NaN.
    donors = metadata['donor'].astype(str)
    unknown = sorted(set(donors).difference(donor2age))
    if unknown:
        raise KeyError(f'No age recorded for donor(s): {unknown}')
    metadata['age'] = donors.map(donor2age).astype(int)

    metadata.to_csv(f'{folder}/{tissue}_meta.csv')

    # Remove all genes which are expressed in at most `threshold` of the cells.
    # Very robust and straightforward way to reduce # features.
    cell_threshold = int(h5_data.shape[0] * threshold)
    expressing_cells = h5_data.astype(bool).sum(axis=0)
    filtered_data = h5_data.loc[:, expressing_cells > cell_threshold]

    np.save(f'{folder}/{tissue}_filtered_genes.npy', list(filtered_data.columns))
    np.save(f'{folder}/{tissue}_filtered.npy', filtered_data.to_numpy())


def combine_tissues_and_methods_both(tissues, output_name, threshold=0.2):
    """Stack droplet and FACS data of several tissues into one exported dataset.

    For every tissue ``data/mouse/droplet_{tissue}.h5ad`` and
    ``data/mouse/facs_{tissue}.h5ad`` are read when present. If both exist
    they are restricted to their shared genes and stacked; if only one exists
    it is used as is; if neither exists the tissue is skipped with a message.

    All tissues are then concatenated (pandas outer join on genes), genes with
    a non-zero value in at most ``threshold`` (a fraction) of the cells are
    dropped, the first run of digits in ``age`` is converted to ``int`` and
    ``mouse.id`` is renamed to ``donor``. Results are written to
    ``data/mouse/{output_name}_meta.csv``, ``..._filtered_genes.npy`` and
    ``..._filtered.npy``. Nothing is returned.
    """
    data_parts = []
    meta_parts = []

    for tissue in tissues:
        d_h5_data = f_h5_data = None
        d_metadata = f_metadata = None

        # Only the file read is guarded, so errors raised while preparing the
        # matrix are not mistaken for a missing file.
        try:
            droplet_h5raw = sc.read_h5ad(f'data/mouse/droplet_{tissue}.h5ad')
        except FileNotFoundError:
            print(f"Droplet data for {tissue} not found.")
        else:
            d_h5_data, d_metadata = prepare_raw_from_h5ad(droplet_h5raw)

        try:
            facs_h5raw = sc.read_h5ad(f'data/mouse/facs_{tissue}.h5ad')
        except FileNotFoundError:
            print(f"FACS data for {tissue} not found.")
        else:
            f_h5_data, f_metadata = prepare_raw_from_h5ad(facs_h5raw)

        if d_h5_data is not None and f_h5_data is not None:
            # Keep only the genes measured by both methods, then stack rows
            common_columns = d_h5_data.columns.intersection(f_h5_data.columns)
            X = pd.concat([d_h5_data[common_columns], f_h5_data[common_columns]])
            y = pd.concat([d_metadata, f_metadata])
        elif d_h5_data is not None:
            X, y = d_h5_data, d_metadata
        elif f_h5_data is not None:
            X, y = f_h5_data, f_metadata
        else:
            print(f"No data available for {tissue} using either method.")
            continue

        data_parts.append(X)
        meta_parts.append(y)

    if not data_parts:
        print("No combined data was created, please check the availability of the input files.")
        return

    # One concat over all parts instead of re-copying the growing frame per tissue
    combined_data = pd.concat(data_parts)
    combined_metadata = pd.concat(meta_parts)

    # Filter out genes with too few cells. Genes missing from some parts are
    # NaN after the outer join; NaN is truthy under astype(bool), so it is
    # excluded explicitly and only real non-zero values count as expression.
    # NOTE(review): such genes still carry NaN in the saved matrix -- confirm
    # whether an inner join across tissues is wanted instead.
    threshold_n_cells = int(combined_data.shape[0] * threshold)
    expressing_cells = (combined_data.notna() & combined_data.ne(0)).sum(axis=0)
    filtered_data = combined_data.loc[:, expressing_cells > threshold_n_cells]

    # expand=False yields a Series, so a plain column is assigned
    combined_metadata['age'] = (
        combined_metadata['age'].str.extract(r'(\d+)', expand=False).astype(int)
    )
    combined_metadata.rename(columns={'mouse.id': 'donor'}, inplace=True)

    # Print the total number of rows and columns in the combined data
    print(f"Combined data dimensions: {filtered_data.shape[0]} rows, {filtered_data.shape[1]} columns")

    combined_metadata.to_csv(f'data/mouse/{output_name}_meta.csv')
    np.save(f'data/mouse/{output_name}_filtered_genes.npy', list(filtered_data.columns))
    np.save(f'data/mouse/{output_name}_filtered.npy', filtered_data.to_numpy())


def h5ad_to_pandas(folder, method, tissue):
    """Load ``{folder}/{method}_{tissue}.h5ad`` as normalised pandas objects.

    The data are passed through ``sc.pp.normalize_total(target_sum=1e4)`` and
    ``sc.pp.log1p(base=10)`` before conversion.

    Returns
    -------
    tuple
        ``(metadata, matrix)`` -- note the order is the reverse of what
        ``prepare_raw_from_h5ad`` returns.
    """
    h5raw = sc.read_h5ad(f'{folder}/{method}_{tissue}.h5ad')
    sc.pp.normalize_total(h5raw, target_sum=1e4)
    sc.pp.log1p(h5raw, base=10)

    # Use the module-level helper rather than a private copy of it
    h5_data, metadata = prepare_raw_from_h5ad(h5raw)

    return metadata, h5_data

def analysis(datasets):
    """Print the ``shape`` of every dataset, one per line."""
    for shape in (entry.shape for entry in datasets):
        print(shape)


def load_h5ad_data(folder, tissue, filtered=True):
    """Load an exported dataset from ``data/{folder}``.

    Reads ``{tissue}_{kind}_genes.npy``, ``{tissue}_{kind}.npy`` (``kind`` is
    ``filtered`` or ``full`` depending on ``filtered``) and
    ``{tissue}_meta.csv``, prints the matrix shape and returns
    ``(X, y, gene_list)`` where ``y`` is a deep copy of the metadata table.
    """
    root = f'data/{folder}'
    variant = 'filtered' if filtered else 'full'

    genes = np.load(f'{root}/{tissue}_{variant}_genes.npy')
    expression = np.load(f'{root}/{tissue}_{variant}.npy')
    annotations = pd.read_csv(f'{root}/{tissue}_meta.csv', index_col=0)

    labels = deepcopy(annotations)
    print(f'Data shape: {expression.shape}')

    return (expression, labels, genes)


def combine_methods_for_tissues(tissues, threshold=0.05):
    """Export one combined droplet + FACS dataset per tissue.

    For every tissue ``data/mouse/droplet_{tissue}.h5ad`` and
    ``data/mouse/facs_{tissue}.h5ad`` are read. When both exist they are
    restricted to their shared genes and stacked row-wise. When only one
    exists it is used on its own, and a tissue with neither file is skipped
    with a message -- the same policy as ``combine_tissues_and_methods_both``
    -- so the function can be run over tissues that lack one method.

    Genes with a non-zero value in at most ``threshold`` (a fraction) of the
    cells are dropped, the first run of digits in ``age`` is converted to
    ``int`` and ``mouse.id`` is renamed to ``donor``. Per tissue this writes
    ``data/mouse/{tissue}_meta.csv``, ``{tissue}_filtered_genes.npy`` and the
    matrix as ``{tissue}.npy`` (``threshold == 0``) or
    ``{tissue}_filtered.npy`` (otherwise). Nothing is returned.
    """
    for tissue in tissues:
        parts = []
        for method, label in (('droplet', 'Droplet'), ('facs', 'FACS')):
            try:
                h5raw = sc.read_h5ad(f'data/mouse/{method}_{tissue}.h5ad')
            except FileNotFoundError:
                print(f"{label} data for {tissue} not found.")
                continue
            parts.append(prepare_raw_from_h5ad(h5raw))

        if not parts:
            print(f"No data available for {tissue} using either method.")
            continue

        # Step 1: Identify the columns shared by every available method
        common_columns = parts[0][0].columns
        for matrix, _ in parts[1:]:
            common_columns = common_columns.intersection(matrix.columns)

        # Steps 2-3: Restrict to the shared columns and concatenate by rows
        X = pd.concat([matrix[common_columns] for matrix, _ in parts])
        y = pd.concat([meta for _, meta in parts])

        # Step 4: Filter out genes with too few cells
        threshold_n_cells = int(X.shape[0] * threshold)
        expressing_cells = X.astype(bool).sum(axis=0)
        filtered_data = X.loc[:, expressing_cells > threshold_n_cells]

        # expand=False yields a Series, so a plain column is assigned
        y['age'] = y['age'].str.extract(r'(\d+)', expand=False).astype(int)
        y.rename(columns={'mouse.id': 'donor'}, inplace=True)

        # Print the total number of rows and columns in the combined data
        print(f"Combined data dimensions: {filtered_data.shape[0]} rows, {filtered_data.shape[1]} columns")

        y.to_csv(f'data/mouse/{tissue}_meta.csv')
        np.save(f'data/mouse/{tissue}_filtered_genes.npy', list(filtered_data.columns))
        # NOTE(review): with threshold == 0 the matrix goes to '{tissue}.npy'
        # while the gene list keeps the '_filtered_genes' name -- confirm that
        # the readers of these files expect this pairing.
        if threshold == 0:
            np.save(f'data/mouse/{tissue}.npy', filtered_data.to_numpy())
        else:
            np.save(f'data/mouse/{tissue}_filtered.npy', filtered_data.to_numpy())


def combine_FACS_tissues(tissues, output_name, threshold=0.2):
    """Stack the FACS data of several tissues into one exported dataset.

    Reads ``data/mouse/facs_{tissue}.h5ad`` for every tissue, concatenates the
    matrices and metadata row-wise (pandas outer join on genes), drops genes
    with a non-zero value in at most ``threshold`` (a fraction) of the cells,
    converts the first run of digits in ``age`` to ``int`` and renames
    ``mouse.id`` to ``donor``. Results are written to
    ``data/mouse/{output_name}_meta.csv``, ``..._filtered_genes.npy`` and
    ``..._filtered.npy``. Nothing is returned.

    An empty ``tissues`` prints a message and writes nothing instead of
    failing on the missing combined frame.
    """
    data_parts = []
    meta_parts = []

    for tissue in tissues:
        facs_h5raw = sc.read_h5ad(f'data/mouse/facs_{tissue}.h5ad')
        X, y = prepare_raw_from_h5ad(facs_h5raw)
        data_parts.append(X)
        meta_parts.append(y)

    if not data_parts:
        print("No combined data was created, please check the availability of the input files.")
        return

    # One concat over all parts instead of re-copying the growing frame per tissue
    combined_data = pd.concat(data_parts)
    combined_metadata = pd.concat(meta_parts)

    # Filter out genes with too few cells. Genes missing from some tissues are
    # NaN after the outer join; NaN is truthy under astype(bool), so it is
    # excluded explicitly and only real non-zero values count as expression.
    threshold_n_cells = int(combined_data.shape[0] * threshold)
    expressing_cells = (combined_data.notna() & combined_data.ne(0)).sum(axis=0)
    filtered_data = combined_data.loc[:, expressing_cells > threshold_n_cells]

    # expand=False yields a Series, so a plain column is assigned
    combined_metadata['age'] = (
        combined_metadata['age'].str.extract(r'(\d+)', expand=False).astype(int)
    )
    combined_metadata.rename(columns={'mouse.id': 'donor'}, inplace=True)

    # Print the total number of rows and columns in the combined data
    print(f"Combined data dimensions: {filtered_data.shape[0]} rows, {filtered_data.shape[1]} columns")

    combined_metadata.to_csv(f'data/mouse/{output_name}_meta.csv')
    np.save(f'data/mouse/{output_name}_filtered_genes.npy', list(filtered_data.columns))
    np.save(f'data/mouse/{output_name}_filtered.npy', filtered_data.to_numpy())


def load_data(dataset, tissue, filtered=True, normalize=False, verbose=True):
    """Load an exported dataset from ``data/{dataset}``.

    Reads ``{tissue}_{kind}_genes.npy``, ``{tissue}_{kind}.npy`` (``kind`` is
    ``filtered`` or ``full`` depending on ``filtered``) and
    ``{tissue}_meta.csv``. With ``normalize`` the matrix is passed through
    ``np.log1p``. ``verbose`` controls the progress messages.

    Returns ``(X, y, gene_list)`` where ``y`` is a deep copy of the metadata.
    """
    root = f'data/{dataset}'
    variant = 'filtered' if filtered else 'full'

    genes = np.load(f'{root}/{tissue}_{variant}_genes.npy')
    expression = np.load(f'{root}/{tissue}_{variant}.npy')
    annotations = pd.read_csv(f'{root}/{tissue}_meta.csv', index_col=0)

    labels = deepcopy(annotations)

    if normalize:
        expression = np.log1p(expression)

    if verbose:
        if normalize:
            print('Data normalized.')
        print(f'Data shape: {expression.shape}')

    return (expression, labels, genes)

# Tabula Muris Senis

In [None]:
# Run once to rename: every droplet/FACS file becomes "<method>_<suffix>",
# where <suffix> is the part of the file name after its last '-'.
# Files that already carry the "<method>_" prefix are skipped, so running the
# cell again is harmless.

import os

MOUSE_DIR = '/Users/mindblaze/Desktop/Thesis/clocks/data/mouse'

for method in ('droplet', 'facs'):
    prefix = f'{method}_'
    for file_path in glob.glob(f'{MOUSE_DIR}/*{method}*'):
        # Split the file name only: a '-' in a directory name must not matter
        file_name = os.path.basename(file_path)
        if file_name.startswith(prefix):
            continue
        suffix = file_name.split('-')[-1]
        os.rename(file_path, f'{MOUSE_DIR}/{prefix}{suffix}')

In [None]:
# Get tissues with one and both cell-sorting methods

import glob

droplet_paths = glob.glob("data/mouse/*droplet*")
facs_paths = glob.glob("data/mouse/*facs*")
print(droplet_paths)

# Tissue name = text after the method prefix, cut at the first dot
droplet_tissues = {path.split("droplet_")[-1].split(".")[0] for path in droplet_paths}
facs_tissues = {path.split("facs_")[-1].split(".")[0] for path in facs_paths}

both_tissues = droplet_tissues & facs_tissues
print(len(droplet_tissues), len(facs_tissues), len(both_tissues))

droplet_only_tissues = droplet_tissues - facs_tissues
facs_only_tissues = facs_tissues - droplet_tissues

# both_tissues ends up as a list, the other collections stay sets
both_tissues = list(both_tissues)
all_tissues = droplet_tissues | facs_tissues

In [None]:
# Table of matrix shapes per tissue for the two methods
print("Tissue | Droplet | FACS")
for tissue in both_tissues:
    droplet_h5raw = sc.read_h5ad(f'data/mouse/droplet_{tissue}.h5ad')
    facs_h5raw = sc.read_h5ad(f'data/mouse/facs_{tissue}.h5ad')
    droplet_shape = droplet_h5raw.X.shape
    facs_shape = facs_h5raw.X.shape
    print(f"{tissue} | {droplet_shape} | {facs_shape}")

# Switch warnings back to Python's 'default' filter action
warnings.filterwarnings('default')

In [None]:
# Combine all tissues into one file.
# combine_tissues_and_methods_both accepts (tissues, output_name, threshold)
# only; it already uses whichever of the droplet/FACS files exist per tissue,
# so no extra flag is passed.
combine_tissues_and_methods_both(list(all_tissues), 'ALL_METHODS', threshold=0.0)

In [None]:
# Create separate data file for each tissue (threshold 0.0)

tissues, threshold = all_tissues, 0.0
combine_methods_for_tissues(tissues, threshold=threshold)

In [None]:
# Create datafile for all tissues that have both methods

output_name, threshold = 'BOTH', 0.2
combine_tissues_and_methods_both(both_tissues, output_name=output_name, threshold=threshold)

In [None]:
# Create datafile for all tissues that have only FACS method

output_name, threshold = 'ALL_FACS', 0.0
combine_FACS_tissues(facs_only_tissues, output_name=output_name, threshold=threshold)

# Tabula Sapiens

## Load and join data for Epithelial, Endothelial and Stromal

In [None]:
# Export each Tabula Sapiens compartment from data/sapiens (filtered files only)
type_names = ["TS_epithelial", "TS_endothelial", "TS_stromal"]

h5ad_options = dict(folder='data/sapiens', threshold=0.0, only_filtered=True)
for name in type_names:
    process_h5ad(tissue=name, **h5ad_options)

In [None]:
# 

# Join the epithelial, endothelial and stromal exports into one "ees" dataset
# restricted to the genes shared by all three.
ees_names = ["TS_epithelial", "TS_endothelial", "TS_stromal"]

# Pass 1: intersect the gene lists of ALL three datasets. Only the small
# *_filtered_genes.npy files are read here (same path layout that
# load_h5ad_data uses), so each matrix is loaded just once below.
full_set = None
for name in ees_names:
    genes = set(np.load(f'data/sapiens/{name}_filtered_genes.npy'))
    full_set = genes if full_set is None else full_set.intersection(genes)

# Pass 2: keep the shared genes of every dataset and make sure the kept
# columns line up before the matrices are stacked.
matrices = []
metas = []
ees_genes = None
for name in ees_names:
    X_part, y_part, gene_list = load_h5ad_data('sapiens', name, filtered=True)
    keep = pd.Index(gene_list).isin(full_set)
    kept_genes = np.asarray(gene_list)[keep]
    if ees_genes is None:
        ees_genes = kept_genes
    elif not np.array_equal(ees_genes, kept_genes):
        raise ValueError(f'Shared genes of {name} are not in the same order as in {ees_names[0]}')
    matrices.append(X_part[:, keep])
    metas.append(y_part)

X = np.concatenate(matrices)
y = pd.concat(metas)

# Save the gene NAMES of the kept columns (not their integer positions)
# NOTE(review): the next section loads 'TS_EES' through load_data, while the
# files written here are named 'ees_*' -- confirm which name is intended.
np.save(f'data/sapiens/ees_filtered_genes.npy', ees_genes)
np.save(f'data/sapiens/ees_filtered.npy', X)
y.to_csv(f'data/sapiens/ees_meta.csv')

## Get only high variance genes from the preprocessed data

In [None]:
# Load the filtered TS_EES matrix, its metadata and its gene list
X, y, gene_list = load_data(dataset='sapiens', tissue='TS_EES')

Data shape: (218317, 51078)


In [None]:
def variance_thresholding(X, threshold=0.01, chunk_size=1000):
    """Return the column indices of ``X`` whose variance exceeds ``threshold``.

    Columns are processed ``chunk_size`` at a time. The result is a list of
    indices in ascending order.
    """
    _, n_features = X.shape
    kept = []

    for offset in range(0, n_features, chunk_size):
        stop = min(offset + chunk_size, n_features)
        block_variances = np.var(X[:, offset:stop], axis=0)
        # Positions are relative to the chunk; shift them back to X's columns
        local_hits = np.nonzero(block_variances > threshold)[0]
        kept.extend(local_hits + offset)

    return kept

# Select the columns whose variance exceeds the cut-off
threshold = 0.05  # Example threshold value
retained_indices = variance_thresholding(X, threshold)

# Apply the same column selection to the matrix and to the gene list
gene_list_high_variance = gene_list[retained_indices]
X_high_variance = X[:, retained_indices]


In [None]:
# Display the dimensions of the variance-filtered matrix
X_high_variance.shape

(218317, 6926)

In [None]:
# Write the high-variance subset next to the other exports
H5_FOLDER = 'data/sapiens'  # Replace with your directory

# Metadata is written unchanged under the "hv" name
y.to_csv(f'{H5_FOLDER}/TS_EES_hv_meta.csv')
np.save(f'{H5_FOLDER}/TS_EES_hv_filtered_genes.npy', gene_list_high_variance)
np.save(f'{H5_FOLDER}/TS_EES_hv_filtered.npy', X_high_variance)

## Choose three donors from different age groups

In [None]:
# Donor ages provided
ages = {
    'TSP6': 67, 'TSP7': 69, 'TSP4': 38, 'TSP5': 40,
    'TSP3': 57, 'TSP10': 33, 'TSP12': 74, 'TSP9': 37,
    'TSP8': 56, 'TSP14': 59, 'TSP15': 46, 'TSP1': 59, 'TSP2': 61
}

# Fixed seed so that the held-out donors are the same on every run
TEST_DONOR_SEED = 0

# Convert the ages to a DataFrame
ages_df = pd.DataFrame(list(ages.items()), columns=['Donor', 'Age'])

# Sort the donors by age; a stable sort keeps donors of equal age in a fixed
# order, so the group boundaries do not depend on the sort implementation
ages_df = ages_df.sort_values(by='Age', kind='mergesort')

# Divide the donors into three groups: younger, middle, and older
n = len(ages_df)
younger_group = ages_df.iloc[:n//3]
middle_group = ages_df.iloc[n//3:2*n//3]
older_group = ages_df.iloc[2*n//3:]

# Select one donor from each group
selected_test_donors = [
    group.sample(1, random_state=TEST_DONOR_SEED).iloc[0]['Donor']
    for group in (younger_group, middle_group, older_group)
]

print("Selected Test Donors:", selected_test_donors)


Selected Test Donors: ['TSP5', 'TSP3', 'TSP2']


# External datasets (.mtx)

In [None]:
data_path = 'data/mouse/E-MTAB-8077-quantification-raw-files/E-MTAB-8077.aggregated_filtered_counts.mtx'
metadata_path = 'data/mouse/E-MTAB-8077-quantification-raw-files/ExpDesign-E-MTAB-8077.tsv'

# Read the Matrix Market file, densify it and transpose it
sparse_matrix = mmread(data_path)
expression_data = sparse_matrix.toarray().T

# Gene names: first tab-separated field of every line in "<mtx>_rows"
with open(data_path+'_rows', 'r') as file:
    gene_names = [line.split('\t')[0] for line in file.read().splitlines()]

# Cell names: one per line in "<mtx>_cols"
with open(data_path+'_cols', 'r') as file:
    cell_names = file.read().splitlines()

# Experiment design table (tab-separated)
with open(metadata_path) as file:
    metadata = pd.read_csv(file, sep='\t')

# Pull out the text enclosed by the first pair of square brackets
def extract_bracket_text(text):
    """Return the content of the first ``[...]`` in ``text``; ``text`` itself if there is none."""
    found = re.search(r'\[(.*?)\]', text)
    if found is None:
        return text
    return found.group(1)

# Reduce every column name to its bracketed part (names without brackets stay as they are)
metadata.columns = list(map(extract_bracket_text, metadata.columns))

# Handle duplicates by appending a suffix to duplicate names
def handle_duplicates(cols):
    """Return ``cols`` with repeated names made unique via ``.1``, ``.2``, ...

    The first occurrence of a name is kept unchanged. A later occurrence gets
    the next free numeric suffix for that name; suffixes that would clash with
    a name already emitted (for example an input column literally called
    ``"a.1"``) are skipped, so the returned names are always distinct.
    """
    counts = {}
    taken = set()
    new_cols = []
    for col in cols:
        candidate = col
        while candidate in taken:
            counts[col] = counts.get(col, 0) + 1
            candidate = f"{col}.{counts[col]}"
        taken.add(candidate)
        new_cols.append(candidate)
    return new_cols

new_columns = list(map(extract_bracket_text, metadata.columns))

# Make the (possibly repeated) short names unique
metadata.columns = handle_duplicates(new_columns)

# Remove duplicated rows based on the 'ENA_SAMPLE' column
#metadata = metadata.drop_duplicates(subset='ENA_SAMPLE')

# Report the array shape, then the number of gene and cell labels read
print("Expression data shape:", expression_data.shape)
print("Number of genes:", len(gene_names))
print("Number of cells:", len(cell_names))

In [None]:
mg = mygene.MyGeneInfo()

# Query gene symbols using Ensembl gene IDs
gene_info = mg.querymany(gene_names, scopes='ensembl.gene', fields='symbol', species='mouse')

# Map each queried Ensembl ID to its symbol; results lacking either field are
# skipped and a later result for the same ID replaces an earlier one
gene_id_to_symbol = {
    hit['query']: hit['symbol']
    for hit in gene_info
    if hit.get('query') and hit.get('symbol')
}

# One symbol per original gene, in the original order; IDs without a hit get
# the placeholder 'N/A' (so the list can contain repeated values)
gene_symbols = [gene_id_to_symbol.get(gene_id, 'N/A') for gene_id in gene_names]

# Print the number of genes and the number of genes with no hit
print(f"Total genes: {len(gene_names)}")
print(f"Genes with no hit: {gene_symbols.count('N/A')}")


In [None]:
# Export the external dataset using the "<tissue>_filtered*" file layout
folder, tissue = 'data/mouse', '8077'

np.save(f'{folder}/{tissue}_filtered.npy', expression_data)
np.save(f'{folder}/{tissue}_filtered_genes.npy', gene_symbols)
metadata.to_csv(f'{folder}/{tissue}_meta.csv')