# Single cell analysis using scanpy

### Importing libraries

In [None]:
import numpy as np
import os
import scanpy as sc
from scipy.sparse import csr_matrix

In [None]:
## Setting random seed
# Seeds NumPy's global random number generator so np.random.* calls repeat between runs.
# NOTE(review): scanpy functions that take their own `random_state` argument are presumably
# not governed by this seed — confirm per function.
np.random.seed(12345)

### Reading raw samples

In [None]:
path_to_raw_files = "/path/to/Raw_Files/"
# One sub-directory per sample is expected. Keep directories only (stray files such as
# .DS_Store would break the 10x reader) and sort them, because os.listdir returns
# entries in arbitrary order and the sample order should be reproducible.
files = sorted(
    entry for entry in os.listdir(path_to_raw_files)
    if os.path.isdir(os.path.join(path_to_raw_files, entry))
)

In [None]:
# List to store the AnnData objects (one per sample directory)
mouse_sorted_ECs_list = []

for file in files:
    file_path = os.path.join(path_to_raw_files, file)

    # Read 10X data (matrix.mtx, barcodes and features files live in the sample directory)
    data = sc.read_10x_mtx(file_path)

    # read_10x_mtx already returns an AnnData object. Re-wrapping only `data.X`
    # would throw away the cell barcodes and gene names that the steps below need.
    adata = data

    # Sample label: 'project' is kept as before; 'orig.ident' is the column name the
    # metadata, integration and plotting cells further down look up.
    adata.obs['project'] = file
    adata.obs['orig.ident'] = file

    # Per-cell totals used by the QC plots and filters (counterparts of Seurat's
    # nCount_RNA / nFeature_RNA): summed counts and number of genes with a non-zero count.
    adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()
    adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

    # Prefix every barcode with the sample name so barcodes stay unique after merging
    adata.obs_names = [f"{file}_{cell}" for cell in adata.obs_names]

    # Append the AnnData object to the list
    mouse_sorted_ECs_list.append(adata)


In [None]:
### Adding mito and ribo percentage

In [None]:
# Define a function to calculate mito and ribo gene percentages
def add_mito_ribo(adata, mito_prefix="Mt-", ribo_prefix=("Rpl", "Rps")):
    """Add per-cell mitochondrial and ribosomal count percentages to ``adata.obs``.

    Gene names in ``adata.var_names`` are matched against the prefixes without regard
    to case, so the same call works for mouse ("mt-Nd1", "Rpl3") and human ("MT-ND1",
    "RPL3") symbols.

    Parameters
    ----------
    adata
        Object exposing ``X`` (dense array or scipy sparse matrix, cells x genes),
        ``var_names`` and an ``obs`` DataFrame.
    mito_prefix, ribo_prefix
        A prefix string or a tuple of prefix strings. The ribosomal default covers
        the Rpl*/Rps* families; the previous default "Rb-" matched no gene symbol.

    Returns
    -------
    The same ``adata`` with ``obs['percent_mito']`` and ``obs['percent_ribo']`` set.
    Cells whose total count is zero get NaN in both columns.
    """
    def _prefix_mask(prefixes):
        # Boolean mask over genes whose lower-cased name starts with any prefix.
        if isinstance(prefixes, str):
            prefixes = (prefixes,)
        lowered = tuple(prefix.lower() for prefix in prefixes)
        return np.array(
            [str(gene).lower().startswith(lowered) for gene in adata.var_names],
            dtype=bool,
        )

    def _row_sums(matrix):
        # Sparse matrices return an (n, 1) np.matrix, dense arrays a 1-D array;
        # normalise both to a flat float vector.
        return np.asarray(matrix.sum(axis=1), dtype=float).ravel()

    def _percent(part, whole):
        # Divide only where the denominator is positive; the rest stays NaN
        # without emitting a divide-by-zero warning.
        result = np.full(whole.shape, np.nan)
        np.divide(part, whole, out=result, where=whole > 0)
        return result * 100

    counts = adata.X
    totals = _row_sums(counts)  # computed once and shared by both percentages

    adata.obs['percent_mito'] = _percent(_row_sums(counts[:, _prefix_mask(mito_prefix)]), totals)
    adata.obs['percent_ribo'] = _percent(_row_sums(counts[:, _prefix_mask(ribo_prefix)]), totals)

    return adata

# Run add_mito_ribo over every sample and rebuild the list from the returned objects
mouse_sorted_ECs_list = list(map(add_mito_ribo, mouse_sorted_ECs_list))

### Quality check

In [None]:
# One multi-panel violin figure per sample, showing the four per-cell QC columns
for adata in mouse_sorted_ECs_list:
    sc.pl.violin(
        adata,
        keys=['n_counts', 'n_genes', 'percent_mito', 'percent_ribo'],
        multi_panel=True,
        jitter=0.4,
    )

In [None]:
# One scatter figure per sample: 'n_counts' on the x axis against 'n_genes' on the y axis.
# The loop leaves the notebook-level name `adata` bound to the last sample in the list.
for adata in mouse_sorted_ECs_list:
    sc.pl.scatter(adata, x='n_counts', y='n_genes')

In [None]:
# Map each sample name to its AnnData object (pairs are matched by position)
mouse_sorted_ECs_dict = dict(zip(files, mouse_sorted_ECs_list))

In [None]:
# Access each object by referencing it through the dictionary, like mouse_sorted_ECs_dict['filename'].

In [None]:
### Filtering low-quality cells

In [None]:
# Subsetting based on n_counts, n_genes, and mitochondrial percentage
# Fetch the sample from the dictionary first: no earlier cell assigns `HFD_Epi_A`.
# NOTE(review): assumes the sample directory is literally named 'HFD_Epi_A' — adjust the key if not.
HFD_Epi_A = mouse_sorted_ECs_dict['HFD_Epi_A']
# .copy() turns the filtered view into an independent object so later edits do not
# go through (or warn about) a view of the unfiltered sample.
HFD_Epi_A = HFD_Epi_A[(HFD_Epi_A.obs['n_counts'] > 200) &
                      (HFD_Epi_A.obs['n_counts'] < 7500) &
                      (HFD_Epi_A.obs['n_genes'] > 200) &
                      (HFD_Epi_A.obs['n_genes'] < 3000) &
                      (HFD_Epi_A.obs['percent_mito'] < 20), :].copy()

### Removing ribosomal/mito/other genes

In [None]:
# All name matching below ignores case so it works for mouse ("Rpl3", "mt-Nd1") as well
# as human ("RPL3", "MT-ND1") gene symbols. The input is the name -> AnnData dictionary;
# the list of samples has no .items().

# Remove ribosomal genes (names starting with Rpl or Rps)
raw_sample_list_NoRibo = {name: adata[:, ~adata.var_names.str.contains(r'^Rp[ls]', case=False, regex=True)]
                          for name, adata in mouse_sorted_ECs_dict.items()}

# Remove mitochondrial genes (names starting with mt-)
raw_sample_list_NoRbMt = {name: adata[:, ~adata.var_names.str.lower().str.startswith('mt-')]
                          for name, adata in raw_sample_list_NoRibo.items()}

# Remove long non-coding RNAs (exactly Malat1 or Neat1; the group keeps both names anchored)
raw_sample_list_NoRbMt_lnc = {name: adata[:, ~adata.var_names.str.contains(r'^(?:Malat1|Neat1)$', case=False, regex=True)]
                              for name, adata in raw_sample_list_NoRbMt.items()}

# Human hemoglobin symbols followed by their mouse counterparts.
# NOTE(review): verify the mouse symbols against the annotation used for alignment.
hemoglobin_genes = ["HBB", "HBG2", "HBZ", "HBA2", "HBA1",
                    "HBM", "HBD", "HBE1", "HBQ1", "HBG1",
                    "Hba-a1", "Hba-a2", "Hba-x", "Hbb-bs", "Hbb-bt",
                    "Hbb-bh1", "Hbb-bh2", "Hbb-y", "Hbq1a", "Hbq1b"]

# Exclude hemoglobin genes (exact, case-insensitive name match)
raw_sample_list_NoRbMt_genes = {name: adata[:, ~adata.var_names.str.lower().isin([gene.lower() for gene in hemoglobin_genes])]
                                for name, adata in raw_sample_list_NoRbMt_lnc.items()}

### Removing cells using lm

In [None]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Natural log of genes-per-cell and counts-per-cell
log_nFeature_RNA = np.log(HFD_Epi_A.obs['n_genes'])
log_nCount_RNA = np.log(HFD_Epi_A.obs['n_counts'])

# Ordinary least squares of log(n_genes) on log(n_counts), with an intercept column
X = sm.add_constant(log_nCount_RNA)
model = sm.OLS(endog=log_nFeature_RNA, exog=X).fit()

# Observed minus fitted log(n_genes), one value per cell
residuals = model.resid

# Keep the cells whose residual is at least -0.5
tokeep = residuals.index[(residuals >= -0.5).to_numpy()]
HFD_Epi_A = HFD_Epi_A[tokeep, :]

# All cells in grey, the fitted line in red, retained cells over-plotted in blue
plt.scatter(log_nCount_RNA, log_nFeature_RNA, s=10, c='grey', label='All cells')
plt.plot(log_nCount_RNA, model.fittedvalues, linewidth=3, color='red', label='Linear fit')
plt.scatter(log_nCount_RNA.loc[tokeep], log_nFeature_RNA.loc[tokeep], s=20, c='blue', label='Filtered cells')
plt.xlabel('log(nCount_RNA)')
plt.ylabel('log(nFeature_RNA)')
plt.legend()
plt.show()

### Merging samples

In [None]:
import scanpy as sc
import pandas as pd

# Merge multiple AnnData objects
# NOTE(review): only HFD_Epi_A is assigned in the visible part of this notebook; the other
# seven samples must already exist as variables (presumably pulled from mouse_sorted_ECs_dict).
# NOTE(review): AnnData.concatenate is, to my knowledge, deprecated in recent anndata releases
# in favour of anndata.concat — confirm against the installed version.
Merged_raw_object = HFD_Epi_A.concatenate(HFD_Epi_B, HFD_Mes_A, HFD_Mes_B, 
                                          NC_Epi_A, NC_Epi_B, NC_Mes_A, NC_Mes_B, 
                                          index_unique=None)  # Don't append object names to index

# Display merged object
print(Merged_raw_object)

### Saving merged object

In [None]:
# Save the merged AnnData object. At this point `adata` is still the last single
# sample from the loading loop, so writing it would not save the merge.
Merged_raw_object.write('adata.h5ad')

### Adding metadata info

In [None]:
# Take the relevant .obs columns as an independent DataFrame (assuming Mt_Rb is present).
# The explicit .copy() matters: adding columns to a bare column selection triggers
# pandas' SettingWithCopyWarning.
df_Merged_raw_object = Merged_raw_object.obs[
    ['orig.ident', 'n_counts', 'n_genes', 'percent_mito', 'Mt_Rb']
].copy()

# "Condition": diet group derived from the sample name; unmapped names are kept as-is
df_Merged_raw_object['Condition'] = df_Merged_raw_object['orig.ident'].replace(
    {'NC_Epi_A': 'NC', 'NC_Epi_B': 'NC', 'NC_Mes_A': 'NC', 'NC_Mes_B': 'NC',
     'HFD_Epi_A': 'HFD', 'HFD_Epi_B': 'HFD', 'HFD_Mes_A': 'HFD', 'HFD_Mes_B': 'HFD'})

# "Depot": tissue depot derived from the sample name; unmapped names are kept as-is
df_Merged_raw_object['Depot'] = df_Merged_raw_object['orig.ident'].replace(
    {'NC_Epi_A': 'Epi', 'NC_Epi_B': 'Epi', 'HFD_Epi_A': 'Epi', 'HFD_Epi_B': 'Epi',
     'NC_Mes_A': 'Mes', 'NC_Mes_B': 'Mes', 'HFD_Mes_A': 'Mes', 'HFD_Mes_B': 'Mes'})

# NOTE(review): the new columns live only in this DataFrame; nothing here writes them
# back to Merged_raw_object.obs.

# Show updated DataFrame
df_Merged_raw_object.head()

In [None]:
# Assuming df_In_House is a DataFrame in Python
df_In_House['BMI'] = df_In_House['orig.ident']

# Set BMI for every sample whose identifier contains 'SAT9'. The pattern spans the
# whole value, so an identifier such as 'SAT9_rep1' becomes '39.5' rather than '39.5_rep1'.
# NOTE(review): the value stays a string because the other rows still hold identifiers.
df_In_House['BMI'] = df_In_House['BMI'].replace(to_replace=r'^.*SAT9.*$', value='39.5', regex=True)

# Show updated df_In_House
df_In_House.head()

### Adding a column from existing columns

In [None]:
# Concatenate 'Study' and 'Chemistry' columns to create 'Study_chemistry'
# The `+` joins the two columns row by row with an underscore in between.
# NOTE(review): assumes both columns hold plain strings; All_Data_Atlas is not created
# anywhere in the visible notebook.
All_Data_Atlas['Study_chemistry'] = All_Data_Atlas['Study'] + '_' + All_Data_Atlas['Chemistry']

# Display the updated DataFrame
print(All_Data_Atlas)

# Change a column into categorical

In [None]:
# Convert the 'louvain' column of adata.obs to pandas' categorical dtype.
# NOTE(review): no visible cell creates 'louvain' (clustering below writes 'leiden_*') — confirm the column name.
adata.obs['louvain'] = adata.obs['louvain'].astype('category')

# Show categories (factor levels)

In [None]:
# Print the category labels of the column; requires the column to be categorical already.
print(adata.obs['louvain'].cat.categories)

## drop levels

In [None]:
### One column
# Drop category labels that no remaining cell carries (typically after subsetting).
adata.obs['louvain'] = adata.obs['louvain'].cat.remove_unused_categories()

In [None]:
### Multiple columns
# Apply the same clean-up to every categorical column of adata.obs.
for col in adata.obs.select_dtypes(['category']).columns:
    adata.obs[col] = adata.obs[col].cat.remove_unused_categories()

### Adding cell cycle info

In [None]:
import pandas as pd
import requests

# URL of the cell cycle gene list
# NOTE(review): this is the human list while the samples above look like mouse data — confirm the species.
url = "https://raw.githubusercontent.com/hbc/tinyatlas/master/cell_cycle/Homo_sapiens.csv"

# pandas no longer ships pd.compat.StringIO; the standard-library class is the replacement.
from io import StringIO

# Download the file and read it into a DataFrame. The timeout stops the cell from hanging
# forever, and raise_for_status() prevents an HTTP error page from being parsed as CSV.
response = requests.get(url, timeout=60)
response.raise_for_status()
cell_cycle_genes = pd.read_csv(StringIO(response.text))

# Display the first few rows of the DataFrame
print(cell_cycle_genes.head())


from biomart import BiomartServer
from io import StringIO

# Connect to Biomart server
server = BiomartServer('http://www.ensembl.org/biomart')
mart = server.datasets['hsapiens_gene_ensembl']

# Get gene annotations
attributes = [
    'ensembl_gene_id', 'external_gene_name', 'chromosome_name', 
    'gene_biotype', 'description'
]
# The biomart package queries a dataset through search(), which returns the HTTP response;
# the body is tab-separated text without a header row, so the requested attribute names
# are used as column names.
# NOTE(review): confirm the response format against the installed biomart version.
annotations = pd.read_csv(
    StringIO(mart.search({'attributes': attributes}).text),
    sep='\t',
    names=attributes,
)

# Display the first few rows
print(annotations.head())


# Attach gene annotations to the cell cycle list by Ensembl gene id
cell_cycle_markers = cell_cycle_genes.merge(annotations, left_on='geneID', right_on='ensembl_gene_id')

# Gene symbols annotated as S phase
s_genes = cell_cycle_markers.loc[cell_cycle_markers['phase'] == 'S', 'external_gene_name'].tolist()

# Gene symbols annotated as G2/M phase
g2m_genes = cell_cycle_markers.loc[cell_cycle_markers['phase'] == 'G2/M', 'external_gene_name'].tolist()


# Perform cell cycle scoring
# Each call writes one per-cell score column into adata.obs under `score_name`.
# NOTE(review): the gene lists hold human symbols; if `adata` is mouse data the names
# will presumably not match adata.var_names — confirm.
sc.tl.score_genes(adata, gene_list=s_genes, score_name='S_score')
sc.tl.score_genes(adata, gene_list=g2m_genes, score_name='G2M_score')

# Display the first few rows of the updated metadata
print(adata.obs.head())



## Data Analysis

In [None]:
# Visualizing per-cell QC values on the UMAP
# 'n_counts' and 'n_genes' are the column names the QC plots and filters use
# (the counterparts of Seurat's nCount_RNA / nFeature_RNA).
# NOTE(review): this needs a UMAP embedding, which is only computed in a later section.
sc.pl.umap(adata, color=['n_counts', 'n_genes'])

### Data Normalization

In [None]:
# Log-normalizing the data (similar to LogNormalize in Seurat)
# Step 1 rescales counts per cell to a total of 10,000; step 2 applies log(1 + x).
# Both calls modify `adata` in place.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

### Highly variable genes

In [None]:
# Finding highly variable genes
# Flags the 2000 top-ranked genes; this call only annotates genes, it does not subset `adata`.
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

### Scaling

In [None]:
# Scaling the data
# Applied to the whole object in place; no clipping is requested here.
sc.pp.scale(adata)  # You can also add options like `max_value`

## Regressing out variables


### PCA

In [None]:
# Running PCA
# Uses the 'arpack' solver; later cells read the results from adata.obsm['X_pca']
# and adata.uns['pca'].
sc.tl.pca(adata, svd_solver='arpack')

#### Elbow plot

In [None]:
# Elbow plot
# Variance ratio per principal component, drawn on a log scale.
sc.pl.pca_variance_ratio(adata, log=True)

#### Significant PCs

In [None]:
# Get the explained variance ratio for each PC, in percent
pct = adata.uns['pca']['variance_ratio'] * 100
cum = pct.cumsum()

# co1: first component at which cumulative variance exceeds 90%.
# If the computed PCs never reach 90%, fall back to all of them
# (argmax on an all-False mask would silently answer "PC 1").
reached = np.flatnonzero(cum > 90)
co1 = int(reached[0]) + 1 if reached.size else len(pct)

# co2: the component just after the last drop of more than 0.05 percentage points
# between consecutive PCs. The drop is previous minus next; the reverse difference is
# negative for decreasing variances and would leave nothing to take the maximum of.
drops = np.flatnonzero(pct[:-1] - pct[1:] > 0.05)
co2 = int(drops[-1]) + 2 if drops.size else len(pct)

pcs = min(co1, co2)
sig_pcs = list(range(1, pcs+1))  # Significant PCs (1-based)
print(sig_pcs)

### Finding neighbors

In [None]:
# Find neighbors using significant PCs
# `pcs` is the component count chosen in the "Significant PCs" cell.
sc.pp.neighbors(adata, n_pcs=pcs)

### UMAP construction

In [None]:
# Run UMAP
# Computes the embedding from the neighbour graph built by sc.pp.neighbors, then draws it.
sc.tl.umap(adata)
sc.pl.umap(adata)  # Plot UMAP

In [None]:
# Data Integration using harmony

In [None]:
import harmonypy as hm

# Assuming 'adata' is the AnnData object and 'orig.ident' is stored in 'adata.obs'
# harmonypy's entry point is run_harmony(data, metadata DataFrame, list of batch columns);
# the package has no HarmonyIntegrator class.
harmony_integrator = hm.run_harmony(adata.obsm['X_pca'], adata.obs, ['orig.ident'])

# Add harmony embedding to the AnnData object
# NOTE(review): Z_corr is presumably components x cells, hence the transpose — confirm
# against the installed harmonypy version.
adata.obsm['X_harmony'] = harmony_integrator.Z_corr.T

#### Significant harmony components

In [None]:
# Share of variance per Harmony component, in percent. The variances are measured on the
# stored embedding because harmonypy's result object, as far as I know, has no `vars_`
# attribute — confirm against the installed version.
harmony_var = np.var(adata.obsm['X_harmony'], axis=0)
pct = harmony_var / harmony_var.sum() * 100
cum = pct.cumsum()

# co1: first component at which cumulative variance exceeds 90% (all components if never)
reached = np.flatnonzero(cum > 90)
co1 = int(reached[0]) + 1 if reached.size else len(pct)

# co2: the component just after the last drop of more than 0.05 percentage points.
# The drop is previous minus next; the reverse difference finds nothing for decreasing values.
drops = np.flatnonzero(pct[:-1] - pct[1:] > 0.05)
co2 = int(drops[-1]) + 2 if drops.size else len(pct)

pcs = min(co1, co2)
harmony_sig_pcs = list(range(1, pcs + 1))
print(harmony_sig_pcs)

In [None]:
# Running UMAP using the Harmony dimensions
# The neighbour graph is rebuilt on adata.obsm['X_harmony'] limited to `pcs` components,
# and the UMAP is recomputed from it, replacing the earlier embedding.
sc.pp.neighbors(adata, use_rep='X_harmony', n_pcs=pcs)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['orig.ident'], legend_loc='on data')  # You can label points if needed

In [None]:
### Gene Signatures

In [None]:
# Reading Excel file (first sheet)
# NOTE(review): hard-coded, user-specific path — point it at your own copy of the workbook.
ECs_markers_ref = pd.read_excel("/home/lucamannino/Downloads/Vascular_markers_summary.xlsx", sheet_name=0)

# Keep only the first space-separated token of each 'Capillary.ECs' entry.
# Taking element 0 of the split works for any number of tokens; expanding into exactly
# two columns fails unless every split produces exactly two parts.
ECs_markers_ref['Capillary.ECs'] = ECs_markers_ref['Capillary.ECs'].str.split(' ', n=1).str[0]

# One gene list per column, with empty (NaN) cells removed
ECs_markers_ref_cleaned = {col: ECs_markers_ref[col].dropna().tolist() for col in ECs_markers_ref.columns}

In [None]:
## Adding signature
from itertools import chain

for cell_type, gene_list in ECs_markers_ref_cleaned.items():
    # Compute the module score for each cell type; stored as '<cell_type>_signature' in adata.obs
    sc.tl.score_genes(adata, gene_list, score_name=f'{cell_type}_signature')

# Rename columns to match the Seurat format if needed (strip a trailing '1').
# regex=True is required: pandas 2 treats the pattern as literal text by default,
# so the '$' anchor would never match.
adata.obs.columns = adata.obs.columns.str.replace(r'_signature1$', '_signature', regex=True)

In [None]:
## Extract signature columns

# Names of the adata.obs columns that end in '_signature', in column order
signature_columns = list(filter(lambda column: column.endswith('_signature'), adata.obs.columns))

In [None]:
import matplotlib.pyplot as plt

# Plot the feature expression of each signature column
# One UMAP panel per entry of `signature_columns`, three panels per row, viridis colour map.
sc.pl.umap(adata, color=signature_columns, ncols=3, cmap='viridis')

In [None]:
### Clustering

In [None]:
# Resolutions to try.
resolutions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Perform clustering for different resolutions
# Each run stores its labels in adata.obs under 'leiden_<res>', e.g. 'leiden_0.3' or 'leiden_1.0'.
for res in resolutions:
    sc.tl.leiden(adata, resolution=res, key_added=f'leiden_{res}')

In [None]:
# One UMAP panel per Leiden clustering column, five panels per row
snn_columns = list(filter(lambda column: column.startswith('leiden'), adata.obs.columns))
sc.pl.umap(adata, color=snn_columns, ncols=5)

In [None]:
# Set identity based on a specific resolution (e.g., res=0.3)
# Ensures the column is categorical; the last line displays its cluster labels as cell output.
adata.obs['leiden_0.3'] = adata.obs['leiden_0.3'].astype('category')
adata.obs['leiden_0.3'].cat.categories

### Finding marker genes

In [None]:
# Find marker genes
# pts=True also records the fraction of cells expressing each gene per cluster. The
# thresholds are applied when the table is built: the Wilcoxon test takes no
# min_in_group_fraction / logfc_min arguments, and extra keywords are silently dropped.
sc.tl.rank_genes_groups(adata, groupby='leiden_0.3', method='wilcoxon', pts=True)

# Results for every cluster (group=None selects all; 'all' is not a group name),
# keeping genes with log2 fold change >= 0.3 ...
markers_df = sc.get.rank_genes_groups_df(adata, group=None, log2fc_min=0.3)
# ... that are expressed in at least 15% of the cells of their cluster
markers_df = markers_df[markers_df['pct_nz_group'] >= 0.15].reset_index(drop=True)

# Top 50 genes per cluster
top50 = markers_df.groupby('group').apply(lambda x: x.nlargest(50, 'logfoldchanges')).reset_index(drop=True)

# Top 100 genes per cluster
top100 = markers_df.groupby('group').apply(lambda x: x.nlargest(100, 'logfoldchanges')).reset_index(drop=True)

### Exporting marker gene list

In [None]:
import pandas as pd

# Save as Excel
# index=True writes the DataFrame's row index as the first spreadsheet column.
# NOTE(review): the file name says "res02" while the markers above come from 'leiden_0.3' — confirm.
markers_df.to_excel("markers_Vascular_WRST_15PCT_Log03_res02.xlsx", index=True)

### Cluster annotation

In [None]:
# Define new names for clusters
# Template: fill in a label for each cluster id. As written, every cluster maps to the
# same empty string.
new_names = {
    '0': '', 
    '1': '', 
    '2': '', 
    '3': '',
    '4': '',
    '5': '',
    '6': ''}

# Recode identities
# NOTE(review): `adata_subset` is only created in the "Cluster subset" cells further down,
# so this cell presumably has to run after one of them — confirm the intended order.
adata_subset.obs['WAT_Vascular_labels'] = adata_subset.obs['leiden_0.3'].replace(new_names)

### Cluster subset (exact)

In [None]:
# Subset data to subset cluster '11'
# Keeps only the cells labelled '11' at resolution 0.3; .copy() makes the result
# independent of `adata`.
adata_subset = adata[adata.obs['leiden_0.3'] == '11'].copy()

### Cluster subset (inclusion)

In [None]:
# Keep the cells whose cluster label is in the list. The lookup uses 'leiden_0.5', the
# column written by the clustering loop; the Seurat-style 'RNA_snn_res.0.5' name is never
# created in this notebook. .copy() matches the other subset cells and yields an
# independent object instead of a view.
vascular_cells = adata[adata.obs['leiden_0.5'].isin(['4', '6', '7', '11', '12', '13', '14', '24', '27'])].copy()
vascular_cells

### Cluster subset (exclusion)

In [None]:
# Subset data to exclude cluster '11'
# Keeps every cell whose resolution-0.3 label differs from '11'; .copy() makes the result
# independent of `adata`. This reuses (and overwrites) the name `adata_subset`.
adata_subset = adata[adata.obs['leiden_0.3'] != '11'].copy()

## Saving all objects in the environment

In [None]:
import pickle
import sys

import types

# Get all objects in the global namespace, skipping dunder names, callables and imported
# modules. Module objects (np, sc, os, ...) are not callable, so they used to slip
# through the filter and make pickle.dump raise "cannot pickle 'module' object".
all_objects = {
    name: obj for name, obj in globals().items()
    if not name.startswith('__')
    and not callable(obj)
    and not isinstance(obj, types.ModuleType)
}

# Save all objects to a file. Any remaining object that cannot be pickled still raises here.
with open('my_workspace.pkl', 'wb') as file:
    pickle.dump(all_objects, file)


In [None]:
import pickle
import types

def is_pickleable(obj):
    """Return True if ``pickle.dumps(obj)`` succeeds, False otherwise.

    This is a yes/no probe, so any ordinary exception counts as "not pickleable":
    custom ``__reduce__``/``__getstate__`` hooks and extension types can raise errors
    other than PicklingError, TypeError or AttributeError (for example ValueError or
    RecursionError), and those must not abort a scan over many objects.
    """
    try:
        pickle.dumps(obj)
    except Exception:
        return False
    return True

def is_global(obj_name, obj):
    """Return True for a saveable global: not a module, not a plain function, and not a dunder name."""
    if isinstance(obj, (types.ModuleType, types.FunctionType)):
        return False
    return not obj_name.startswith('__')

# Get all pickleable and globally accessible objects in the global namespace
# is_global runs first, so modules, plain functions and dunder names are rejected before
# the more expensive is_pickleable probe serialises the object.
# Kept as a comprehension on purpose: a plain for-loop at notebook level would create new
# global names while globals() is being iterated.
pickleable_objects = {
    name: obj for name, obj in globals().items()
    if is_global(name, obj) and is_pickleable(obj)
}

# Save pickleable objects to a file
# Note: this rebinds the notebook-level name `file` to the (closed) file handle.
with open('workspace.pkl', 'wb') as file:
    pickle.dump(pickleable_objects, file)

## Loading an environment

In [None]:
import pickle

# Load all objects from a file
# SECURITY NOTE(review): unpickling can execute arbitrary code — only load workspace files
# you created yourself or otherwise trust.
with open('my_workspace.pkl', 'rb') as file:
    loaded_objects = pickle.load(file)

# Restore all objects to the global namespace
# Existing variables with the same names are overwritten.
globals().update(loaded_objects)

# Verify objects are loaded
print(loaded_objects)