In [1]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import csv

In [2]:
# Scanpy log level -- errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3

# Print the versions of the loaded packages so the run is documented
sc.logging.print_versions()

# Default appearance for every figure drawn through scanpy
sc.settings.set_figure_params(
    dpi=80,
    frameon=False,
    figsize=(20, 10),
    facecolor='white',
)

-----
anndata     0.11.3
scanpy      1.10.4
-----
PIL                         11.1.0
anyio                       NA
appnope                     0.1.2
asttokens                   NA
attr                        24.3.0
attrs                       24.3.0
babel                       2.16.0
backports                   NA
brotli                      1.0.9
certifi                     2025.01.31
charset_normalizer          3.3.2
comm                        0.2.1
cycler                      0.12.1
cython_runtime              NA
dateutil                    2.9.0.post0
debugpy                     1.8.11
decorator                   5.1.1
defusedxml                  0.7.1
exceptiongroup              1.2.0
executing                   0.8.3
fastjsonschema              NA
h5py                        3.12.1
idna                        3.7
ipykernel                   6.29.5
jaraco                      NA
jedi                        0.19.2
jinja2                      3.1.5
joblib                      1.4.

  mod_version = _find_version(mod.__version__)


The data from Peng et al. (2019) can be downloaded from the Broad Institute Single Cell Portal (study SCP212): https://singlecell.broadinstitute.org/single_cell/study/SCP212/molecular-specification-of-retinal-cell-types-underlying-central-and-peripheral-vision-in-primates

In [3]:
# Load metadata file
metadata_file = './Macaque_NN_RGC_AC_BC_HC_PR_metadata_3.txt'
metadata_df = pd.read_csv(metadata_file)

# Region label of every cell, as text. Casting first means a missing value
# counts as "neither region" instead of raising a TypeError in the substring test.
subcluster = metadata_df['Subcluster'].astype(str)

# Fovea is an exact match; any other label containing 'Per' is periphery
# ('Per' is a substring of 'Periphery', so one test covers both spellings).
is_fovea = subcluster == 'Fovea'
is_periphery = ~is_fovea & subcluster.str.contains('Per', regex=False, na=False)

# Build {cell id (NAME) -> Cluster} for each region in one vectorised pass
# instead of iterating row by row. Row order is kept, and a repeated NAME
# resolves to its last occurrence, exactly as repeated dict assignment would.
fovea_dict = dict(zip(metadata_df.loc[is_fovea, 'NAME'],
                      metadata_df.loc[is_fovea, 'Cluster']))
periphery_dict = dict(zip(metadata_df.loc[is_periphery, 'NAME'],
                          metadata_df.loc[is_periphery, 'Cluster']))

# Print dictionary sizes to verify
print(f"Fovea cells: {len(fovea_dict)}")
print(f"Periphery cells: {len(periphery_dict)}")

# Example entries (both guarded, so an empty dictionary cannot raise here;
# next(iter(...)) avoids materialising the full item list for one entry)
print("\nExample fovea entry:")
if fovea_dict:
    print(next(iter(fovea_dict.items())))
print("\nExample periphery entry:")
if periphery_dict:
    print(next(iter(periphery_dict.items())))

Fovea cells: 92626
Periphery cells: 73053

Example fovea entry:
('M1Fovea1_ACATACGCAAGCTGTT-1', 'M/L Cones')

Example periphery entry:
('M1CD90PNA_S1_AAAGCAATCCGTACAA-1', 'M/L Cones')


In [4]:
# Read the foveal expression table and use its 'GENE' column as the row index
fname = './Macaque_fov_BC_expression.txt'
df_macaque_fov = pd.read_csv(fname).set_index('GENE')
df_macaque_fov

Unnamed: 0_level_0,M1Fovea1_AAACCTGAGATATACG.1,M1Fovea1_AAACCTGCACCGTTGG.1,M1Fovea1_AAACGGGAGAAGGGTA.1,M1Fovea1_AAACGGGCAAATCCGT.1,M1Fovea1_AAACGGGCATGCGCAC.1,M1Fovea1_AAACGGGGTGTTCTTT.1,M1Fovea1_AAAGATGAGGGTATCG.1,M1Fovea1_AAAGATGCAGATGAGC.1,M1Fovea1_AAAGATGTCCTTCAAT.1,M1Fovea1_AAAGCAAAGTGAATTG.1,...,M4Fovea3_TTTATGCTCCCATTAT.1,M4Fovea3_TTTCCTCGTGTCAATC.1,M4Fovea3_TTTGGTTCAGCGAACA.1,M4Fovea3_TTTGGTTGTCCGTCAG.1,M4Fovea3_TTTGGTTTCTGTTGAG.1,M4Fovea3_TTTGTCAGTCACCCAG.1,M4Fovea3_TTTGTCAGTCTAAACC.1,M4Fovea3_TTTGTCAGTGTAAGTA.1,M4Fovea3_TTTGTCATCCCTTGTG.1,M4Fovea3_TTTGTCATCTCAAGTG.1
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZNF692,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.631431,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
ZNF672_p,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
SH3BP5L,0.0,0.0,0.782327,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.633187,0.0,0.0,0.0
LOC102117280,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
LOC102131547,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MSTRG.23237,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
MSTRG.23238,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
MSTRG.23242,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
MSTRG.23240,0.0,0.0,0.000000,0.0,0.0,0.0,0.696349,0.0,0.721963,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [5]:
# Relabel every column (one per cell) with that cell's entry in fovea_dict.
# The column headers of this table end in '.1' while the dictionary keys end
# in '-1', so '.' is swapped for '-' before the lookup.
cell_ids = [col.replace('.', '-') for col in df_macaque_fov.columns]

# Check all ids up front so a mismatch reports how many are missing and what
# they look like, rather than a bare KeyError on the first one.
unmatched = [cell_id for cell_id in cell_ids if cell_id not in fovea_dict]
if unmatched:
    raise KeyError(
        f"{len(unmatched)} of {len(cell_ids)} columns have no entry in fovea_dict "
        f"(first few: {unmatched[:5]}). If these are gene names, this cell has "
        "already been run; re-run the cell that loads df_macaque_fov first."
    )
new_columns = [fovea_dict[cell_id] for cell_id in cell_ids]

# Rename the columns, then transpose so rows are cells and columns are genes
df_macaque_fov.columns = new_columns
df_macaque_fov = df_macaque_fov.T
df_macaque_fov

GENE,ZNF692,ZNF672_p,SH3BP5L,LOC102117280,LOC102131547,LOC107126559,LOC102141871,ZNF496,LOC102144076,LOC102145193,...,TMLHE,MSTRG.23232,MSTRG.23233,MSTRG.23234,MSTRG.23235,MSTRG.23237,MSTRG.23238,MSTRG.23242,MSTRG.23240,MSTRG.23243
DB3b,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FMB,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IMB,0.0,0.0,0.782327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB5*,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DB4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB4,0.0,0.0,0.633187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OFFx,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IMB,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Read the peripheral expression table and use its 'GENE' column as the row index
fname = './Macaque_per_BC_expression2.txt'
df_macaque_per = pd.read_csv(fname).set_index('GENE')
df_macaque_per

Unnamed: 0_level_0,M1CD90PNA_S1_AACTGGTAGTGGTCCC-1,M1CD90PNA_S1_AAGACCTTCTCTTATG-1,M1CD90PNA_S1_AAGGAGCAGTAGGCCA-1,M1CD90PNA_S1_AAGGTTCAGAGTCGGT-1,M1CD90PNA_S1_AAGTCTGCACGCCAGT-1,M1CD90PNA_S1_ACACCAACAGTTCATG-1,M1CD90PNA_S1_ACACTGATCTACCAGA-1,M1CD90PNA_S1_ACCCACTGTGATAAAC-1,M1CD90PNA_S1_ACCCACTTCCTGCTTG-1,M1CD90PNA_S1_ACCTTTAGTCATATGC-1,...,M4PerCD73S2_TTTGCGCCACTCGACG-1,M4PerCD73S2_TTTGCGCGTTCGGCAC-1,M4PerCD73S2_TTTGGTTAGCCAGTAG-1,M4PerCD73S2_TTTGGTTCAGATCCAT-1,M4PerCD73S2_TTTGGTTTCCTCTAGC-1,M4PerCD73S2_TTTGTCAAGCGCCTTG-1,M4PerCD73S2_TTTGTCAAGTGAATTG-1,M4PerCD73S2_TTTGTCAGTGACTACT-1,M4PerCD73S2_TTTGTCATCACCATAG-1,M4PerCD73S2_TTTGTCATCTGTTTGT-1
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZNF692,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ZNF672_p,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
SH3BP5L,0.0,0.000000,1.039235,0.0,0.0,0.000000,0.0,0.0000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
LOC102131547,0.0,0.000000,0.000000,0.0,0.0,0.515308,0.0,0.0000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
LOC107126559,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MSTRG.23237,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.591334,0.0,0.000000,0.0,0.000000
MSTRG.23238,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0000,0.0,0.000000,...,0.0,0.667713,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
MSTRG.23242,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
MSTRG.23240,0.0,0.594012,1.039235,0.0,0.0,0.853694,0.0,0.4595,0.0,0.904864,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.154855,0.0,0.565862


In [7]:
# Relabel every column (one per cell) with that cell's entry in periphery_dict.
# The column headers are looked up as-is, with no character substitution.
cell_ids = list(df_macaque_per.columns)

# Check all ids up front so a mismatch reports how many are missing and what
# they look like, rather than a bare KeyError on the first one.
unmatched = [cell_id for cell_id in cell_ids if cell_id not in periphery_dict]
if unmatched:
    raise KeyError(
        f"{len(unmatched)} of {len(cell_ids)} columns have no entry in periphery_dict "
        f"(first few: {unmatched[:5]}). If these are gene names, this cell has "
        "already been run; re-run the cell that loads df_macaque_per first."
    )
new_columns = [periphery_dict[cell_id] for cell_id in cell_ids]

# Rename the columns, then transpose so rows are cells and columns are genes
df_macaque_per.columns = new_columns
df_macaque_per = df_macaque_per.T
df_macaque_per

GENE,ZNF692,ZNF672_p,SH3BP5L,LOC102131547,LOC107126559,ZNF496,LOC102144076,LOC102145193,ZNF124,ZNF669,...,TMLHE,MSTRG.23232,MSTRG.23233,MSTRG.23234,MSTRG.23235,MSTRG.23237,MSTRG.23238,MSTRG.23242,MSTRG.23240,MSTRG.23243
RB,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
FMB,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.594012,0.0
FMB,0.0,0.0,1.039235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,1.039235,0.0
DB2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
IMB,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.125773,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RB,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.591334,0.0,0.0,0.000000,0.0
RB,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
RB,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,1.154855,0.0
RB,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [8]:
# Column labels present in both frames; with sort=False (the default, spelled
# out here) the result is not sorted.
common_columns = df_macaque_fov.columns.intersection(
    df_macaque_per.columns,
    sort=False,
)
len(common_columns)

16161

In [9]:
# Restrict the foveal matrix to the shared columns and write it to disk
df_macaque_fov = df_macaque_fov.loc[:, common_columns]
df_macaque_fov.to_pickle('mk_bc_fov.pkl')

In [10]:
# Restrict the peripheral matrix to the shared columns and write it to disk
df_macaque_per = df_macaque_per.loc[:, common_columns]
df_macaque_per.to_pickle('mk_bc_per.pkl')