# Gene Typing: Coding vs. Noncoding vs. Pseudogene — All aDEGs
Conceptualized by Monica E. Mesecar. Support for gene typing workflow: Dom J. Acri.
Code curated and conceptualized by Mesecar with support from Perplexity.
All code was checked and adjusted by Mesecar.

In [None]:
#Load in Packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import colors
from matplotlib import rcParams
# pdf.fonttype = 42: presumably chosen so saved PDFs embed TrueType (Type 42)
# fonts and the text stays editable -- confirm against the Matplotlib rcParams docs
matplotlib.rcParams['pdf.fonttype'] = 42

## Getting Gene Types

In [None]:
# Load the reference gene annotation table and the aDEG results table
# that will be typed against it.
ref_genes_ch38 = pd.read_csv("Gene_Typing/GRCh38-2020-A_GeneNameType.csv")
p1_adeg = pd.read_csv("P1_aging.glmmtmb_age_diffs_fdr_cleaned.csv")

In [None]:
# Sanity check: show the Python type of the loaded reference table
# (expected to be a pandas DataFrame)
type(ref_genes_ch38)

In [None]:
# Preview the first 15 rows of the reference gene table
display(ref_genes_ch38.head(15))

In [None]:
# Tally how many reference rows carry each gene_type value; this lists the
# typing options available in the reference genome
gene_type_ref_counts = ref_genes_ch38.loc[:, 'gene_type'].value_counts()

In [None]:
# Show the per-type row counts computed from the reference table
gene_type_ref_counts

In [None]:
# Show the aDEG results table as loaded from CSV
display(p1_adeg)

In [None]:
# Collect the distinct values of the 'type' column, i.e. the subsets
# available for selection, and print them
p1_type_list = p1_adeg['type'].unique().tolist()
print(p1_type_list)

In [None]:
# Keep only the rows whose 'type' equals 'region_broad_celltype'
p1_region_broad_df = p1_adeg[p1_adeg['type'].eq('region_broad_celltype')]

In [None]:
# Show the 'region_broad_celltype' subset
p1_region_broad_df 

In [None]:
# Discard rows whose 'celltype' is 'Indeterminate'
p1_region_broad_df = p1_region_broad_df.loc[
    p1_region_broad_df['celltype'].ne('Indeterminate')
]

In [None]:
# Give the 'feature' column the more descriptive name 'gene_name'
p1_region_broad_df = p1_region_broad_df.rename(
    {'feature': 'gene_name'}, axis='columns'
)

In [None]:
# Show the subset after the 'feature' -> 'gene_name' rename
display(p1_region_broad_df)

In [None]:
# Drop rows whose 'celltype' is 'Indeterminate'.
# NOTE(review): an earlier cell already applies this same filter, so on
# already-filtered data this leaves the table unchanged.
p1_region_broad_df  = p1_region_broad_df [p1_region_broad_df['celltype'] != 'Indeterminate']

In [None]:
# Build the list of distinct aDEG gene names
p1_CT_region_genes = p1_region_broad_df['gene_name'].unique().tolist()
# Peek at the first ten entries as a spot check
print(p1_CT_region_genes[:10])
# Number of unique aDEG gene names
len(p1_CT_region_genes)

In [None]:
# Restrict the reference table to rows whose gene_name is one of the aDEGs
ref_genes_ch38_adeg_filtered = ref_genes_ch38.loc[
    ref_genes_ch38['gene_name'].isin(p1_CT_region_genes)
]

# Preview the first 20 matching reference rows
display(ref_genes_ch38_adeg_filtered.head(20))

# (rows, columns) of the filtered reference table
ref_genes_ch38_adeg_filtered.shape

In [None]:
# Count how many of the aDEG-matched reference rows fall under each gene_type
gene_type_value_counts = (
    ref_genes_ch38_adeg_filtered.loc[:, 'gene_type'].value_counts()
)

In [None]:
# Print the gene_type counts for the reference rows that matched an aDEG
print(gene_type_value_counts)

In [None]:
# Attach a gene_type column to every aDEG row via a left join on gene_name.
#
# If the reference lists the same gene_name on more than one row, joining
# against it directly would duplicate the matching aDEG rows and inflate every
# downstream count. The lookup is therefore reduced to one row per gene_name
# first (first occurrence wins), and validate='many_to_one' makes pandas raise
# a MergeError should the lookup ever stop being unique.
#
# aDEGs with no match in the reference keep gene_type = NaN after the join.
gene_type_lookup = (
    ref_genes_ch38[['gene_name', 'gene_type']]
    .drop_duplicates(subset='gene_name', keep='first')
)
merged_df = pd.merge(
    p1_region_broad_df,
    gene_type_lookup,
    on='gene_name',
    how='left',
    validate='many_to_one',
)
# Preview the typed aDEG table, then report its (rows, columns)
display(merged_df.head(10))
merged_df.shape

## Cleaning and Formatting Data for Plotting

In [None]:
# Count aDEG rows per (tissue, gene_type) pair, then pivot gene_type into
# columns so each tissue is one row; combinations that never occur become 0
gene_type_distribution = (
    merged_df
    .groupby(['tissue', 'gene_type'])
    .size()
    .unstack(fill_value=0)
)

In [None]:
# Show the tissue-by-gene_type count table
gene_type_distribution 

In [None]:
# Move 'tissue' out of the index into an ordinary column so it can be split
# into region and cell type
gene_type_distribution_split = gene_type_distribution.reset_index(drop=False)

In [None]:
# Break 'tissue' at its first space: the leading word becomes 'region' and
# the remainder becomes 'cell_type'; the combined column is then discarded
gene_type_distribution_split[['region', 'cell_type']] = (
    gene_type_distribution_split['tissue'].str.split(' ', n=1, expand=True)
)
gene_type_distribution_split = gene_type_distribution_split.drop(columns='tissue')

In [None]:
# Show the table with 'tissue' split into 'region' and 'cell_type'
display(gene_type_distribution_split)

In [None]:
# Strip leftover region words from the cell-type labels
words_to_remove = ['cortex', 'temporal gyrus', 'zone']  # extend as needed

# Whole-word alternation: \bcortex\b|\btemporal gyrus\b|\bzone\b
pattern = '|'.join([rf'\b{word}\b' for word in words_to_remove])

# Delete the matched words, then collapse whitespace runs to a single space
# and trim both ends
gene_type_distribution_split['cell_type'] = (
    gene_type_distribution_split['cell_type']
    .str.replace(pattern, '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [None]:
# Show the table after the cell-type labels were cleaned up
display(gene_type_distribution_split)

In [None]:
# Abbreviation for each region, keyed by the first word of the region name
# (the only part left in 'region' after the tissue split)
region_abbreviations2 = dict(
    Middle='MTG',
    Subventricular='SVZ',
    Putamen='PUT',
    Entorhinal='EC',
)

# Swap the full names for their abbreviations; values with no entry in the
# mapping are left as they are
gene_type_distribution_split['region'] = (
    gene_type_distribution_split['region'].replace(to_replace=region_abbreviations2)
)

In [None]:
# Index the counts by (cell_type, region) and add up rows sharing both keys,
# giving per-region gene-type counts nested under each cell type
grouped = (
    gene_type_distribution_split
    .groupby(by=['cell_type', 'region'])
    .sum()
)

In [None]:
# Show the counts indexed by (cell_type, region)
display(grouped)

## Preparing Plot

In [None]:
# Panel order for the figure, one entry per cell type (edit to reorder)
desired_order = [
    'InN', 'ExN', 'SPN', 'OPC', 'Oligodendrocyte',
    'Astrocyte', 'Microglia', 'Ependymal', 'Endothelial', 'Mural',
]
cell_types = pd.Index(desired_order)

In [None]:
# Draw one stacked-bar panel per cell type (protein-coding and lncRNA aDEG
# counts per region), label each bar segment with its percentage share, and
# save the figure as a PDF.

# Every region present in the data, in order of first appearance, so that all
# panels share the same x-axis categories
all_regions = grouped.index.get_level_values('region').unique()

# Only these two gene types are drawn; percentages are relative to their sum
plotted_types = ['protein_coding', 'lncRNA']

# Expand the counts to every (cell type, region) pair and to both plotted
# columns, filling gaps with 0. Without this, a cell type listed in
# `cell_types` that has no rows in `grouped`, or a plotted gene type that never
# occurs, would raise KeyError; now such panels are simply drawn empty.
full_index = pd.MultiIndex.from_product(
    [cell_types, all_regions], names=['cell_type', 'region']
)
plot_counts = grouped.reindex(index=full_index, columns=plotted_types, fill_value=0)

# Grid size: fixed number of columns, rows by ceiling division
n_cell_types = len(cell_types)
n_cols = 3  # Adjust as needed
n_rows = (n_cell_types + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows), squeeze=False)
fig.suptitle('aDEG Types by Cell Type and Brain Region', fontsize=16)

custom_colors = {'protein_coding': '#B08F80', 'lncRNA': '#6C9E93'}
handles, labels = None, None

# Bar positions are identical for every panel, so compute them once
x = np.arange(len(all_regions))

for i, cell_type in enumerate(cell_types):
    row, col = divmod(i, n_cols)
    ax = axes[row, col]

    # Rows for this cell type, already aligned to all_regions by the reindex
    aligned_data = plot_counts.xs(cell_type, level='cell_type')
    protein_coding = aligned_data['protein_coding'].to_numpy()
    lncRNA = aligned_data['lncRNA'].to_numpy()
    total = protein_coding + lncRNA

    # lncRNA bars are stacked on top of the protein-coding bars
    ax.bar(x, protein_coding, label='Protein Coding', color=custom_colors['protein_coding'])
    ax.bar(x, lncRNA, bottom=protein_coding, label='lncRNA', color=custom_colors['lncRNA'])

    # Write each segment's share at its vertical midpoint; regions with no
    # counts are skipped to avoid dividing by zero
    for idx, (pc, ln, tot) in enumerate(zip(protein_coding, lncRNA, total)):
        if tot > 0:
            pc_pct = f"{pc / tot * 100:.0f}%"
            ln_pct = f"{ln / tot * 100:.0f}%"
            ax.text(x[idx], pc / 2, pc_pct, ha='center', va='center', fontsize=9, color='black')
            ax.text(x[idx], pc + ln / 2, ln_pct, ha='center', va='center', fontsize=9, color='black')

    ax.set_title(cell_type)
    ax.set_xticks(x)
    ax.set_xticklabels(all_regions, rotation=45, ha='right')
    ax.set_xlabel('Region')         # X label on each subplot
    ax.set_ylabel('Gene Count')     # Y label on each subplot

    # Capture legend entries once (from the first panel) for the shared legend
    if handles is None and labels is None:
        handles, labels = ax.get_legend_handles_labels()

# Remove the unused axes left over in the last grid row
for i in range(n_cell_types, n_rows * n_cols):
    row, col = divmod(i, n_cols)
    fig.delaxes(axes[row, col])

# One figure-level legend instead of one per panel
fig.legend(handles, labels, loc='upper center', ncol=2)

plt.tight_layout()
plt.subplots_adjust(top=0.88)  # Adjust to make room for suptitle and legend

plt.savefig('aDEG_Typing_Final_fixed.pdf')
plt.show()
