In [1]:
import pandas as pd
from scipy.stats import fisher_exact

In [3]:
# Null Hypothesis (H0): There is no association between being a brain gene and being under selection.
# Alternative Hypothesis (H1): Brain genes are more likely to be under selection than non-brain genes.

# The test is run with the one-sided "greater" alternative, i.e. it specifically asks
# whether brain genes are more likely to be under selection than non-brain genes.

In [19]:
# Gene symbols that RELATE flagged as candidates for positive selection.
selection = [
    "ACSL4", "AMMECR1", "ARSL", "BCOR", "CASK", "CDKL5",
    "CLCN4", "CLCN5", "DMD", "ENOX2", "FAM120C", "FRMPD4",
    "G6PD", "GNL3L", "HUWE1", "IGSF1", "IL1RAPL1", "LINC01278",
    "MAGT1", "MAMLD1", "MIR325HG", "NHS", "NYX", "PAK3",
    "PASD1", "PHKA1", "PRKX", "PTCHD1", "PTCHD1-AS", "RAB33A",
    "RAP2C-AS1", "RTL4", "TENM1", "TMEM164", "TMLHE", "WWC3",
    "XPNPEP2", "ZMYM3", "ZNF185",
]

# Sanity check: number of candidate genes
print(len(selection))

39


In [48]:
def read_google_sheet():
    """Download the gene-list Google Sheet as CSV and return it as a DataFrame.

    The second row of the sheet is used as the header (``header=1``), and
    columns whose name starts with 'Unnamed' are dropped from the result.
    """
    sheet_id = '1JSjSLuto3jqdEnnG7JqzeC_1pUZw76n7XueVAYrUOpk'
    sheet_name = 'Sheet1'
    csv_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    sheet = pd.read_csv(csv_url, header=1)
    # Boolean mask: True for every column that carries a real header name
    has_name = [not label.startswith('Unnamed') for label in sheet.columns]
    return sheet.loc[:, has_name]

def gene_list_names():
    """Return the sheet's column names (one per gene list), sorted alphabetically."""
    names = read_google_sheet().columns.tolist()
    names.sort()
    return names

def gene_list(name):
    """Return the sheet column `name` as a Series with its null entries removed."""
    column = read_google_sheet()[name]
    return column.dropna()

# Sheet columns whose union is treated as the "brain gene" set
columns_to_test = ['sfari_all_conf', 'intel_seiz_lang', 'intelect_disabil']

# Load the sheet once; later cells reuse `df`
df = read_google_sheet()

# Keep only the columns of interest
df_filtered = df.loc[:, columns_to_test]

# Stack the selected columns end to end into one long Series
all_genes = pd.concat([df_filtered[name] for name in columns_to_test])

# Drop missing entries and collapse repeated gene symbols
unique_genes = all_genes.dropna().unique()

# Sorted plain-Python list of the unique gene symbols
unique_genes_list = sorted(unique_genes.tolist())

# Size of the combined gene set
num_unique_genes = len(unique_genes_list)

# Report the count, then the full sorted list (ALL BRAIN GENES)
print(f"Number of unique genes: {num_unique_genes}")
print(unique_genes_list)


Number of unique genes: 189
['ABCB7', 'ABCD1', 'ACSL4', 'AFF2', 'AGTR2', 'AIFM1', 'ALG13', 'AMER1', 'AMMECR1', 'ANOS1', 'AP1S2', 'AR', 'ARHGEF9', 'ARSL', 'ARX', 'ATP6AP1', 'ATP6AP2', 'ATP7A', 'ATRX', 'AVPR2', 'BCAP31', 'BCOR', 'BCORL1', 'BGN', 'BRWD3', 'BTK', 'CACNA1F', 'CASK', 'CCDC22', 'CD99L2', 'CDKL5', 'CHM', 'CLCN4', 'CLIC2', 'CNKSR2', 'COX7B', 'CUL4B', 'CXorf56', 'DCX', 'DDX3X', 'DDX53', 'DKC1', 'DLG3', 'DMD', 'EBP', 'EFNB1', 'EIF2S3', 'EMD', 'FAM47A', 'FAM50A', 'FANCB', 'FGD1', 'FGF13', 'FHL1', 'FLNA', 'FMR1', 'FRMPD4', 'FTSJ1', 'GABRA3', 'GATA1', 'GDI1', 'GJB1', 'GK', 'GLA', 'GLRA2', 'GPC3', 'GPC4', 'GPR101', 'GRIA3', 'HCCS', 'HCFC1', 'HDAC6', 'HDAC8', 'HMGB3', 'HNRNPH2', 'HPRT1', 'HS6ST2', 'HSD17B10', 'HUWE1', 'IDS', 'IGBP1', 'IGSF1', 'IKBKG', 'IL1RAPL1', 'IL1RAPL2', 'IL2RG', 'IQSEC2', 'KDM5C', 'KDM6A', 'KIF4A', 'KLHL15', 'L1CAM', 'LAGE3', 'LAMP2', 'LAS1L', 'MAGT1', 'MAOA', 'MAOB', 'MBTPS2', 'MECP2', 'MED12', 'MID1', 'MID2', 'MSL3', 'MTM1', 'NAA10', 'NDP', 'NDUFA1', 'NDUFB11',

In [40]:
# ASD + ID + Seizures + Language impairment
# One-sided Fisher's exact test: are brain genes over-represented among the
# genes under positive selection?

print(f"Genes in ASD + ID + Seizures + Language impairment: {len(unique_genes_list)}")

# Convert the list to a set
selection_set = set(selection)

# Genes that are both under selection and in the combined brain gene list
shared_genes_brain = selection_set.intersection(unique_genes_list)

# Print the shared genes in sorted order so the output is reproducible
# (plain set iteration order can differ between kernel sessions)
print(f"Number of shared genes: {len(shared_genes_brain)}")
print("Genes in brain genes under positive selection:")
for gene in sorted(shared_genes_brain):
    print(gene)

# Counts are derived from the data rather than typed in by hand, so they
# cannot go stale when the sheet or the candidate list changes.
# NOTE(review): M is a fixed background size; confirm its source and that
# every gene in `selection` and in the brain list is part of that background.
M = 1400                         # All chrX genes
N = len(set(unique_genes_list))  # All brain genes
n = len(selection_set)           # All genes under selection
x = len(shared_genes_brain)      # Brain genes under selection

          # Selection   # No selection
table = [[  x,           n - x          ],  # Brain genes
         [ N - x,        M - (n + N) + x]]  # Non-brain genes
table, fisher_exact(table, alternative='greater').pvalue

Genes in ASD + ID + Seizures + Language impairment: 189
Number of shared genes: 18
Genes in brain genes under positive selection:
AMMECR1
TMLHE
BCOR
CDKL5
ARSL
CASK
PAK3
CLCN4
IGSF1
IL1RAPL1
ACSL4
MAGT1
DMD
PTCHD1-AS
HUWE1
FRMPD4
NHS
PTCHD1


([[18, 21], [171, 1190]], 4.843110163112196e-07)

In [45]:
# ASD
# One-sided Fisher's exact test: are ASD genes over-represented among the
# genes under positive selection?

# Unique ASD gene symbols from the 'sfari_all_conf' column (missing entries dropped)
asd_genes = set(df['sfari_all_conf'].dropna())
print(f"Genes in ASD: {len(asd_genes)}")

# Convert the list to a set
selection_set = set(selection)

# Genes that are both under selection and in the ASD gene list
shared_genes_ASD = selection_set.intersection(asd_genes)

# Print the shared genes in sorted order so the output is reproducible
# (plain set iteration order can differ between kernel sessions)
print(f"Number of shared genes: {len(shared_genes_ASD)}")
print("Genes in ASD genes under positive selection:")
for gene in sorted(shared_genes_ASD):
    print(gene)

# Counts are derived from the data rather than typed in by hand, so they
# cannot go stale when the sheet or the candidate list changes.
# NOTE(review): M is a fixed background size; confirm its source and that
# every gene in `selection` and in the ASD list is part of that background.
M = 1400                   # All chrX genes
N = len(asd_genes)         # All autism genes
n = len(selection_set)     # All genes under selection
x = len(shared_genes_ASD)  # Autism genes under selection

          # Selection   # No selection
table = [[  x,           n - x          ],  # Autism genes
         [ N - x,        M - (n + N) + x]]  # Non-autism genes
table, fisher_exact(table, alternative='greater').pvalue

Genes in ASD: 77
Number of shared genes: 10
Genes in ASD genes under positive selection:
TMLHE
CDKL5
CASK
CLCN4
IL1RAPL1
PTCHD1-AS
DMD
HUWE1
FRMPD4
PTCHD1


([[10, 29], [67, 1294]], 2.4526256107138398e-05)

In [47]:
# ID
# One-sided Fisher's exact test: are intellectual-disability (ID) genes
# over-represented among the genes under positive selection?

# Unique ID gene symbols from the 'intelect_disabil' column (missing entries dropped)
id_genes = set(df['intelect_disabil'].dropna())
print(f"Genes in ID: {len(id_genes)}")

# Convert the list to a set
selection_set = set(selection)

# Genes that are both under selection and in the ID gene list
shared_genes_ID = selection_set.intersection(id_genes)

# Print the shared genes in sorted order so the output is reproducible
# (plain set iteration order can differ between kernel sessions)
print(f"Number of shared genes: {len(shared_genes_ID)}")
print("Genes in ID genes under positive selection:")
for gene in sorted(shared_genes_ID):
    print(gene)

# Counts are derived from the data rather than typed in by hand, so they
# cannot go stale when the sheet or the candidate list changes.
# NOTE(review): M is a fixed background size; confirm its source and that
# every gene in `selection` and in the ID list is part of that background.
M = 1400                  # All chrX genes
N = len(id_genes)         # All ID genes
n = len(selection_set)    # All genes under selection
x = len(shared_genes_ID)  # ID genes under selection

          # Selection   # No selection
table = [[  x,           n - x          ],  # ID genes
         [ N - x,        M - (n + N) + x]]  # Non-ID genes
table, fisher_exact(table, alternative='greater').pvalue

Genes in ID: 129
Number of shared genes: 14
Genes in ID genes under positive selection:
TMLHE
BCOR
CDKL5
CASK
PAK3
CLCN4
IL1RAPL1
ACSL4
MAGT1
DMD
HUWE1
FRMPD4
NHS
PTCHD1


([[14, 25], [115, 1246]], 3.2631162171983967e-06)

In [14]:
# Display the sorted column names of the sheet, i.e. the gene lists it contains
gene_list_names()

['accel_reg_simiiformes_br',
 'all_npx',
 'all_post_mei_expr',
 'ari_relate_AFR',
 'ari_relate_ASIA',
 'ari_relate_EUR',
 'cDEG',
 'candidates',
 'ech75_regions',
 'ech90_regions',
 'expr_mod_xi_copynr_fibrobl',
 'expr_mod_xi_copynr_lcl',
 'gametologs',
 'intel_seiz_lang',
 'intelect_disabil',
 'matos_common',
 'matos_common_no_xy_gametologs',
 'matos_neuron',
 'msci',
 'msci_esc',
 'my_primate_codeml',
 'nDEG',
 'neuron_genome_proteome',
 'neuron_npx_proteome',
 'primate_ampl_multi',
 'reg_sa_pheno',
 'sfari_all_conf',
 'xi',
 'xi_any_evidence',
 'xi_escape',
 'xi_uncertain']

In [37]:
# ECH 75%
# One-sided Fisher's exact test: are genes in the 'ech75_regions' list
# over-represented among the genes under positive selection?

# Convert df['ech75_regions'] to a set, dropping any NaN values
ech75_set = set(df['ech75_regions'].dropna())
print(f"Genes in ECH 75%: {len(ech75_set)}")

# Convert the list to a set
selection_set = set(selection)

# Genes that are both under selection and in the ECH 75% list
shared_genes_75 = selection_set.intersection(ech75_set)

# Print the shared genes in sorted order so the output is reproducible
# (plain set iteration order can differ between kernel sessions)
print(f"Number of shared genes: {len(shared_genes_75)}")
print("Genes in ECH 75% under positive selection:")
for gene in sorted(shared_genes_75):
    print(gene)


# Counts are derived from the data rather than typed in by hand, so they
# cannot go stale when the sheet or the candidate list changes.
# NOTE(review): M is a fixed background size; confirm its source and that
# every gene in `selection` and in the ECH 75% list is part of that background.
M = 1400                  # All chrX genes
N = len(ech75_set)        # All ECH 75% genes
n = len(selection_set)    # All genes under selection
x = len(shared_genes_75)  # ECH 75% genes under selection

          # Selection   # No selection
table = [[  x,           n - x          ],  # ECH 75% genes
         [ N - x,        M - (n + N) + x]]  # Non ECH 75% genes
table, fisher_exact(table, alternative='greater').pvalue

Genes in ECH 75%: 120
Number of shared genes: 8
Genes in ECH 75% under positive selection:
AMMECR1
ENOX2
RAP2C-AS1
FAM120C
PAK3
TMEM164
MAGT1
CLCN5


([[8, 31], [112, 1249]], 0.014691099447761497)

In [36]:
# ECH 90%
# One-sided Fisher's exact test: are genes in the 'ech90_regions' list
# over-represented among the genes under positive selection?

# Convert df['ech90_regions'] to a set, dropping any NaN values
ech90_set = set(df['ech90_regions'].dropna())
print(f"Genes in ECH 90%: {len(ech90_set)}")

# Convert the list to a set
selection_set = set(selection)

# Genes that are both under selection and in the ECH 90% list
shared_genes_90 = selection_set.intersection(ech90_set)

# Print the shared genes in sorted order so the output is reproducible
# (plain set iteration order can differ between kernel sessions)
print(f"Number of shared genes: {len(shared_genes_90)}")
print("Genes in ECH 90% under positive selection:")
for gene in sorted(shared_genes_90):
    print(gene)

# Counts are derived from the data rather than typed in by hand, so they
# cannot go stale when the sheet or the candidate list changes.
# NOTE(review): M is a fixed background size; confirm its source and that
# every gene in `selection` and in the ECH 90% list is part of that background.
M = 1400                  # All chrX genes
N = len(ech90_set)        # All ECH 90% genes
n = len(selection_set)    # All genes under selection
x = len(shared_genes_90)  # ECH 90% genes under selection

          # Selection   # No selection
table = [[  x,           n - x          ],  # ECH 90% genes
         [ N - x,        M - (n + N) + x]]  # Non ECH 90% genes
table, fisher_exact(table, alternative='greater').pvalue


Genes in ECH 90%: 94
Number of shared genes: 6
Genes in ECH 90% under positive selection:
ENOX2
RAP2C-AS1
FAM120C
PAK3
MAGT1
CLCN5


([[6, 33], [88, 1273]], 0.04189018794613577)