In [1]:
import pandas as pd
from scipy.stats import fisher_exact

def read_google_sheet():
    """Download the gene-list spreadsheet and return it as a DataFrame.

    The sheet is requested as CSV through the spreadsheet's ``gviz`` export
    URL and parsed with ``header=1``, i.e. the second CSV line supplies the
    column names. Columns whose name starts with ``'Unnamed'`` (the label
    pandas gives to blank header cells) are left out of the result.

    Returns:
        pandas.DataFrame: the named columns of the sheet, in sheet order.
    """
    SHEET_ID = '1JSjSLuto3jqdEnnG7JqzeC_1pUZw76n7XueVAYrUOpk'
    SHEET_NAME = 'Sheet1'
    url = f'https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv&sheet={SHEET_NAME}'
    sheet = pd.read_csv(url, header=1)
    # Select by label instead of by boolean mask; read_csv de-duplicates
    # header names, so both selections pick exactly the same columns.
    named_columns = [label for label in sheet.columns if not label.startswith('Unnamed')]
    return sheet[named_columns]

def gene_list_names():
    """Return the column names of the sheet as an alphabetically sorted list.

    Each call downloads the sheet again via ``read_google_sheet``.
    """
    names = read_google_sheet().columns.tolist()
    names.sort()
    return names

def gene_list(name):
    """Return the non-null entries of column ``name`` of the sheet.

    Args:
        name: label of the column to read; an unknown label propagates the
            lookup error raised by pandas.

    Returns:
        The selected column with null entries filtered out. Surviving
        entries keep their original index labels.
    """
    column = read_google_sheet()[name]
    present = column.notna()
    return column[present]

# Sheet columns whose genes are pooled into a single list
columns_to_test = ['sfari_all_conf', 'intel_seiz_lang', 'intelect_disabil']

# Download the sheet and keep only the pooled columns
df = read_google_sheet()
df_filtered = df.loc[:, columns_to_test]

# Stack the selected columns end to end, in the order listed above
all_genes = pd.concat([genes for _, genes in df_filtered.items()])

# Discard missing entries, then collapse repeated gene names
unique_genes = all_genes[all_genes.notna()].unique()

# Plain sorted Python list, easier to read than the array
unique_genes_list = sorted(unique_genes.tolist())

# Size of the pooled gene set
num_unique_genes = len(unique_genes_list)

# Report the count first, then the sorted gene names
print(f"Number of unique genes: {num_unique_genes}")
print(unique_genes_list)


Number of unique genes: 189
['ABCB7', 'ABCD1', 'ACSL4', 'AFF2', 'AGTR2', 'AIFM1', 'ALG13', 'AMER1', 'AMMECR1', 'ANOS1', 'AP1S2', 'AR', 'ARHGEF9', 'ARSL', 'ARX', 'ATP6AP1', 'ATP6AP2', 'ATP7A', 'ATRX', 'AVPR2', 'BCAP31', 'BCOR', 'BCORL1', 'BGN', 'BRWD3', 'BTK', 'CACNA1F', 'CASK', 'CCDC22', 'CD99L2', 'CDKL5', 'CHM', 'CLCN4', 'CLIC2', 'CNKSR2', 'COX7B', 'CUL4B', 'CXorf56', 'DCX', 'DDX3X', 'DDX53', 'DKC1', 'DLG3', 'DMD', 'EBP', 'EFNB1', 'EIF2S3', 'EMD', 'FAM47A', 'FAM50A', 'FANCB', 'FGD1', 'FGF13', 'FHL1', 'FLNA', 'FMR1', 'FRMPD4', 'FTSJ1', 'GABRA3', 'GATA1', 'GDI1', 'GJB1', 'GK', 'GLA', 'GLRA2', 'GPC3', 'GPC4', 'GPR101', 'GRIA3', 'HCCS', 'HCFC1', 'HDAC6', 'HDAC8', 'HMGB3', 'HNRNPH2', 'HPRT1', 'HS6ST2', 'HSD17B10', 'HUWE1', 'IDS', 'IGBP1', 'IGSF1', 'IKBKG', 'IL1RAPL1', 'IL1RAPL2', 'IL2RG', 'IQSEC2', 'KDM5C', 'KDM6A', 'KIF4A', 'KLHL15', 'L1CAM', 'LAGE3', 'LAMP2', 'LAS1L', 'MAGT1', 'MAOA', 'MAOB', 'MBTPS2', 'MECP2', 'MED12', 'MID1', 'MID2', 'MSL3', 'MTM1', 'NAA10', 'NDP', 'NDUFA1', 'NDUFB11',

In [3]:
# ASD + ID + Seizures + Language impair
# One-sided Fisher's exact test (alternative='greater', i.e. odds ratio > 1)
# on a 2x2 table of selection status versus membership in the brain gene list.

M = 1400 # All chrX genes
N = 189 # All brain genes (same value as the unique-gene count printed by the first cell; presumably that pooled list, confirm)
n = 39 # All genes under selection
x = 18 # Brain genes under selection

# Rows are selection status, columns are gene-list membership:
# row 0 sums to n (genes under selection), column 0 sums to N (brain genes).
          # Brain genes  # Other genes
table = [[  x,           n - x          ],  # Under selection
         [ N - x,        M - (n + N) + x]]  # Not under selection
table, fisher_exact(table, alternative='greater').pvalue

([[18, 21], [171, 1190]], 4.843110163112196e-07)

In [4]:
# ASD
# One-sided Fisher's exact test (alternative='greater', i.e. odds ratio > 1)
# on a 2x2 table of selection status versus membership in the autism gene list.

M = 1400 # All chrX genes
N = 77 # All autism genes
n = 39 # All genes under selection
x = 10 # Autism genes under selection

# Rows are selection status, columns are gene-list membership:
# row 0 sums to n (genes under selection), column 0 sums to N (autism genes).
          # Autism genes # Other genes
table = [[  x,           n - x          ],  # Under selection
         [ N - x,        M - (n + N) + x]]  # Not under selection
table, fisher_exact(table, alternative='greater').pvalue

([[10, 29], [67, 1294]], 2.4526256107138398e-05)

In [5]:
# ID
# One-sided Fisher's exact test (alternative='greater', i.e. odds ratio > 1)
# on a 2x2 table of selection status versus membership in the ID gene list.

M = 1400 # All chrX genes
N = 127 # All ID genes
n = 39 # All genes under selection
x = 14 # ID genes under selection

# Rows are selection status, columns are gene-list membership:
# row 0 sums to n (genes under selection), column 0 sums to N (ID genes).
          # ID genes     # Other genes
table = [[  x,           n - x          ],  # Under selection
         [ N - x,        M - (n + N) + x]]  # Not under selection
table, fisher_exact(table, alternative='greater').pvalue

([[14, 25], [113, 1248]], 2.686813117828579e-06)