In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os.path

In [11]:
PATH = "./data/"
PATH_MY_DGE = "./data/DESeq_results/"

# LRT results
LRT_NEURON_TYPE = "LRT/neuron_type/DESeq_output.xlsx"
LRT_TREATMENT = "LRT/treatment/DESeq_output.xlsx"
LRT_INTERACTION = "LRT/interaction/DESeq_output.xlsx"


MEAN_DAL = "mean/DAL/res_mean.xlsx"
MEAN_V2 = "mean/V2/res_mean.xlsx"
MEAN_V3 = "mean/V3/res_mean.xlsx"
MEAN_AB_KCs = "mean/AB_KCs/res_mean.xlsx"
MEAN_G_KCs = "mean/G_KCs/res_mean.xlsx"
MEAN_R27 = "mean/R27/res_mean.xlsx"
MEAN_G386 = "mean/G386/res_mean.xlsx"

PAIRWISE_DAL = "pairwise/res_DAL_"
PAIRWISE_V2 = "pairwise/res_V2_"
PAIRWISE_V3 = "pairwise/res_V3_"
PAIRWISE_AB_KCs = "pairwise/res_AB_KCs_"
PAIRWISE_G_KCs = "pairwise/res_G_KCs_"
PAIRWISE_R27 = "pairwise/res_R27_"
PAIRWISE_G386 = "pairwise/res_G386_"

DGE_PAPER = "./data/NIHMS780544-supplement-5.xlsx"

COGNITION_GENE_NAMES = "FlyBase_Fields_download.txt"

In [7]:
NB_COGNITION_GENES = 137
NB_GENES = 15682

Load names of genes related to cognition:

In [8]:
# Names of genes related to cognition from FlyBase (names converted on flybase.org directly)
cognition_genes = pd.read_csv(PATH + COGNITION_GENE_NAMES, delimiter = '\t')
#  Store these gene names in a list
cognition_gene_names = list(cognition_genes['SYMBOL'])

Utilitary functions:

In [9]:
def compute_contingency_table(results_table):
    nb_sig = results_table.shape[0]

    results_table_cognition = results_table[results_table['gene'].isin(cognition_gene_names)].copy()
    nb_cognition_sig = results_table_cognition.shape[0]
    nb_cognition_non_sig = NB_COGNITION_GENES - nb_cognition_sig

    results_table_non_cognition = results_table[~results_table['gene'].isin(cognition_gene_names)].copy()
    nb_non_cognition_sig = nb_sig - nb_cognition_sig
    nb_non_cognition_non_sig = NB_GENES - NB_COGNITION_GENES - nb_non_cognition_sig

    contingency_table = pd.DataFrame()
    contingency_table['names'] = ['cognition', 'non-cognition']
    contingency_table['significant'] = [nb_cognition_sig, nb_non_cognition_sig]
    contingency_table['non-significant'] = [nb_cognition_non_sig, nb_non_cognition_non_sig]
    contingency_table.set_index('names', inplace=True)

    return contingency_table

In [10]:
def compute_odds_ratio(contingency_table):
    odd_numerator = contingency_table.iloc[0,0] / contingency_table.iloc[1,0]
    odd_denominator = contingency_table.iloc[0,1] / contingency_table.iloc[1,1]
    odds_ratio = odd_numerator / odd_denominator
    return odds_ratio

In [29]:
def top_list_genes(results_table):
    results_table.sort_values(by=['padj'], inplace=True)
    top_20 = results_table.head(20).copy()
    top_cognition = top_20[top_20['gene'].isin(cognition_gene_names)].copy()
    ratio_cognition = top_cognition.shape[0]/20
    return top_20, top_cognition, ratio_cognition

# 1) LRT TESTS

## a) Effect of neuron_type

In [42]:
# Load DESeq results
genes_LRT_neuron_type = pd.read_excel(PATH_MY_DGE + LRT_NEURON_TYPE)
genes_LRT_neuron_type

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,40512,15.273559,19.183609,2.879341,17.970839,6.305485e-03,1.751580e-02
1,128up,166.372302,0.759255,0.678395,22.132593,1.145669e-03,3.903124e-03
2,14-3-3epsilon,11584.976852,0.059121,0.172566,20.264403,2.484639e-03,7.696410e-03
3,14-3-3zeta,58800.016238,-1.867902,0.165862,383.209783,1.135976e-79,2.372373e-76
4,2mit,5687.472251,-2.184018,0.308143,134.197652,1.677618e-26,7.268750e-25
...,...,...,...,...,...,...,...
4430,zfh2,1369.012103,-0.957744,0.441119,62.189454,1.614323e-11,1.765106e-10
4431,ZnT63C,564.227619,-1.801210,0.484022,29.477366,4.940009e-05,2.286506e-04
4432,zormin,38.535995,0.749091,0.950265,27.286527,1.279595e-04,5.418300e-04
4433,Zw,58.897987,2.226988,1.447630,27.129564,1.369338e-04,5.770227e-04


In [20]:
# Build contigency table of genes significant genes vs genes related to cognition
contingency_table_LRT_neuron_type = compute_contingency_table(genes_LRT_neuron_type)
print(contingency_table_LRT_neuron_type)
# Compute odds ratios of significance for the condition "gene related to cognition"
odds_ratio_LRT_neuron_type = compute_odds_ratio(contingency_table_LRT_neuron_type)
print("Odds ratio for LRT neuron_type:  %.2f" % odds_ratio_LRT_neuron_type)

               significant  non-significant
names                                      
cognition               81               56
non-cognition         4354            11191
Odds ratio for LRT neuron_type:  3.72


In [35]:
# Plot the 20 most significant genes, the genes related to cognition that belong to the top significant genes, and a ratio of cognition and non-cognition genes in the top 20
top_20_LRT_neuron_type, top_cognition_LRT_neuron_type, ratio_LRT_neuron_type = top_list_genes(genes_LRT_neuron_type)
print("TOP 20 SIGNIFICANT: ", top_20_LRT_neuron_type, "\n")
print("TOP COGNITION: ", top_cognition_LRT_neuron_type, "\n")
print("Ratio cognition vs non-cognition in top 20:  %.2f" % ratio_LRT_neuron_type, "\n")

TOP 20 SIGNIFICANT:              gene      baseMean  log2FoldChange     lfcSE        stat  \
3582      Pka-C1  13758.814627       -3.048991  0.233290  617.644695   
3314         mub  24535.829534       -3.540036  0.272025  600.362465   
3413       Nplp1  10440.536927        2.587029  0.452205  487.421839   
4346       VGlut  25766.143689        2.620154  0.329602  396.646933   
3     14-3-3zeta  58800.016238       -1.867902  0.165862  383.209783   
109         Appl  24678.701098       -2.092339  0.206516  372.147871   
2640        dysc  14458.194629       -2.465205  0.241575  370.808076   
1728      CG4577  20973.006814       -1.431690  0.349717  359.050047   
3022         jdp  21487.406914       -4.144634  0.322294  341.926309   
2792      Gapdh1   2868.651287        2.125477  0.235256  339.976029   
3204        Mef2   3490.889867       -3.868042  0.334064  329.638012   
3584      Pka-R2  13133.991016       -3.172450  0.335258  319.827760   
2585         dnc  16732.863922       -1.734

## b) Effect of treatment

In [36]:
# Load DESeq results
genes_LRT_treatment = pd.read_excel(PATH_MY_DGE + LRT_TREATMENT)
genes_LRT_treatment.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj


No significant genes (the effect of treatment is probably masked by the really bigger effect of the neuron_type).

## c) Effect of the interaction treatment:neuron_type

In [41]:
# Load DESeq results
genes_LRT_interaction = pd.read_excel(PATH_MY_DGE + LRT_INTERACTION)
genes_LRT_interaction

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,a6,44.574190,-1.141517,1.853339,20.212179,0.002539,0.045343
1,Aats-ser,63.729640,-0.293710,2.091632,21.838556,0.001295,0.029220
2,ACC,801.375292,-1.416194,0.671115,26.427018,0.000185,0.009300
3,Acox57D-d,43.262937,-0.416476,1.265613,20.099821,0.002658,0.046108
4,Act57B,645.529292,-3.305572,2.015065,26.595129,0.000172,0.008939
...,...,...,...,...,...,...,...
518,WDR79,50.512737,-7.919750,2.635935,24.707558,0.000387,0.014720
519,yemalpha,32.985791,-5.425059,1.719368,21.809822,0.001311,0.029374
520,Yp1,60103.840084,-3.028913,1.168891,21.688319,0.001379,0.030400
521,Yp3,55458.193915,-2.714578,1.146160,26.357185,0.000191,0.009529


In [39]:
# Build contigency table of genes significant genes vs genes related to cognition
contingency_table_LRT_interaction = compute_contingency_table(genes_LRT_interaction)
print(contingency_table_LRT_interaction)
# Compute odds ratios of significance for the condition "gene related to cognition"
odds_ratio_LRT_interaction = compute_odds_ratio(contingency_table_LRT_interaction)
print("Odds ratio for LRT interaction:  %.2f" % odds_ratio_LRT_interaction)

               significant  non-significant
names                                      
cognition                6              131
non-cognition          517            15028
Odds ratio for LRT interaction:  1.33


In [40]:
# Plot the 20 most significant genes, the genes related to cognition that belong to the top significant genes, and a ratio of cognition and non-cognition genes in the top 20
top_20_LRT_interaction, top_cognition_LRT_interaction, ratio_LRT_interaction = top_list_genes(genes_LRT_interaction)
print("TOP 20 SIGNIFICANT: ", top_20_LRT_interaction, "\n")
print("TOP COGNITION: ", top_cognition_LRT_interaction, "\n")
print("Ratio cognition vs non-cognition in top 20:  %.2f" % ratio_LRT_interaction, "\n")

TOP 20 SIGNIFICANT:          gene    baseMean  log2FoldChange     lfcSE       stat        pvalue  \
13     Bap55  211.606972        1.735826  1.411814  68.215304  9.492066e-13   
434   PDCD-5  329.533022       -0.293894  0.782156  59.543504  5.572180e-11   
467    Sfmbt  175.519200        1.275570  1.198552  58.374416  9.621635e-11   
165  CG33228   70.345716       -0.293695  1.298651  58.036505  1.126563e-10   
507   Topors  109.047384       -1.478591  1.618646  53.212079  1.063388e-09   
157  CG32262  128.365673       -0.426489  1.624291  49.950597  4.809500e-09   
106  CG15432  205.536312        1.272443  1.122446  49.238765  6.678926e-09   
256   CG6867  188.936717        0.844858  1.536640  47.883475  1.246684e-08   
352     frtz  147.790531        0.133079  1.200585  47.863459  1.258215e-08   
514      uri   46.629745       -2.519746  1.760530  46.992643  1.877466e-08   
269   CG7382  429.362607        0.065773  0.876162  45.372730  3.946181e-08   
199  CG42666  123.774550       

So now we have seen that the main effect explaining differences in gene expression profiles between cell types is the neuron type. The effect of the treatment is neglectable compared to the effect of neuron_type when considering all the cell types together, as no significant genes show up from the DESeq analysis. There seems to be an interaction between treatment and neuron_type for some genes, but which concerns a substantially lower number of genes (only 523) compared to the genes concerned by the effect of neuron_type alone (4425 significant genes for the effect of neuron_type alone).

Therefore, we will now focus on the gene expression differences between the different cell types and the mean of other cell types.


# 2) Tests neuron_type vs mean(of other neuron types)

## a) DAL vs mean:

In [55]:
genes_mean_DAL = pd.read_excel(PATH_MY_DGE + MEAN_DAL)
genes_mean_DAL.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,14-3-3zeta,58800.016238,-0.507378,0.126177,-4.021153,5.791394e-05,0.00187989
1,2mit,5687.472251,-1.099488,0.233955,-4.699559,2.607234e-06,0.0001892107
2,5-HT1A,1726.547143,-1.928913,0.316882,-6.087171,1.149232e-09,2.538304e-07
3,5-HT1B,633.144348,-1.19975,0.36761,-3.263653,0.001099857,0.01653039
4,7B2,4951.678226,1.178411,0.166558,7.075088,1.493544e-12,6.597567e-10


In [56]:
contingency_table_mean_DAL = compute_contingency_table(genes_mean_DAL)
print(contingency_table_mean_DAL)
odds_ratio_mean_DAL = compute_odds_ratio(contingency_table_mean_DAL)
print("Odds ration for DAL vs mean:  %.2f" % odds_ratio_mean_DAL)

               significant  non-significant
names                                      
cognition               29              108
non-cognition         1034            14511
Odds ration for DAL vs mean:  3.77


In [57]:
top_20_mean_DAL, top_cognition_mean_DAL, ratio_mean_DAL = top_list_genes(genes_mean_DAL)
print("TOP 20 SIGNIFICANT: ", top_20_mean_DAL, "\n")
print("TOP COGNITION: ", top_cognition_mean_DAL, "\n")
print("Ratio cognition vs non-cognition in top 20:  %.2f" % ratio_mean_DAL, "\n")

TOP 20 SIGNIFICANT:           gene      baseMean  log2FoldChange     lfcSE       stat  \
800     Nplp1  10440.536927        5.733408  0.335161  17.106415   
390    CG4577  20973.006814        3.832801  0.269309  14.231979   
239    CG2444    199.232838        6.739462  0.577418  11.671725   
185   CG15863   2336.267074        6.632958  0.671931   9.871485   
975    SP1029    290.517613        6.557808  0.701299   9.350951   
1018  Trissin    686.665863        8.384197  0.914951   9.163552   
82    CG10970    232.246193        4.455296  0.508288   8.765296   
714       itp   4359.315416        6.451635  0.749585   8.606939   
27     Amyrel     46.635881        4.528883  0.529036   8.560638   
823      Pal1    529.381029        2.562674  0.305551   8.387044   
609       drl    715.326395       -2.504625  0.304543  -8.224216   
832       Pdi   2430.216105        1.610445  0.196091   8.212751   
333   CG34370   2639.691704       -2.270765  0.279549  -8.122964   
773       mub  24535.829534

## b) V2 vs mean:

In [58]:
genes_mean_V2 = pd.read_excel(PATH_MY_DGE + MEAN_V2)
genes_mean_V2.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,7SLRNA:CR32864,110.742163,-1.559759,0.380226,-4.102188,4.092616e-05,0.001619
1,7SLRNA:CR42652,112.44358,-1.479095,0.377617,-3.916916,8.968887e-05,0.003034
2,a10,199.626819,-5.274168,0.969823,-5.438278,5.379805e-08,6e-06
3,abba,40.742169,-3.212424,1.020721,-3.147212,0.001648356,0.026679
4,Ace,1058.38361,1.086597,0.347192,3.129675,0.001749999,0.027778


In [59]:
contingency_table_mean_V2 = compute_contingency_table(genes_mean_V2)
print(contingency_table_mean_V2)
odds_ratio_mean_V2 = compute_odds_ratio(contingency_table_mean_V2)
print("Odds ration for V2 vs mean:  %.2f" % odds_ratio_mean_V2)

               significant  non-significant
names                                      
cognition               10              127
non-cognition          793            14752
Odds ration for V2 vs mean:  1.46


In [60]:
top_20_mean_V2, top_cognition_mean_V2, ratio_mean_V2 = top_list_genes(genes_mean_V2)
print("TOP 20 SIGNIFICANT: ", top_20_mean_V2, "\n")
print("TOP COGNITION: ", top_cognition_mean_V2, "\n")
print("Ratio cognition vs non-cognition in top 20:  %.2f" % ratio_mean_V2, "\n")

TOP 20 SIGNIFICANT:          gene      baseMean  log2FoldChange     lfcSE      stat        pvalue  \
581   Pbprp2   1354.026651       -5.628119  0.684161 -8.226308  1.930717e-16   
229  CG33970     66.545035       -5.481426  0.715713 -7.658697  1.878284e-14   
663     RpL5  10906.993122       -1.478961  0.208290 -7.100478  1.243262e-12   
379  CR31451  39147.117104        1.252581  0.179055  6.995521  2.642744e-12   
427   eIF-5A   6421.676813       -1.510687  0.217913 -6.932520  4.134095e-12   
572      opa     22.200108        5.789115  0.835475  6.929132  4.234313e-12   
417      Drs   3066.622796       -4.189367  0.611014 -6.856422  7.060662e-12   
515     Lim1   1291.589471        2.954600  0.437198  6.758042  1.398689e-11   
693   RpS27A  11271.320766       -1.621930  0.240172 -6.753196  1.446232e-11   
656   RpL37A   9876.683863       -1.503450  0.228722 -6.573262  4.922475e-11   
625    RpL10   8885.102868       -1.381040  0.211417 -6.532318  6.475954e-11   
697    RpS30   6965

## c) V3 vs mean:

In [61]:
genes_mean_V3 = pd.read_excel(PATH_MY_DGE + MEAN_V3)#, index_col=0)
genes_mean_V3.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,14-3-3zeta,58800.016238,-0.692018,0.126181,-5.484307,4.150941e-08,9.01965e-06
1,5-HT1A,1726.547143,-1.291459,0.31671,-4.077728,4.547793e-05,0.002620634
2,a10,199.626819,5.48777,0.905319,6.061698,1.346916e-09,5.403207e-07
3,a5,10.039765,3.789862,1.140781,3.322166,0.0008932152,0.02281861
4,Ac78C,289.484002,-1.667509,0.504667,-3.304175,0.0009525625,0.02394031


In [62]:
contingency_table_mean_V3 = compute_contingency_table(genes_mean_V3)
print(contingency_table_mean_V3)
odds_ratio_mean_V3 = compute_odds_ratio(contingency_table_mean_V3)
print("Odds ration for V3 vs mean:  %.2f" % odds_ratio_mean_V3)

               significant  non-significant
names                                      
cognition               19              118
non-cognition          544            15001
Odds ration for V3 vs mean:  4.44


In [63]:
top_20_mean_V3, top_cognition_mean_V3, ratio_mean_V3 = top_list_genes(genes_mean_V3)
print("TOP 20 SIGNIFICANT: ", top_20_mean_V3, "\n")
print("TOP COGNITION: ", top_cognition_mean_V3, "\n")
print("Ratio cognition vs non-cognition in top 20:  %.2f" % ratio_mean_V3, "\n")

TOP 20 SIGNIFICANT:          gene      baseMean  log2FoldChange     lfcSE       stat        pvalue  \
301      dac   2019.257512       -5.999757  0.474046 -12.656492  1.030004e-36   
331      ems     50.161588       10.365422  0.848488  12.216349  2.542487e-34   
557     Vmat  15438.206106        6.841333  0.646564  10.581054  3.648323e-26   
156  CG32532    502.834618       -5.821082  0.605604  -9.612026  7.113522e-22   
136  CG30127    907.100136       -3.333472  0.356536  -9.349612  8.797011e-21   
537      Tbh    517.964119        6.488039  0.731205   8.873080  7.115047e-19   
387      jdp  21487.406914       -1.999472  0.242516  -8.244712  1.655589e-16   
527     sNPF   8920.430653       -4.551175  0.566257  -8.037298  9.184085e-16   
131   CG2269  14567.284896        1.843363  0.243257   7.577835  3.513697e-14   
65    CG1275   3179.437529        1.925265  0.263774   7.298919  2.900882e-13   
207   CG4577  20973.006814       -1.962925  0.269421  -7.285723  3.199491e-13   
389   J

## d) AB_KCs vs mean:

In [64]:
genes_mean_AB_KCs = pd.read_excel(PATH_MY_DGE + MEAN_AB_KCs)
genes_mean_AB_KCs.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,14-3-3zeta,58800.016238,1.477044,0.126164,11.707375,1.168416e-31,1.13655e-28
1,26-29-p,430.720199,-0.946367,0.362864,-2.608047,0.009106055,0.04540297
2,2mit,5687.472251,1.987936,0.233818,8.502063,1.8625090000000003e-17,3.496289e-15
3,4EHP,680.858997,1.289395,0.347747,3.70785,0.0002090267,0.001981032
4,5-HT1A,1726.547143,1.284474,0.316358,4.060188,4.903332e-05,0.0005557802


In [65]:
contingency_table_mean_AB_KCs = compute_contingency_table(genes_mean_AB_KCs)
print(contingency_table_mean_AB_KCs)
odds_ratio_mean_AB_KCs = compute_odds_ratio(contingency_table_mean_AB_KCs)
print("Odds ration for AB_KCs vs mean:  %.2f" % odds_ratio_mean_AB_KCs)

               significant  non-significant
names                                      
cognition               56               81
non-cognition         2141            13404
Odds ration for AB_KCs vs mean:  4.33


In [66]:
top_20_mean_AB_KCs, top_cognition_mean_AB_KCs, ratio_mean_AB_KCs = top_list_genes(genes_mean_AB_KCs)
print("TOP 20 SIGNIFICANT: ", top_20_mean_AB_KCs, "\n")
print("TOP COGNITION: ", top_cognition_mean_AB_KCs, "\n")
print("Ratio cognition vs non-cognition in top 20:  %.2f" % ratio_mean_AB_KCs, "\n")

TOP 20 SIGNIFICANT:              gene      baseMean  log2FoldChange     lfcSE       stat  \
1734      Pka-C1  13758.814627        2.743207  0.178530  15.365566   
1591         mub  24535.829534        3.140968  0.205253  15.302918   
1287      Eip93F   9992.995895        3.185652  0.234390  13.591245   
52          Appl  24678.701098        2.097919  0.156448  13.409709   
2157       VGlut  25766.143689       -3.213417  0.247309 -12.993548   
1264        dysc  14458.194629        2.326421  0.182048  12.779162   
1232        dlg1   7179.380820        2.157975  0.170041  12.690913   
1808         Rdl  26184.966497        2.281387  0.182023  12.533498   
1804         Rbp   6035.953383        2.485934  0.198814  12.503832   
788      CG42784   6268.198698        2.850054  0.232752  12.245045   
0     14-3-3zeta  58800.016238        1.477044  0.126164  11.707375   
1235         dnc  16732.863922        1.736376  0.150082  11.569542   
1471         jdp  21487.406914        2.795227  0.242402

## e) G_KCs vs mean:

In [67]:
genes_mean_G_KCs = pd.read_excel(PATH_MY_DGE + MEAN_G_KCs)
genes_mean_G_KCs.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,14-3-3zeta,58800.016238,1.414578,0.117141,12.075836,1.4171660000000001e-33,7.2208e-31
1,2mit,5687.472251,1.027394,0.217334,4.727267,2.275622e-06,2.86798e-05
2,4EHP,680.858997,0.904951,0.323782,2.794943,0.005190884,0.02791078
3,5-HT1B,633.144348,1.306842,0.341676,3.824803,0.0001308765,0.001140373
4,7SLRNA:CR32864,110.742163,-1.667503,0.354855,-4.699113,2.612941e-06,3.258562e-05


In [68]:
contingency_table_mean_G_KCs = compute_contingency_table(genes_mean_G_KCs)
print(contingency_table_mean_G_KCs)
odds_ratio_mean_G_KCs = compute_odds_ratio(contingency_table_mean_G_KCs)
print("Odds ration for G_KCs vs mean:  %.2f" % odds_ratio_mean_G_KCs)

               significant  non-significant
names                                      
cognition               53               84
non-cognition         2174            13371
Odds ration for G_KCs vs mean:  3.88


In [69]:
top_20_mean_G_KCs, top_cognition_mean_G_KCs, ratio_mean_G_KCs = top_list_genes(genes_mean_G_KCs)
print("TOP 20 SIGNIFICANT: ", top_20_mean_G_KCs, "\n")
print("TOP COGNITION: ", top_cognition_mean_G_KCs, "\n")
print("Ratio cognition vs non-cognition in top 20:  %.2f" % ratio_mean_G_KCs, "\n")

TOP 20 SIGNIFICANT:           gene      baseMean  log2FoldChange     lfcSE       stat  \
2170      Ubx   3622.856686       -8.909739  0.376939 -23.637062   
1471     IM23   3377.318631       -7.344916  0.460354 -15.954926   
348   CG15065   1647.193271       -6.157349  0.388672 -15.842032   
1631      mub  24535.829534        2.967761  0.190712  15.561450   
1780   Pka-C1  13758.814627        2.526723  0.165835  15.236334   
1467      IM1   4288.370133       -6.625489  0.438881 -15.096321   
1470      IM2  10813.385749       -6.148889  0.412281 -14.914304   
1469     IM14   7217.546238       -5.852901  0.395753 -14.789283   
1472      IM3  14636.066873       -6.237948  0.427355 -14.596660   
885    CG5773   1278.240366       -7.011064  0.491767 -14.256888   
1473      IM4   6014.124213       -5.817683  0.417721 -13.927200   
448   CG18107   2456.373766       -5.928098  0.443505 -13.366482   
349   CG15067   8907.535992       -5.927210  0.449496 -13.186355   
887    CG5791   2349.259034

## f) R27 vs mean:

In [70]:
genes_mean_R27 = pd.read_excel(PATH_MY_DGE + MEAN_R27)
genes_mean_R27.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,140up,103.708171,-2.133845,0.721236,-2.958593,0.003090469,0.01465787
1,14-3-3epsilon,11584.976852,-0.372036,0.144318,-2.577886,0.009940669,0.03719382
2,14-3-3zeta,58800.016238,-0.959431,0.138606,-6.921993,4.453334e-12,2.955639e-10
3,4EHP,680.858997,-0.932889,0.381164,-2.447475,0.01438613,0.04985397
4,5-HT1A,1726.547143,1.188162,0.346324,3.430785,0.0006018368,0.00373842


In [71]:
contingency_table_mean_R27 = compute_contingency_table(genes_mean_R27)
print(contingency_table_mean_R27)
odds_ratio_mean_R27 = compute_odds_ratio(contingency_table_mean_R27)
print("Odds ration for R27 vs mean:  %.2f" % odds_ratio_mean_R27)

               significant  non-significant
names                                      
cognition               60               77
non-cognition         2796            12749
Odds ration for R27 vs mean:  3.55


In [72]:
top_20_mean_R27, top_cognition_mean_R27, ratio_mean_R27 = top_list_genes(genes_mean_R27)
print("TOP 20 SIGNIFICANT: ", top_20_mean_R27, "\n")
print("TOP COGNITION: ", top_cognition_mean_R27, "\n")
print("Ratio cognition vs non-cognition in top 20:  %.2f" % ratio_mean_R27, "\n")

TOP 20 SIGNIFICANT:           gene      baseMean  log2FoldChange     lfcSE       stat  \
2799    VGlut  25766.143689        3.076564  0.270912  11.356326   
802   CG31221  26621.149012       -3.902386  0.390656  -9.989311   
2122      mub  24535.829534       -2.220923  0.225435  -9.851714   
792    CG3104    179.477070       -8.005783  0.823363  -9.723273   
2115    mthl8    322.598079       -7.162813  0.740210  -9.676735   
2392    rho-7    146.265426       -6.602217  0.688769  -9.585526   
2228      pan   9561.539163       -2.438080  0.255730  -9.533802   
1648      DAT    635.822693       -7.209934  0.765272  -9.421407   
1953     Iswi    197.790872       -5.764627  0.612761  -9.407629   
46        Ald  17495.929955        1.279212  0.136365   9.380760   
2450   RpL37a   8730.533595        2.106822  0.226051   9.320111   
1697      Drs   3066.622796        6.027949  0.658498   9.154084   
2508    Saf-B    431.301379       -4.622846  0.512441  -9.021224   
1650      Dbi   2809.216590

## g) G386 vs mean:

In [73]:
genes_mean_G386 = pd.read_excel(PATH_MY_DGE + MEAN_G386)
genes_mean_G386.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,40421,164.82008,-1.456397,0.48925,-2.976798,0.002912756,0.01353779
1,40512,15.273559,4.668935,1.141927,4.088646,4.338994e-05,0.000398231
2,128up,166.372302,1.750998,0.490266,3.571525,0.0003549087,0.002341475
3,14-3-3zeta,58800.016238,-0.864278,0.126198,-6.848586,7.458331e-12,5.051776e-10
4,2mit,5687.472251,-0.836722,0.234015,-3.575499,0.0003495607,0.002315213


In [74]:
contingency_table_mean_G386 = compute_contingency_table(genes_mean_G386)
print(contingency_table_mean_G386)
odds_ratio_mean_G386 = compute_odds_ratio(contingency_table_mean_G386)
print("Odds ration for G386 vs mean:  %.2f" % odds_ratio_mean_G386)

               significant  non-significant
names                                      
cognition               59               78
non-cognition         2938            12607
Odds ration for G386 vs mean:  3.25


In [75]:
top_20_mean_G386, top_cognition_mean_G386, ratio_mean_G386 = top_list_genes(genes_mean_G386)
print("TOP 20 SIGNIFICANT: ", top_20_mean_G386, "\n")
print("TOP COGNITION: ", top_cognition_mean_G386, "\n")
print("Ratio cognition vs non-cognition in top 20:  %.2f" % ratio_mean_G386, "\n")

TOP 20 SIGNIFICANT:           gene      baseMean  log2FoldChange     lfcSE       stat  \
1640  CR43212     19.487941        6.850780  0.483138  14.179756   
2940    VGlut  25766.143689        2.786228  0.247098  11.275824   
2142     Lgr1    569.200952       -7.899626  0.731686 -10.796469   
1799      ear    164.206692       -7.755911  0.735536 -10.544572   
2916      Ubx   3622.856686        3.904110  0.376359  10.373360   
1295   CG6443    246.925448       -6.543448  0.647936 -10.098916   
1900   Gapdh1   2868.651287        1.827306  0.180687  10.113099   
2733     Slbp    123.900016       -7.193336  0.733765  -9.803325   
2187     Mef2   3490.889867       -2.439977  0.251824  -9.689213   
2140     levy   4351.126806        1.731675  0.178751   9.687640   
2383      Pdh   1599.665069        5.254179  0.543827   9.661499   
1050  CG42368   2160.347323        4.346994  0.455890   9.535178   
214   CG10543   1047.895148       -2.479450  0.263695  -9.402733   
1642  CR43214     14.767597

# Pairwise Wald test comparisons

In [45]:
genes_DAL_V2 = pd.read_excel(PATH_MY_DGE + PAIRWISE_DAL + "V2.xlsx", index_col=0)
genes_DAL_V3 = pd.read_excel(PATH_MY_DGE + PAIRWISE_DAL + "V3.xlsx", index_col=0)
genes_DAL_AB_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_DAL + "AB_KCs.xlsx", index_col=0)
genes_DAL_G_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_DAL + "G_KCs.xlsx", index_col=0)
genes_DAL_R27 = pd.read_excel(PATH_MY_DGE + PAIRWISE_DAL + "R27.xlsx", index_col=0)
genes_DAL_G386 = pd.read_excel(PATH_MY_DGE + PAIRWISE_DAL + "G386.xlsx", index_col=0)

In [21]:
genes_V2_DAL = pd.read_excel(PATH_MY_DGE + PAIRWISE_V2 + "DAL.xlsx", index_col=0)
genes_V2_V3 = pd.read_excel(PATH_MY_DGE + PAIRWISE_V2 + "V3.xlsx", index_col=0)
genes_V2_AB_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_V2 + "AB_KCs.xlsx", index_col=0)
genes_V2_G_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_V2 + "G_KCs.xlsx", index_col=0)
genes_V2_R27 = pd.read_excel(PATH_MY_DGE + PAIRWISE_V2 + "R27.xlsx", index_col=0)
genes_V2_G386 = pd.read_excel(PATH_MY_DGE + PAIRWISE_V2 + "G386.xlsx", index_col=0)

In [22]:
genes_V3_DAL = pd.read_excel(PATH_MY_DGE + PAIRWISE_V3 + "DAL.xlsx", index_col=0)
genes_V3_V2 = pd.read_excel(PATH_MY_DGE + PAIRWISE_V3 + "V2.xlsx", index_col=0)
genes_V3_AB_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_V3 + "AB_KCs.xlsx", index_col=0)
genes_V3_G_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_V3 + "G_KCs.xlsx", index_col=0)
genes_V3_R27 = pd.read_excel(PATH_MY_DGE + PAIRWISE_V3 + "R27.xlsx", index_col=0)
genes_V3_G386 = pd.read_excel(PATH_MY_DGE + PAIRWISE_V3 + "G386.xlsx", index_col=0)

In [23]:
genes_AB_KCs_DAL = pd.read_excel(PATH_MY_DGE + PAIRWISE_AB_KCs + "DAL.xlsx", index_col=0)
genes_AB_KCs_V2 = pd.read_excel(PATH_MY_DGE + PAIRWISE_AB_KCs + "V2.xlsx", index_col=0)
genes_AB_KCs_V3 = pd.read_excel(PATH_MY_DGE + PAIRWISE_AB_KCs + "V3.xlsx", index_col=0)
genes_AB_KCs_G_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_AB_KCs + "G_KCs.xlsx", index_col=0)
genes_AB_KCs_R27 = pd.read_excel(PATH_MY_DGE + PAIRWISE_AB_KCs + "R27.xlsx", index_col=0)
genes_AB_KCs_G386 = pd.read_excel(PATH_MY_DGE + PAIRWISE_AB_KCs + "G386.xlsx", index_col=0)

In [24]:
genes_G_KCs_DAL = pd.read_excel(PATH_MY_DGE + PAIRWISE_G_KCs + "DAL.xlsx", index_col=0)
genes_G_KCs_V2 = pd.read_excel(PATH_MY_DGE + PAIRWISE_G_KCs + "V2.xlsx", index_col=0)
genes_G_KCs_V3 = pd.read_excel(PATH_MY_DGE + PAIRWISE_G_KCs + "V3.xlsx", index_col=0)
genes_G_KCs_AB_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_G_KCs + "AB_KCs.xlsx", index_col=0)
genes_G_KCs_R27 = pd.read_excel(PATH_MY_DGE + PAIRWISE_G_KCs + "R27.xlsx", index_col=0)
genes_G_KCs_G386 = pd.read_excel(PATH_MY_DGE + PAIRWISE_G_KCs + "G386.xlsx", index_col=0)

In [25]:
genes_R27_DAL = pd.read_excel(PATH_MY_DGE + PAIRWISE_R27 + "DAL.xlsx", index_col=0)
genes_R27_V2 = pd.read_excel(PATH_MY_DGE + PAIRWISE_R27 + "V2.xlsx", index_col=0)
genes_R27_V3 = pd.read_excel(PATH_MY_DGE + PAIRWISE_R27 + "V3.xlsx", index_col=0)
genes_R27_AB_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_R27 + "AB_KCs.xlsx", index_col=0)
genes_R27_G_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_R27 + "G_KCs.xlsx", index_col=0)
genes_R27_G386 = pd.read_excel(PATH_MY_DGE + PAIRWISE_R27 + "G386.xlsx", index_col=0)

In [26]:
genes_G386_DAL = pd.read_excel(PATH_MY_DGE + PAIRWISE_G386 + "DAL.xlsx", index_col=0)
genes_G386_V2 = pd.read_excel(PATH_MY_DGE + PAIRWISE_G386 + "V2.xlsx", index_col=0)
genes_G386_V3 = pd.read_excel(PATH_MY_DGE + PAIRWISE_G386 + "V3.xlsx", index_col=0)
genes_G386_AB_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_G386 + "AB_KCs.xlsx", index_col=0)
genes_G386_G_KCs = pd.read_excel(PATH_MY_DGE + PAIRWISE_G386 + "G_KCs.xlsx", index_col=0)
genes_G386_R27 = pd.read_excel(PATH_MY_DGE + PAIRWISE_G386 + "R27.xlsx", index_col=0)

In [76]:
DAL = pd.read_excel(DGE_PAPER, sheet_name="Supplemental Table 6", usecols="A:H", skiprows=7)
DAL.dropna(axis=0, inplace=True)
DAL.head

Unnamed: 0,Genes,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,qvalue
0,CG9795,391.861,4.59727,0.745981,6.16271,7.15e-10,2.41e-06,2.22644e-06
1,Iris,613.726,4.31574,0.698002,6.18299,6.29e-10,2.41e-06,2.22644e-06
2,CG14872,306.428,6.62435,1.19361,5.54983,2.86e-08,5.46e-05,5.04452e-05
3,EndoGI,324.007,6.94372,1.25609,5.52806,3.24e-08,5.46e-05,5.04452e-05
4,CG7488,166.63,7.87991,1.49139,5.2836,1.27e-07,0.000171002,0.000158186
...,...,...,...,...,...,...,...,...
7330,,,,,,,,
7331,,,,,,,,
7332,,,,,,,,
7333,,,,,,,,
