### Goal

Test for overrepresentation of protein domains in genes associated with high number of CNEs (defined by counting CNEs in windows of various sizes around each gene).

### Input

- gene_IPR.tsv: protein domain information for all genes (generated with find_closest_gene.ipynb)
- all_IPR_desc.tsv: protein domain info (IDs and descriptions)
- all_species_nearby_cnes.pickle: Dictionary of nearby CNEs generated with find_nearby_cnes.ipynb.


### Output

- species_ID + '_nearby_cne_counts.tsv': dataframes of CNE counts around every gene for all distance thresholds
- species_ID + '_overrep_domains_' + str(dist) + '_kb.tsv' : dataframes of overrepresented protein domains and p-values (one file per species and distance threshold)
- multi_threshold_score.tsv : dataframe of protein domains most commonly overrrepresented across multiple distance thresholds and species

In [21]:
import sys
import scipy.stats as stats
import pandas as pd
import glob
import statistics
import collections
import pickle
import os

In [2]:
species_list = ['apis', 'aaeg', 'aamp', 'amel', 'alic', 'anas', 'cscu', 'cdip', 'csec', 'cfel', 'dmel', 
                'eaff', 'gocc', 'hazt', 'lpol', 'ahyp', 'obir', 'phum', 'pvan', 'ptri', 'smim', 
                'smar', 'tpal', 'tcas']

#### Load protein domain info

In [10]:
gene_IPR_file = "../../overrep_domains/new_parse_gff/avg/gene_IPR.tsv"
gene_IPR_df = pd.read_csv(gene_IPR_file, sep="\t")
gene_IPR_df

Unnamed: 0,species,gene_id,IPR_id
0,phum,gene-Phum_PHUM625480,no_IPR
1,phum,gene-Phum_PHUM625380,no_IPR
2,phum,gene-Phum_PHUM625280,IPR002110
3,phum,gene-Phum_PHUM625280,IPR020683
4,phum,gene-Phum_PHUM625280,IPR036770
...,...,...,...
1919966,cscu,gene-LOC111617813,IPR038441
1919967,cscu,gene-LOC111617813,IPR016177
1919968,cscu,gene-LOC111617814,IPR001878
1919969,cscu,gene-LOC111617812,IPR027806


In [3]:
all_IPRS_desc = pd.read_csv("../avg/all_IPR_desc.tsv", sep="\t")
all_IPRS_desc

Unnamed: 0,IPR_id,description
0,IPR003961,Fibronectin type III
1,,
2,IPR036116,Fibronectin type III superfamily
3,IPR003598,Immunoglobulin subtype 2
4,IPR007110,Immunoglobulin-like domain
...,...,...
13988,IPR036269,"Rho termination factor, N-terminal domain supe..."
13989,IPR011112,"Rho termination factor, N-terminal"
13990,IPR026104,Zinc finger C2HC domain-containing protein 1C
13991,IPR019757,"Peptidase S26A, signal peptidase I, lysine act..."


#### Load nearby CNE data (from find_nearby_CNEs.ipynb)

In [22]:
nearby_cne_file = 'all_species_nearby_cnes.pickle'

In [23]:
with open(nearby_cne_file, 'rb') as handle:
    nearby_cne_dict = pickle.load(handle)

### Create CNE count files

In [29]:
for species, threshold_dict in nearby_cne_dict.items():
    print(species)
    output_file_name = species + '_nearby_cne_counts.tsv'
    print("Creating output data frame")
    cne_counts_df = pd.DataFrame()
    for threshold, dist_dict in threshold_dict.items():
        print(threshold)
        column_name = 'cne_count_' + str(int(threshold/1000)) + '_kb'
        print(column_name)
        for gene_id, cne_list in dist_dict.items():
            cne_counts = len(cne_list)
            cne_counts_df.loc[gene_id, column_name] = cne_counts
        if column_name in cne_counts_df.columns:
            cne_counts_df[column_name] = cne_counts_df[column_name].astype(int)
    num_columns = len(cne_counts_df.columns)
    if num_columns > 0:
        cne_counts_df = cne_counts_df.sort_values(by=cne_counts_df.columns[0], ascending=False)
    cne_counts_df.index.name = 'gene_id'
    cne_counts_df.reset_index(inplace=True)
    print("Writing cne count data frame to file")
    cne_counts_df.to_csv(output_file_name, sep="\t")

apis
Creating output data frame
10000
cne_count_10_kb
25000
cne_count_25_kb
50000
cne_count_50_kb
75000
cne_count_75_kb
100000
cne_count_100_kb
200000
cne_count_200_kb
300000
cne_count_300_kb
400000
cne_count_400_kb
500000
cne_count_500_kb
there are  9 columns in data frame
Writing cne count data frame to file
aaeg
Creating output data frame
10000
cne_count_10_kb
25000
cne_count_25_kb
50000
cne_count_50_kb
75000
cne_count_75_kb
100000
cne_count_100_kb
200000
cne_count_200_kb
300000
cne_count_300_kb
400000
cne_count_400_kb
500000
cne_count_500_kb
there are  9 columns in data frame
Writing cne count data frame to file
aamp
Creating output data frame
10000
cne_count_10_kb
25000
cne_count_25_kb
50000
cne_count_50_kb
75000
cne_count_75_kb
100000
cne_count_100_kb
200000
cne_count_200_kb
300000
cne_count_300_kb
400000
cne_count_400_kb
500000
cne_count_500_kb
there are  9 columns in data frame
Writing cne count data frame to file
amel
Creating output data frame
10000
cne_count_10_kb
25000
cne_

### Run fisher test for all species

In [62]:
#### Distance thresholds
dist_list = [10, 25, 50, 75, 100, 200, 300 , 400 , 500] 

#### CNE count file

In [30]:
species = 'apis'
nearby_cne_df = pd.read_csv(species + '_nearby_cne_counts.tsv', sep="\t", index_col=0)
nearby_cne_df

Unnamed: 0,gene_id,cne_count_10_kb,cne_count_25_kb,cne_count_50_kb,cne_count_75_kb,cne_count_100_kb,cne_count_200_kb,cne_count_300_kb,cne_count_400_kb,cne_count_500_kb
0,gene-LOC100570391,4,4,4,4,4,4,4,5,5
1,gene-LOC100169440,4,4,4,4,4,4,4,4,4
2,gene-LOC100162333,4,4,4,4,4,4,4,4,4
3,gene-COX1,3,3,3,3,3,3,3,3,3
4,gene-LOC100167988,3,3,3,3,3,4,4,4,4
...,...,...,...,...,...,...,...,...,...,...
18278,gene-LOC100160914,0,0,0,0,0,0,0,0,0
18279,gene-Eif1ad,0,0,0,0,0,0,0,0,0
18280,gene-LOC103307715,0,0,0,0,0,0,0,0,0
18281,gene-LOC100573967,0,0,0,0,0,0,0,0,0


### Function that runs Fisher test

In [66]:
def fisher_test(merged_data, distance):
    fisher_results = {}
    pvalues = []
    column_name = "cne_count_" + str(distance) + '_kb'
    print(len(sp_df))
    print(len(merged_data))
    mean_cne = statistics.mean(list(sp_df[column_name]))
    stdev_cne =  statistics.stdev(list(sp_df[column_name]))
    cne_threshold = mean_cne # can also be run with avg + stdev or other threshold
    # Count genes with higher than avg CNEs
    with_CNE_df = merged_data[merged_data[column_name] > cne_threshold]
    without_CNE_df = merged_data[merged_data[column_name] <= cne_threshold]
    gene_count_wCNE = len(with_CNE_df['gene_id'].unique())
    gene_count_noCNE = len(without_CNE_df['gene_id'].unique())
    # Only test IPRs present in at least one gene with higher than average CNEs
    test_IPRs = list(with_CNE_df['IPR_id'].unique())
    print("Testing: ", len(test_IPRs), "distinct IPRs")
    print("cne threshold:", cne_threshold)
    for test_IPR in test_IPRs:
        # genes with high CNEs and containing protein domain
        gene_count_wCNE_wIPR = len(with_CNE_df[with_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
        # genes with high CNEs not containing protein domain
        gene_count_wCNE_noIPR = gene_count_wCNE - gene_count_wCNE_wIPR
        # genes with low CNEs and containing protein domain
        gene_count_noCNE_wIPR = len(without_CNE_df[without_CNE_df['IPR_id'] == test_IPR]['gene_id'].unique())
        # genes with low CNEs not containing protein domain
        gene_count_noCNE_noIPR = gene_count_noCNE - gene_count_noCNE_wIPR
        # Make contingency table for Fisher test
        cont_matrix = [[gene_count_wCNE_wIPR, gene_count_wCNE_noIPR], [gene_count_noCNE_wIPR, gene_count_noCNE_noIPR]]
        # Run test
        oddsratio, pvalue = stats.fisher_exact(cont_matrix, alternative = 'greater')
        pvalues.append(pvalue)
    # Bonferroni pvalue corrrection
    padj = [pval * len(pvalues) for pval in pvalues]
    table = {'IPR_id':test_IPRs, 'pvalue':pvalues, 'corrected pvalue':padj}
    pval_df = pd.DataFrame(table)
    pval_df = pval_df.sort_values(by = ['corrected pvalue'])
    pval_df = pval_df.reset_index(drop = True)
    # Only keep domains with pvalue <= 0.05
    pval_df = pval_df[pval_df['corrected pvalue'] <= 0.05]
    pval_df = pval_df
    return(pval_df, cne_threshold)

In [72]:
out_dir = 'avg/'
os.mkdir(out_dir)
for species in species_list:
    nearby_cne_df = pd.read_csv(species + '_nearby_cne_counts.tsv', sep="\t", index_col=0)
    sp_df = nearby_cne_df.groupby('gene_id').sum().reset_index() # no redundant IPRs or CNE counts
    ### Combine nearby CNE info with protein domain info
    merged_data = gene_IPR_df[gene_IPR_df['species'] == species].merge(sp_df, how='left') # one row per IPR
    print("Running Fisher tests")
    for dist in dist_list:
        print("Distance:", dist)
        fisher_results, cne_threshold = fisher_test(merged_data, dist)
        fisher_results = fisher_results.merge(all_IPRS_desc, how='left')
        print("Writing output to file")
        filepath = out_dir + species + '_overrep_domains_' + str(dist) + '_kb.tsv'
        fisher_results.to_csv(filepath, sep = '\t', index = False)

Running Fisher tests
Distance: 10
18283
88452
Testing:  922 distinct IPRs
cne threshold: 0.04085762730405294
Writing output to file
Distance: 25
18283
88452
Testing:  1420 distinct IPRs
cne threshold: 0.0777771700486791
Writing output to file
Distance: 50
18283
88452
Testing:  2210 distinct IPRs
cne threshold: 0.14084121861838866
Writing output to file
Distance: 75
18283
88452
Testing:  2845 distinct IPRs
cne threshold: 0.20018596510419515
Writing output to file
Distance: 100
18283
88452
Testing:  3371 distinct IPRs
cne threshold: 0.26171853634523873
Writing output to file
Distance: 200
18283
88452
Testing:  4785 distinct IPRs
cne threshold: 0.5044029973199147
Writing output to file
Distance: 300
18283
88452
Testing:  5765 distinct IPRs
cne threshold: 0.7373516381337855
Writing output to file
Distance: 400
18283
88452
Testing:  6526 distinct IPRs
cne threshold: 0.9596893288847563
Writing output to file
Distance: 500
18283
88452
Testing:  4427 distinct IPRs
cne threshold: 1.180003281737

Writing output to file
Running Fisher tests
Distance: 10
13170
101339
Testing:  2372 distinct IPRs
cne threshold: 0.15056947608200455
Writing output to file
Distance: 25
13170
101339
Testing:  3037 distinct IPRs
cne threshold: 0.22399392558845863
Writing output to file
Distance: 50
13170
101339
Testing:  3902 distinct IPRs
cne threshold: 0.34001518602885344
Writing output to file
Distance: 75
13170
101339
Testing:  4610 distinct IPRs
cne threshold: 0.4463933181473045
Writing output to file
Distance: 100
13170
101339
Testing:  5083 distinct IPRs
cne threshold: 0.5405466970387244
Writing output to file
Distance: 200
13170
101339
Testing:  6344 distinct IPRs
cne threshold: 0.8706909643128322
Writing output to file
Distance: 300
13170
101339
Testing:  4607 distinct IPRs
cne threshold: 1.1363705391040242
Writing output to file
Distance: 400
13170
101339
Testing:  5183 distinct IPRs
cne threshold: 1.3504935459377372
Writing output to file
Distance: 500
13170
101339
Testing:  5603 distinct IP

Writing output to file
Running Fisher tests
Distance: 10
11927
85012
Testing:  2520 distinct IPRs
cne threshold: 1.2672926972415528
Writing output to file
Distance: 25
11927
85012
Testing:  3635 distinct IPRs
cne threshold: 2.580196193510522
Writing output to file
Distance: 50
11927
85012
Testing:  3907 distinct IPRs
cne threshold: 5.228053995137084
Writing output to file
Distance: 75
11927
85012
Testing:  4137 distinct IPRs
cne threshold: 8.282049132221012
Writing output to file
Distance: 100
11927
85012
Testing:  4483 distinct IPRs
cne threshold: 11.597384086526368
Writing output to file
Distance: 200
11927
85012
Testing:  4620 distinct IPRs
cne threshold: 26.577597048713002
Writing output to file
Distance: 300
11927
85012
Testing:  4892 distinct IPRs
cne threshold: 42.275928565439756
Writing output to file
Distance: 400
11927
85012
Testing:  5091 distinct IPRs
cne threshold: 58.69908610715184
Writing output to file
Distance: 500
11927
85012
Testing:  5095 distinct IPRs
cne threshold

Writing output to file


### Combine results

In [73]:
out_dir = 'avg/'
for dist in dist_list:
    thresh = str(dist) + '_kb'
    print(thresh)
    overrep_domain_files = glob.glob(out_dir + "*_overrep_domains_" + thresh + ".tsv")
    all_species_df = pd.DataFrame()
    for file in overrep_domain_files:
        species = file.split("_")[0]
        species_df = pd.read_csv(file, sep="\t")
        species_df['species'] = species
        all_species_df = all_species_df.append(species_df)
    counter=collections.Counter(list(all_species_df['IPR_id']))
    combined_results = pd.DataFrame(counter.most_common(), columns=['IPR_id', 'number_of_species'])
    combined_results = combined_results.merge(all_species_df[['description', 'IPR_id']], how='left', on='IPR_id').\
        drop_duplicates()
    cols = ['IPR_id', 'description', 'number_of_species']
    combined_results = combined_results[cols]
    combined_results.to_csv("all_species_overrep_domains_" + thresh + ".tsv", sep="\t", index=False)

10_kb


  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(speci

25_kb


  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(speci

50_kb


  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(speci

75_kb


  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(speci

100_kb


  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(speci

200_kb


  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(speci

300_kb
400_kb


  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(speci

500_kb


  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)
  all_species_df = all_species_df.append(species_df)


In [74]:
combined_results

Unnamed: 0,IPR_id,description,number_of_species
0,IPR017970,"Homeobox, conserved site",5
5,IPR001356,Homeobox domain,5
10,IPR009072,Histone-fold,4
14,IPR007125,Histone H2A/H2B/H3,4
18,IPR012464,Protein of unknown function DUF1676,4
22,IPR009057,Homeobox-like domain superfamily,4
26,IPR000558,Histone H2B,3
29,IPR002119,Histone H2A,3
32,IPR032454,"Histone H2A, C-terminal domain",3
35,IPR032458,Histone H2A conserved site,3


### Identify protein domains most commonly overrepresented (over multiple distance thresholds and number of species)

In [75]:
from collections import defaultdict
IPR_count_dict = defaultdict(lambda:0)
all_sp_overrep_files = glob.glob("all_species_overrep_domains_*.tsv")
print(all_sp_overrep_files)
for file in all_sp_overrep_files:
    sp_count_df = pd.read_csv(file, sep="\t")
    for index, row in sp_count_df.iterrows():
        IPR_id = row['IPR_id']
        num_sp = row['number_of_species']
        IPR_count_dict[IPR_id] += num_sp
all_thresh_counts = pd.DataFrame(IPR_count_dict.items(), columns=['IPR_id', 'cumulative_score'])
all_thresh_counts = all_thresh_counts.merge(all_IPRS_desc, how='left')
all_thresh_counts = all_thresh_counts.sort_values('cumulative_score', ascending=False)
all_thresh_counts.to_csv(out_dir + "multi_threshold_score.tsv", sep="\t")
all_thresh_counts

['all_species_overrep_domains_400_kb.tsv', 'all_species_overrep_domains_500_kb.tsv', 'all_species_overrep_domains_25_kb.tsv', 'all_species_overrep_domains_300_kb.tsv', 'all_species_overrep_domains_200_kb.tsv', 'all_species_overrep_domains_50_kb.tsv', 'all_species_overrep_domains_10_kb.tsv', 'all_species_overrep_domains_75_kb.tsv', 'all_species_overrep_domains_100_kb.tsv']


Unnamed: 0,IPR_id,cumulative_score,description
1,IPR001356,40,Homeobox domain
0,IPR017970,39,"Homeobox, conserved site"
2,IPR009057,38,Homeobox-like domain superfamily
3,IPR009072,28,Histone-fold
4,IPR007125,27,Histone H2A/H2B/H3
...,...,...,...
139,IPR014753,1,"Arrestin, N-terminal"
138,IPR000698,1,Arrestin
15,IPR001163,1,"LSM domain, eukaryotic/archaea-type"
136,IPR015897,1,CHK kinase-like
