In [None]:
%run ../config/init.py

### Creating results folders

In [None]:
# Folder holding the differential gene analysis (DGA) tables that are read by
# the GO enrichment cell further down.
data_dir = os.path.join(RESULTS, DATASET, 'dga')
# NOTE(review): working_dir is not defined in the visible cells (presumably it
# comes from the init script); it looks like it creates the folder and may make
# it the current directory, since later outputs use bare file names -- confirm.
result_dir = working_dir(os.path.join(RESULTS, DATASET, 'go'))
# Sample sheet of the dataset; its 'condition' column drives the comparisons.
sample_table_file = os.path.join(DATA, DATASET, 'sample_table.csv')
# keep_default_na=False keeps values such as "NA" as plain strings instead of NaN.
sample_table = pandas.read_csv(sample_table_file, keep_default_na=False)
sample_table.head()

### Creating comparisons
In this cell, a list with all pairwise combinations of **conditions** is created.  
 
If you only want to explore a specific set of comparisons, remove this cell and define the **comparisons** list manually:
```
comparisons = [
    ['cond1', 'cond2'],
    ['cond2', 'cond3']
]
``` 

In [None]:
# Every unordered pair of distinct conditions, each stored as a two-item list
# in the order the conditions first appear in the sample table.
comparisons = [
    list(pair)
    for pair in itertools.combinations(sample_table['condition'].unique(), 2)
]
comparisons

### Loading Gene2GO database from NCBI FTP
This database is created with the GO-enrichment python package: https://gitlab.com/r78v10a07/goenrichment

Pre-computed databases are available at: ftp://ftp.ncbi.nlm.nih.gov/pub/goenrichment

### Loading Gene2GO database for the selected organism

In [None]:
# Location of the pre-computed GO enrichment database (a pickle file on the NCBI FTP).
# NOTE(review): the organism part of the file name is a template placeholder,
# presumably filled in when the project is generated -- confirm it is rendered
# before this notebook is executed.
goenrichDB = "ftp://ftp.ncbi.nlm.nih.gov/pub/goenrichment/goenrichDB_{% cookiecutter.organism %}.pickle"
# load_goenrichdb is not defined in the visible cells (presumably provided by
# the init script). The object it returns is indexed below with the keys
# 'alt_id', 'graph' and 'M'.
godb = load_goenrichdb(goenrichDB)
# Quick sanity report of what was loaded.
print('There are %d alternative ids in database' % (len(godb['alt_id'])))
print('There are %d GO terms' % (len(godb['graph'].nodes())))
print('There are %d genes in database' % (godb['M']))

### GO enrichment parameters

In [None]:
# Settings forwarded unchanged to calculate() in the enrichment cell.
# NOTE(review): the meanings below are inferred from the names -- confirm
# against the goenrichment package documentation.
min_category_depth = 4    # presumably the minimum depth of a GO term in the graph
min_category_size = 3     # presumably the minimum number of genes in a GO category
max_category_size = 500   # presumably the maximum number of genes in a GO category

### GO enrichment analysis

In [None]:
# Columns of an empty enrichment table. Used when a direction has no significant
# genes so the filtering, export and plotting code below can run unchanged.
_GO_COLUMNS = ['term', 'name', 'namespace', 'depth', 'k', 'n', 'p', 'q', '-1.0log(q)', 'accepted']

# Maximum number of GO terms plotted per direction (over / under expressed),
# i.e. at most 50 bars per figure.
_MAX_TERMS_PER_DIRECTION = 25


def _enriched_terms(dga_df, selected, label):
    """Return the GO terms enriched in the genes of ``dga_df`` flagged by ``selected``.

    Parameters
    ----------
    dga_df : pandas.DataFrame
        Differential gene analysis table with a ``Gene_Id`` column.
    selected : pandas.Series of bool
        Row mask choosing the genes used as the enrichment query.
    label : str
        ``'over'`` or ``'under'``; only used in the progress message.

    Returns
    -------
    pandas.DataFrame
        Rows returned by ``calculate`` whose ``q`` value is at most ``fdr``, or
        an empty frame with the ``_GO_COLUMNS`` layout when no gene is selected.

    Notes
    -----
    ``calculate``, ``godb``, ``fdr`` and the category settings are read from the
    notebook namespace. ``calculate`` and ``fdr`` are not defined in the cells
    visible here (presumably they come from the init script -- confirm).
    """
    query = set(dga_df[selected]['Gene_Id'].tolist())
    if not query:
        return pandas.DataFrame(columns=_GO_COLUMNS)
    print('\tProcessing {0} {1} expressed gene list'.format(len(query), label))
    go_df = calculate(godb, query, fdr, min_category_depth, min_category_size, max_category_size)
    return go_df[go_df['q'] <= fdr]


# For every pair of conditions: read the DGA table, run the GO enrichment for
# the over and under expressed genes, drop the GO terms found in both
# directions, then write one CSV per direction and one bar plot per namespace.
# `fdr` and `fc` (FDR and log fold-change cut-offs) are expected to exist in the
# notebook namespace; they are not defined in the cells visible here.
for c in comparisons:
    print('Processing %s vs %s' % (c[0], c[1]))
    dga_file = os.path.join(data_dir, 'condition_{0}_vs_{1}_union.csv'.format(c[0], c[1]))
    if not os.path.exists(dga_file):
        # Report the skip instead of silently moving on to the next comparison.
        print('\tSkipping: {0} not found'.format(dga_file))
        continue

    dga_df = pandas.read_csv(dga_file)
    # Gene_Id is stored as '<gene>_<chr>_<start>'; keep only the gene part in Gene_Id.
    dga_df[['Gene_Id', 'Chr', 'start']] = dga_df['Gene_Id'].str.split('_', n=2, expand=True)

    significant = dga_df['FDR'] <= fdr
    over_df = _enriched_terms(dga_df, significant & (dga_df['logFC'] >= fc), 'over')
    under_df = _enriched_terms(dga_df, significant & (dga_df['logFC'] <= -1.0 * fc), 'under')

    # Remove the terms enriched in BOTH directions from both tables. The shared
    # set is computed first: filtering one table against the other sequentially
    # would leave the shared terms in the table that is filtered second.
    shared_terms = set(over_df['term']).intersection(under_df['term'])
    # Always sort by q so that head() below really selects the top terms, even
    # when one of the two directions is empty.
    over_df = over_df[~over_df['term'].isin(shared_terms)].sort_values('q')
    under_df = under_df[~under_df['term'].isin(shared_terms)].sort_values('q')

    union_set = set().union(over_df['namespace'].unique(), under_df['namespace'].unique())
    if not union_set:
        print('No differential GO terms were identified')
        continue

    # Sorted so that the output order is the same on every run.
    for d in sorted(union_set):
        print('\tGO namespace: ' + d)
        df1 = over_df[over_df['namespace'] == d]
        file_name = 'go_over_{0}_{1}_vs_{2}_union.csv'.format(d, c[0], c[1])
        df1.to_csv(file_name, index=False)
        print('\t\tGO terms for genes over expressed: %d' % len(df1))
        df2 = under_df[under_df['namespace'] == d]
        file_name = 'go_under_{0}_{1}_vs_{2}_union.csv'.format(d, c[0], c[1])
        df2.to_csv(file_name, index=False)
        print('\t\tGO terms for genes under expressed: %d' % len(df2))

        # Keep the most significant terms of each direction and tag them with
        # the bar colour used in the figure.
        df1 = df1.head(_MAX_TERMS_PER_DIRECTION).reset_index(drop=True).assign(color='red')
        df2 = df2.head(_MAX_TERMS_PER_DIRECTION).reset_index(drop=True).assign(color='blue')
        # At least one side is non-empty because d comes from their namespaces;
        # empty frames are left out so they cannot alter the column dtypes.
        df = pandas.concat([part for part in (df1, df2) if not part.empty])
        df = df.sort_values('q')

        fig, ax = plt.subplots(figsize=(20, 18))
        objects = df['name'].tolist()
        x_pos = np.arange(len(objects))
        counts = df['-1.0log(q)'].tolist()
        colors = df['color'].tolist()
        ax.barh(x_pos, counts, align='center', color=colors)
        ax.set_yticks(x_pos)
        ax.set_yticklabels(objects)
        ax.set_xlabel('-1.0log(q)')
        ax.set_title('Top 50 GO terms for: ' + d)
        red_patch = mpatches.Patch(color='red', label='GO term from over expressed genes')
        blue_patch = mpatches.Patch(color='blue', label='GO term from under expressed genes')
        ax.legend(handles=[red_patch, blue_patch], prop={'size': 22})
        file_name = 'go_{0}_{1}_vs_{2}_union.png'.format(d, c[0], c[1])
        fig.savefig(file_name, bbox_inches="tight")
        plt.show()
        # Release the figure; many figures are created inside this loop.
        plt.close(fig)