In [73]:
import pandas as pd
from collections import Counter
import csv
import scipy.stats as stats

In [2]:
## import Orthogroups with and without 70-15
og_no_70_15 = '../../pipeline_methods/Orthogroups.tsv'
og_w_70_15 = 'Orthogroups_w_70_15.tsv'

In [3]:
og_no_70_15_df = pd.read_csv(og_no_70_15, dtype='string', sep='\t', index_col = 0)
og_w_70_15_df = pd.read_csv(og_w_70_15, dtype='string', sep='\t', index_col = 0)

In [4]:
## to get the og that a gene belongs to for orthogrouping without 70_15
df_lol = og_no_70_15_df.values.tolist()

og_dict_no_70_15 = {}

for i, row in enumerate(df_lol):
    og = og_no_70_15_df.index[i]
    for cell in row:
        if not pd.isnull(cell):
            for protein in cell.split(', '):
                og_dict_no_70_15[protein] = og

In [5]:
## to get the og that a gene belongs to for orthogrouping with 70_15
df_lol = og_w_70_15_df.values.tolist()

og_dict_w_70_15 = {}

for i, row in enumerate(df_lol):
    og = og_w_70_15_df.index[i]
    for cell in row:
        if not pd.isnull(cell):
            for protein in cell.split(', '):
                og_dict_w_70_15[protein] = og

In [6]:
## to get all genes associated with one OG for orthogrouping without 70_15
genes_per_og_no_70_15 = {}

for gene in og_dict_no_70_15:
    og = og_dict_no_70_15[gene]
    if og not in genes_per_og_no_70_15:
        genes_per_og_no_70_15[og] = []
    genes_per_og_no_70_15[og].append(gene)

In [7]:
## to get all genes associated with one OG for orthogrouping with 70_15
genes_per_og_w_70_15 = {}

for gene in og_dict_w_70_15:
    og = og_dict_w_70_15[gene]
    if og not in genes_per_og_w_70_15:
        genes_per_og_w_70_15[og] = []
    genes_per_og_w_70_15[og].append(gene)

In [14]:
df_ks_clusters = pd.read_excel('Magnaporthe_Oryza_Structure_prediction_and_clustering_metadata.zenodo.xlsx',skiprows=3)

In [15]:
df_ks_clusters = df_ks_clusters[['Cluster','geneID']]

In [17]:
ogs_w_70_15 = []

for gene in df_ks_clusters.geneID:
    if gene in og_dict_w_70_15:
        ogs_w_70_15.append(og_dict_w_70_15[gene])
    else:
        ogs_w_70_15.append('not found')

df_ks_clusters['ogs_w_70_15'] = ogs_w_70_15

In [159]:
ogs_per_cluster_no_70_15 = {}
already_seen_og = []

for idx in df_ks_clusters.index:
    cluster = df_ks_clusters.Cluster[idx]
    og = df_ks_clusters.ogs_w_70_15[idx]
    if og in genes_per_og_w_70_15:
        genes_in_og = genes_per_og_w_70_15[og]
    else:
        continue
    for gene in genes_in_og:
        if gene in og_dict_no_70_15:
            if cluster not in ogs_per_cluster_no_70_15:
                ogs_per_cluster_no_70_15[cluster] = []
            cluster_ogs = ogs_per_cluster_no_70_15[cluster]
            if og_dict_no_70_15[gene] not in cluster_ogs and og_dict_no_70_15[gene] not in already_seen_og:
                    cluster_ogs.append(og_dict_no_70_15[gene])
                    already_seen_og.append(og_dict_no_70_15[gene]) ## deal with ogs in two different clusters

In [161]:
clusters_per_og_no_70_15 = {}

for cluster in ogs_per_cluster_no_70_15:
    ogs_per_cluster = ogs_per_cluster_no_70_15[cluster]
    for og in ogs_per_cluster:
        if og not in clusters_per_og_no_70_15:
            clusters_per_og_no_70_15[og] = cluster
## some of them appear twice but its pretty inconsequential, just keep the most populated cluster

In [163]:
annotated_ogs = []

for cluster in ogs_per_cluster_no_70_15:
    for og in ogs_per_cluster_no_70_15[cluster]:
        if og not in annotated_ogs:
            annotated_ogs.append(og)

In [164]:
ogs_per_cluster_no_70_15_counted = {}

for cluster in ogs_per_cluster_no_70_15:
    ogs_per_cluster_no_70_15_counted[cluster] = len(ogs_per_cluster_no_70_15[cluster])

In [165]:
lineage_differentiating_ogs = []

with open('../pca_heat_map_phylogeny/lineage_differentiating_pavs.txt') as file:
    file_reader = csv.reader(file, delimiter = '\t')
    for row in file_reader:
        lineage_differentiating_ogs.append(row[0])

In [166]:
annotated_lineage_differentiating_ogs = []

for og in lineage_differentiating_ogs:
    if og in annotated_ogs:
        annotated_lineage_differentiating_ogs.append(og)

In [167]:
ratio = len(annotated_lineage_differentiating_ogs)/len(annotated_ogs)

In [168]:
ogs_per_cluster_expected = {}

for cluster in ogs_per_cluster_no_70_15_counted:
    ogs_per_cluster_expected[cluster] = round(ogs_per_cluster_no_70_15_counted[cluster] * ratio, 2)

In [169]:
ogs_per_cluster_observed = {}

for og in lineage_differentiating_ogs:
    if og in clusters_per_og_no_70_15:
        cluster = clusters_per_og_no_70_15[og]
        if cluster not in ogs_per_cluster_observed:
            ogs_per_cluster_observed[cluster] = 1
        else:
            ogs_per_cluster_observed[cluster] += 1

for cluster in ogs_per_cluster_expected:
    if cluster not in ogs_per_cluster_observed:
        ogs_per_cluster_observed[cluster] = 0

In [170]:
ogs_per_cluster_notobserved = {}

for og in clusters_per_og_no_70_15:
    if og not in lineage_differentiating_ogs:
        cluster = clusters_per_og_no_70_15[og]
        if cluster not in ogs_per_cluster_notobserved:
            ogs_per_cluster_notobserved[cluster] = 1
        else:
            ogs_per_cluster_notobserved[cluster] += 1

for cluster in ogs_per_cluster_no_70_15_counted:
    if cluster not in ogs_per_cluster_notobserved:
        ogs_per_cluster_notobserved[cluster] = 0

In [176]:
results_lol = []

for cluster in ogs_per_cluster_expected:
    if cluster < 100: ## just skip the smaller clusters
        observed = len(lineage_differentiating_ogs)
        not_observed = len(annotated_ogs)-observed
        annotated = ogs_per_cluster_no_70_15_counted[cluster]
        unannotated = len(annotated_ogs) - annotated
        observed_annotated = ogs_per_cluster_observed[cluster]
        observed_unannotated = observed-observed_annotated
        notobserved_annotated = ogs_per_cluster_notobserved[cluster]
        notobserved_unannotated = not_observed - notobserved_annotated
        table = [
            [observed_annotated, observed_unannotated],
            [notobserved_annotated, notobserved_unannotated]
        ]
        if (observed_annotated + observed_unannotated != observed or notobserved_annotated + notobserved_unannotated != not_observed or
            observed_annotated + notobserved_annotated != annotated or observed_unannotated + notobserved_unannotated != unannotated):
            print(cluster)
            print(observed)
            print(not_observed)
            print(annotated)
            print(unannotated)
            print(table[0])
            print(table[1])
            raise ValueError("sums arent right")
        p_value_fisher = stats.fisher_exact(table)[1]
        results_lol.append([cluster, ogs_per_cluster_no_70_15_counted[cluster], ogs_per_cluster_expected[cluster], ogs_per_cluster_observed[cluster], ogs_per_cluster_observed[cluster]/ogs_per_cluster_expected[cluster],p_value_fisher])

In [191]:
df = pd.DataFrame(results_lol, columns = ['cluster', 'annotated','expected','observed', 'ratio','p_value_fisher'])

In [192]:
df = df[df.observed >= 2]

In [193]:
df.head(100)

Unnamed: 0,cluster,annotated,expected,observed,ratio,p_value_fisher
1,2,69,2.66,2,0.75188,2.801322e-11
2,3,58,2.23,2,0.896861,3.907396e-09
3,4,43,1.66,2,1.204819,2.284747e-06
4,5,37,1.43,2,1.398601,1.774002e-05
7,8,18,0.69,2,2.898551,0.02670172
10,11,18,0.69,2,2.898551,0.02670172
14,15,15,0.58,3,5.172414,0.2811518


In [180]:
old_clust_ogs = ['OG0001435'
'OG0001466',
'OG0010616',
'OG0010837',
'OG0011142',
'OG0011166',
'OG0011236',
'OG0011690'
]

In [181]:
for og in old_clust_8_ogs:
    if og in lineage_differentiating_ogs:
        print(og)

OG0011166


In [186]:
for og in ogs_per_cluster_no_70_15[8]:
    if og in lineage_differentiating_ogs:
        print(og)

OG0011689
OG0013361
