In [130]:
import pandas as pd
from collections import Counter
import csv
import requests
import random
import pickle

In [170]:
## locations of the two Orthogroups tables: one built without 70-15, one built with it
og_no_70_15, og_w_70_15 = (
    '../../pipeline_methods/Orthogroups.tsv',
    'Orthogroups_w_70_15.tsv',
)

In [171]:
## both tables are tab-separated; the first column becomes the index and every cell is read as a string
og_no_70_15_df = pd.read_csv(
    og_no_70_15,
    sep='\t',
    index_col=0,
    dtype='string',
)
og_w_70_15_df = pd.read_csv(
    og_w_70_15,
    sep='\t',
    index_col=0,
    dtype='string',
)

In [133]:
## protein ID -> orthogroup lookup for the orthogrouping without 70_15
df_lol = og_no_70_15_df.values.tolist()

og_dict_no_70_15 = {}

# pair every row of cells with its index label (the orthogroup name)
for og, row in zip(og_no_70_15_df.index, df_lol):
    for cell in row:
        if pd.isnull(cell):
            continue  # empty cell, nothing to record
        # a cell holds one or more protein IDs separated by ', '
        og_dict_no_70_15.update(dict.fromkeys(cell.split(', '), og))

In [134]:
## protein ID -> orthogroup lookup for the orthogrouping with 70_15
df_lol = og_w_70_15_df.values.tolist()

og_dict_w_70_15 = {}

# pair every row of cells with its index label (the orthogroup name)
for og, row in zip(og_w_70_15_df.index, df_lol):
    for cell in row:
        if pd.isnull(cell):
            continue  # empty cell, nothing to record
        # a cell holds one or more protein IDs separated by ', '
        og_dict_w_70_15.update(dict.fromkeys(cell.split(', '), og))

In [135]:
## invert the gene -> og lookup: og -> list of its genes (orthogrouping without 70_15)
genes_per_og_no_70_15 = {}

for gene, og in og_dict_no_70_15.items():
    genes_per_og_no_70_15.setdefault(og, []).append(gene)

In [136]:
## invert the gene -> og lookup: og -> list of its genes (orthogrouping with 70_15)
genes_per_og_w_70_15 = {}

for gene, og in og_dict_w_70_15.items():
    genes_per_og_w_70_15.setdefault(og, []).append(gene)

In [137]:
## to go from uniprot to MGGs
## Scan the proteome FASTA and, for every line mentioning 'MGG_', record the MGG ID
## under the accession found at the end of that line.

mgg_translation_dict = {}  # accession -> list of MGG IDs that carry it
mggs_fasta = []            # every MGG ID seen in the FASTA, in file order

with open('Magnaporthe_oryzae.MG8.pep.all.fa') as file:
    file_reader = csv.reader(file, delimiter = '\t')
    for row in file_reader:
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise IndexError
            continue
        header = row[0]
        if 'MGG_' not in header:
            continue
        # the 11 characters after the first one (presumably the leading '>' of a FASTA header)
        mgg_translation = header[1:12]
        mggs_fasta.append(mgg_translation)
        # text after the last ':' with its final character dropped
        uniprot_id = header.split(':')[-1][:-1]
        if ';' in uniprot_id: ## these ones just dont have a name
            continue
        mgg_translation_dict.setdefault(uniprot_id, []).append(mgg_translation)

In [138]:
## Collect the cluster members (second column of mg8_clust_out.tsv) that have no entry
## in mgg_translation_dict yet. Duplicates are kept.
## NOTE(review): this reads 'mg8_clust_out.tsv' while the clusters are later built from
## 'mg8_clust_out_40.tsv' -- confirm both files list the same members.
need_to_look_up = []

with open('mg8_clust_out.tsv') as file:
    file_reader = csv.reader(file, delimiter = '\t')
    for row in file_reader:
        # member names are '-'-separated; the second field is the accession
        cluster_member = row[1].split('-')[1]
        # explicit membership test instead of a bare except around the lookup
        if cluster_member not in mgg_translation_dict:
            need_to_look_up.append(cluster_member)

In [139]:
# how many cluster members still lack an MGG translation (repeated members are counted each time)
len(need_to_look_up)

880

In [140]:
## try to look up some of the ones that werent found
## For each untranslated accession, fetch its UniProt FASTA record and take the MGG gene
## name from the header; the translation is kept only if that ID exists in the proteome.
not_found = []        # accessions whose record names no MGG_ gene (or could not be fetched)
not_in_proteome = []  # accessions whose MGG ID is not in the proteome (previously dropped silently)
mggs_fasta_lookup = set(mggs_fasta)  # set membership instead of scanning the list per accession

for cluster_member in need_to_look_up:
    url = "https://rest.uniprot.org/uniprotkb/"+cluster_member+".fasta"
    # a timeout keeps one stalled connection from hanging the whole loop;
    # connection errors still propagate so that network problems are not mistaken for misses
    response = requests.get(url, timeout=30)
    cluster_member_translated = ''
    if response.ok:  # do not parse an error page as if it were a FASTA record
        for dat in response.text.split(' '):
            # expects a 'key=MGG_xxxxx' token; the '=' guard avoids an IndexError on other tokens
            if 'MGG_' in dat and '=' in dat:
                cluster_member_translated = dat.split('=')[1]+'T0'
    if not cluster_member_translated:
        not_found.append(cluster_member)
    elif cluster_member_translated in mggs_fasta_lookup: ## make sure its actually in the proteome
        mgg_translation_dict[cluster_member] = [cluster_member_translated]
    else:
        not_in_proteome.append(cluster_member)

In [141]:
## what really matters is whether every gene has at least one uniprot id
## Unique MGG IDs that appear as a translation, in first-seen order.
## dict.fromkeys dedupes while keeping order, replacing a list scan per gene.
mggs = list(dict.fromkeys(
    gene
    for translated_genes in mgg_translation_dict.values()
    for gene in translated_genes
))

In [142]:
## sanity check: print every translated MGG ID that was not seen in the proteome FASTA
mggs_fasta_lookup = set(mggs_fasta)  # set membership instead of scanning the list per ID
for mgg in mggs:
    if mgg not in mggs_fasta_lookup:
        print(mgg)

In [143]:
# the last 4 translations are entered by hand
mgg_translation_dict.update({
    'Q9P4R4': ['MGG_17041T0'],
    'G5EH27': ['MGG_02612T0'],
    'G5EGX3': ['MGG_03016T0'],
    'G4N1P8': ['MGG_07514T0'],
})

In [144]:
## export dictionary to pickle so I dont have to redownload everything every time
## the with-block closes (and flushes) the file once the dump is done
with open("mgg_translation_dict.pkl", "wb") as pickle_file:
    pickle.dump(mgg_translation_dict, pickle_file)

In [145]:
## reload the cached translation dictionary; the with-block closes the file afterwards
## NOTE: pickle.load can execute arbitrary code -- only load a .pkl written by this notebook
with open("mgg_translation_dict.pkl", 'rb') as file:
    mgg_translation_dict = pickle.load(file)

In [146]:
## read in the clusters from alphafold+foldseek
## Each row pairs a cluster name (column 1) with one member (column 2). Both are
## translated to MGG IDs where a translation exists and kept as-is otherwise.

clusters_dict = {}  # translated cluster name -> list of unique translated members

with open('mg8_clust_out_40.tsv') as file:
    file_reader = csv.reader(file, delimiter = '\t')
    for row in file_reader:
        # names are '-'-separated; the second field is the accession
        cluster_name = row[0].split('-')[1]
        cluster_member = row[1].split('-')[1]
        if cluster_name in mgg_translation_dict:
            # only the first translation is used to name the cluster
            cluster_name_translated = mgg_translation_dict[cluster_name][0]
        else:
            cluster_name_translated = cluster_name
        cluster_member_translated_list = mgg_translation_dict.get(cluster_member, [cluster_member])
        # setdefault replaces rebuilding list(clusters_dict.keys()) on every row
        members = clusters_dict.setdefault(cluster_name_translated, [])
        for cluster_member_translated in cluster_member_translated_list:
            if cluster_member_translated not in members: ## get rid of duplicates
                members.append(cluster_member_translated)

In [147]:
## Renumber the clusters by their position in clusters_dict and keep only members whose
## ID contains 'MGG_'. Clusters left with no such member get no entry, so the numbering
## can have gaps.
clusters_dict_final = {}

for cluster_number, cluster_list in enumerate(clusters_dict.values()):
    # dict.fromkeys dedupes while preserving order
    mgg_members = list(dict.fromkeys(
        protein for protein in cluster_list if 'MGG_' in protein
    ))
    if mgg_members:
        clusters_dict_final[cluster_number] = mgg_members

In [148]:
# From here on clusters_dict is keyed by integer cluster number and holds only MGG IDs.
# NOTE(review): this rebinding makes the cell that builds clusters_dict_final unsafe to
# rerun on its own -- it would enumerate the already-filtered dict and renumber the
# clusters. Rerun from the cell that reads the cluster file instead.
clusters_dict = clusters_dict_final

In [149]:
## check to see if any of the clusters substantially overlap with kyungyong's
## the first 4 lines of the file are skipped and columns are addressed by position

ks_clusters = pd.read_csv(
    'Magnaporthe_Oryza_Structure_prediction_and_clustering_metadata.zenodo.csv',
    header=None,
    skiprows=4,
)

In [150]:
## column 0 holds the cluster, column 2 the gene
ks_clusters_dict = {}             # cluster number -> list of genes
ks_clusters_membership_dict = {}  # gene -> one-element list holding its cluster number

for cluster, gene in zip(ks_clusters[0],ks_clusters[2]):
    if cluster != cluster:
        # NaN is unequal to itself: rows without a cluster are skipped
        continue
    cluster_id = int(cluster)
    ks_clusters_dict.setdefault(cluster_id, []).append(gene)
    ks_clusters_membership_dict[gene] = [cluster_id]

In [151]:
## for every foldseek cluster, list the ks-cluster entry of each member that has one
foldseek_clusters_to_ks_clusters = {}

for cluster, genes in clusters_dict.items():
    foldseek_clusters_to_ks_clusters[cluster] = [
        ks_clusters_membership_dict[gene]
        for gene in genes
        if gene in ks_clusters_membership_dict  # members without a ks cluster are skipped
    ]

In [152]:
## show only the foldseek clusters that have at least one member in a ks cluster
for cluster, ks_hits in foldseek_clusters_to_ks_clusters.items():
    if not ks_hits:
        continue
    print(cluster)
    print(ks_hits)

4
[[303]]
5
[[262]]
7
[[82]]
11
[[516]]
16
[[600]]
17
[[302]]
18
[[581]]
23
[[391]]
27
[[571]]
28
[[537]]
29
[[840]]
36
[[478]]
37
[[70]]
41
[[218]]
42
[[540]]
45
[[629]]
48
[[511]]
53
[[500]]
55
[[249]]
71
[[272]]
73
[[305]]
96
[[20]]
97
[[110], [110]]
103
[[645]]
120
[[156], [156]]
122
[[474]]
127
[[513]]
134
[[402]]
159
[[11]]
160
[[276]]
162
[[7]]
166
[[615]]
174
[[233]]
175
[[62], [373], [62], [206], [62]]
193
[[772]]
196
[[358]]
233
[[656]]
234
[[56]]
235
[[36], [36], [36], [36], [36], [36], [99], [36], [465], [99]]
242
[[438]]
244
[[142], [142]]
246
[[884]]
257
[[20]]
260
[[565]]
263
[[742]]
264
[[234]]
267
[[25], [25], [25], [25], [25], [25], [25], [25], [25], [25]]
280
[[175]]
281
[[666]]
297
[[616]]
317
[[456]]
318
[[193]]
319
[[355]]
321
[[92]]
323
[[781]]
329
[[336]]
330
[[780]]
332
[[128], [128]]
334
[[382]]
336
[[115]]
337
[[7]]
343
[[828]]
346
[[168]]
348
[[409]]
353
[[650]]
357
[[343]]
362
[[14], [14]]
374
[[63], [63], [63]]
375
[[623], [841]]
381
[[4]]
384
[[165], [165

In [153]:
## still dealing with mggs up to here, now we want to assign them orthogroups
## cluster -> unique orthogroups (with-70_15 orthogrouping) of its proteins, in first-seen order
clusters_dict_70_15_ogs = {}

for cluster, proteins in clusters_dict.items():
    cluster_ogs = []
    clusters_dict_70_15_ogs[cluster] = cluster_ogs
    for protein in proteins:
        if protein not in og_dict_w_70_15:
            ## doesnt have an og assigned
            continue
        protein_og = og_dict_w_70_15[protein]
        if protein_og not in cluster_ogs:
            cluster_ogs.append(protein_og)

In [154]:
## and then translate the orthogroups over
## Every with-70_15 orthogroup is expanded to its genes, and each gene that also has an
## orthogroup in the without-70_15 run contributes that orthogroup to the cluster.
## Going through the genes means no 1:1 og translation dictionary is needed.
clusters_dict_gladieux_ogs = {}

for cluster, ogs_w_70_15 in clusters_dict_70_15_ogs.items():
    translated_ogs = []
    clusters_dict_gladieux_ogs[cluster] = translated_ogs
    for og in ogs_w_70_15:
        for gene in genes_per_og_w_70_15[og]:
            ## this gets rid of MGGs, genes without ogs, and duplicate OGs all in one go
            if gene not in og_dict_no_70_15:
                continue
            og_no_70_15 = og_dict_no_70_15[gene]
            if og_no_70_15 not in translated_ogs:
                translated_ogs.append(og_no_70_15)

In [155]:
## some ogs appear in two different clusters...
## count how many cluster lists each og appears in
ogs_duplicate_clusters = {}
ogs = [
    og
    for cluster_ogs in clusters_dict_gladieux_ogs.values()
    for og in cluster_ogs
]

c = Counter(ogs)

In [156]:
## An og listed in more than one cluster is kept in one randomly chosen cluster and
## removed from the others (mutates clusters_dict_gladieux_ogs in place).

# index og -> clusters in a single pass instead of rescanning the dict per duplicated og
clusters_by_og = {}
for cluster, cluster_ogs in clusters_dict_gladieux_ogs.items():
    for og in cluster_ogs:
        clusters_by_og.setdefault(og, []).append(cluster)

# dedicated, seeded generator: a fresh run makes the same choices every time,
# so every number derived from the cluster assignment is reproducible
rng = random.Random(0)

for key, clusters in clusters_by_og.items():
    if len(clusters) < 2:
        continue
    # pick one cluster at random and remove the og from the rest
    choice = rng.choice(clusters)
    for cluster in clusters:
        if cluster != choice:
            clusters_dict_gladieux_ogs[cluster].remove(key)

In [157]:
## from no_70_15 ogs, get cluster
## if an og were listed under several clusters, the last one iterated wins
clusters_dict_gladieux_ogs_flipped = {
    og: cluster
    for cluster, cluster_ogs in clusters_dict_gladieux_ogs.items()
    for og in cluster_ogs
}

In [158]:
## clusters with at least cluster_size_cutoff ogs, and the size of the largest of them
cluster_size_cutoff = 10

clusters_to_keep = [
    cluster
    for cluster, cluster_ogs in clusters_dict_gladieux_ogs.items()
    if len(cluster_ogs) >= cluster_size_cutoff
]

# stays 0 when no cluster reaches the cutoff
max_length = max(
    (len(clusters_dict_gladieux_ogs[cluster]) for cluster in clusters_to_keep),
    default=0,
)

print(len(clusters_to_keep))
print(max_length)

146
171


In [159]:
## first tab-separated field of every line in the lineage-differentiating PAV file
with open('../pca_heat_map_phylogeny/lineage_differentiating_pavs.txt') as file:
    lineage_differentiating_ogs = [
        row[0] for row in csv.reader(file, delimiter = '\t')
    ]

In [160]:
## Share of the ogs sitting in clusters that are lineage-differentiating.

# every og listed under any cluster (an og listed under several clusters is counted each time)
ogs_in_clusters = []
for cluster_ogs in clusters_dict_gladieux_ogs.values():
    ogs_in_clusters.extend(cluster_ogs)

# lineage-differentiating ogs that were assigned to a cluster;
# explicit membership test instead of a bare except around the lookup
pav_ogs_in_clusters = [
    og
    for og in lineage_differentiating_ogs
    if og in clusters_dict_gladieux_ogs_flipped
]

## number of ogs in clusters
total_number_of_ogs_in_clusters = len(ogs_in_clusters)
number_of_lineage_differentiating_ogs_in_clusters = len(pav_ogs_in_clusters)
ratio = number_of_lineage_differentiating_ogs_in_clusters/total_number_of_ogs_in_clusters
print(ratio)
print(len(pav_ogs_in_clusters))

0.023929706487193867
256


In [161]:
## cluster -> number of lineage-differentiating ogs assigned to it
lineage_differentiating_ogs_clusters_count = {}

for og in lineage_differentiating_ogs:
    if og not in clusters_dict_gladieux_ogs_flipped:
        ## i guess some ogs aren't in clusters
        continue
    cluster = clusters_dict_gladieux_ogs_flipped[og]
    lineage_differentiating_ogs_clusters_count[cluster] = (
        lineage_differentiating_ogs_clusters_count.get(cluster, 0) + 1
    )

In [162]:
## per cluster: cluster id, lineage-differentiating og count, cluster size, and their ratio
observed_cutoff = 1
print(ratio)

for cluster, pav_count in lineage_differentiating_ogs_clusters_count.items():
    if pav_count < observed_cutoff:
        continue
    print(cluster)
    print(pav_count)
    cluster_size = len(clusters_dict_gladieux_ogs[cluster])
    print(cluster_size)
    print(pav_count/cluster_size)

0.023929706487193867
572
1
3
0.3333333333333333
577
1
1
1.0
5631
1
6
0.16666666666666666
5550
1
63
0.015873015873015872
904
1
1
1.0
381
1
31
0.03225806451612903
898
2
12
0.16666666666666666
5632
2
29
0.06896551724137931
1346
2
23
0.08695652173913043
1162
1
1
1.0
4906
1
2
0.5
1161
1
1
1.0
2908
1
14
0.07142857142857142
1277
2
12
0.16666666666666666
3711
3
73
0.0410958904109589
503
1
11
0.09090909090909091
6451
2
4
0.5
980
1
9
0.1111111111111111
5393
1
6
0.16666666666666666
4552
1
1
1.0
364
1
2
0.5
5242
4
47
0.0851063829787234
4005
1
1
1.0
6290
1
93
0.010752688172043012
2960
1
1
1.0
2212
1
15
0.06666666666666667
3157
1
1
1.0
2061
4
30
0.13333333333333333
4693
1
6
0.16666666666666666
5095
3
6
0.5
5902
1
4
0.25
1376
1
2
0.5
340
5
171
0.029239766081871343
4585
1
11
0.09090909090909091
5740
1
7
0.14285714285714285
5248
2
15
0.13333333333333333
4514
3
124
0.024193548387096774
574
1
1
1.0
558
1
1
1.0
369
1
1
1.0
5622
1
1
1.0
5225
2
5
0.4
570
1
3
0.3333333333333333
569
1
1
1.0
557
1
1
1.0
560
1


In [163]:
# Hand-entered reference lists for the ART group.
# NOTE(review): ART_OGS_no_70_15 is assigned twice in this cell. This first list is
# overwritten by the second assignment below and is never read in between -- confirm
# whether it was meant to have its own name.
ART_OGS_no_70_15 = [
'OG0009794',
'OG0000809',
'OG0010712',
'OG0010880',
'OG0010901',
'OG0010942',
'OG0010037',
'OG0009928',
'OG0010725',
'OG0008918',
'OG0010627',
'OG0009058',
'OG0011880',
'OG0003261',
'OG0008656',
'OG0010957',
'OG0008809',
'OG0010231',
'OG0009915',
'OG0011121',
'OG0009756',
'OG0009764',
'OG0010624',
'OG0009823',
'OG0010346',
'OG0009929',
'OG0010233',
'OG0011912',
'OG0009484',
'OG0010594',
'OG0012383'
]

# MGG IDs of the ART genes
ART_MGGS = [
'MGG_16829T0',
'MGG_08610T0',
'MGG_07994T0',
'MGG_01455T0',
'MGG_16321T0',
'MGG_09666T0',
'MGG_11627T0',
'MGG_16737T0',
'MGG_16977T0',
'MGG_16059T0',
'MGG_00230T0',
'MGG_09810T0',
'MGG_00390T0',
'MGG_16989T0',
'MGG_05127T0',
'MGG_05465T0',
'MGG_17635T0',
'MGG_08940T0',
'MGG_00380T0',
'MGG_17628T0',
'MGG_16323T0',
'MGG_15031T0',
'MGG_05410T0',
'MGG_15374T0',
'MGG_08818T0',
'MGG_01964T0',
'MGG_17336T0',
'MGG_07766T0',
'MGG_05795T0',
'MGG_00242T0',
'MGG_09628T0',
'MGG_17556T0'
]

# Second assignment of ART_OGS_no_70_15: this is the value later cells see.
# The list contains repeated entries.
ART_OGS_no_70_15 = [
'OG0009259',
'OG0010307',
'OG0000173',
'OG0000173',
'OG0010366',
'OG0010100',
'OG0000493',
'OG0010161',
'OG0008995',
'OG0000493',
'OG0008967',
'OG0000941',
'OG0010876',
'OG0009602',
'OG0009324',
'OG0012739',
'OG0008886',
'OG0009575',
'OG0009575',
'OG0009891',
'OG0009805',
'OG0012731',
'OG0012731',
'OG0007507',
'OG0007507'
]

# presumably the lineage-differentiating (PAV) ogs found in the ks ART cluster -- TODO confirm
no_70_15_pav_ogs_in_ks_arts_cluster = [
'OG0010712',
'OG0010942',
'OG0011880',
'OG0011912',
'OG0012383'
]

In [164]:
# hand-entered MGG IDs of the MAX group
MAX_mggs = [
    'MGG_18062T0',
    'MGG_17249T0',
    'MGG_16939T0',
    'MGG_15207T0',
    'MGG_16619T0',
    'MGG_11967T0',
    'MGG_15459T0',
    'MGG_09425T0',
    'MGG_15625T0',
    'MGG_08992T0',
    'MGG_08482T0',
    'MGG_08469T0',
    'MGG_14793T0',
    'MGG_18060T0',
    'MGG_17132T0',
    'MGG_16475T0',
    'MGG_17601T0',
    'MGG_12426T0',
    'MGG_17266T0',
    'MGG_02207T0',
    'MGG_16175T0',
    'MGG_17255T0',
    'MGG_11304T0',
    'MGG_14600T0',
    'MGG_15443T0',
    'MGG_15911T0',
    'MGG_10120T0',
    'MGG_10004T0',
    'MGG_15972T0',
    'MGG_02635T0',
    'MGG_18041T0',
    'MGG_10282T0',
]

# presumably the lineage-differentiating (PAV) ogs found in the ks MAX cluster -- TODO confirm
no_70_15_pav_ogs_in_ks_max_cluster = [
    'OG0010298',
    'OG0011350',
    'OG0011463',
    'OG0011481',
    'OG0011512',
]

In [165]:
# presumably the lineage-differentiating (PAV) ogs found in the ks P450 cluster -- TODO confirm
no_70_15_pav_ogs_in_ks_p450_cluster = [
    'OG0001435',
    'OG0001466',
    'OG0010616',
    'OG0010837',
    'OG0011142',
    'OG0011166',
    'OG0011236',
    'OG0011690',
]

In [166]:
## print every cluster whose og list contains one of the ART ogs (one line per hit)
## NOTE(review): the list is named *_no_70_15 but is searched in clusters_dict_70_15_ogs,
## which holds ogs from the with-70_15 orthogrouping -- confirm the two match
for og in ART_OGS_no_70_15:
    for cluster, cluster_ogs in clusters_dict_70_15_ogs.items():
        if og in cluster_ogs:
            print(cluster)

423
5956
2992
2993
5956
2992
2993
5956
5956
5956
423
423
423
423
423
423
423
423
5956
423
337
423
423
442
2806
2806
2827
2806
2827
1860
1860


In [167]:
## print the cluster of every listed og that was assigned to one; ogs without a cluster are skipped
## (explicit membership test instead of a bare except that would hide any error)
for og in no_70_15_pav_ogs_in_ks_arts_cluster:
    if og in clusters_dict_gladieux_ogs_flipped:
        print(clusters_dict_gladieux_ogs_flipped[og])

3753
2061


In [168]:
## print the cluster of every listed og that was assigned to one; ogs without a cluster are skipped
## (explicit membership test instead of a bare except that would hide any error)
for og in no_70_15_pav_ogs_in_ks_max_cluster:
    if og in clusters_dict_gladieux_ogs_flipped:
        print(clusters_dict_gladieux_ogs_flipped[og])

3368
3970
420


In [169]:
## print the cluster of every listed og that was assigned to one; ogs without a cluster are skipped
## (explicit membership test instead of a bare except that would hide any error)
for og in no_70_15_pav_ogs_in_ks_p450_cluster:
    if og in clusters_dict_gladieux_ogs_flipped:
        print(clusters_dict_gladieux_ogs_flipped[og])

1964
5409
1872
1362
1527
