In [304]:
import pandas as pd
from collections import Counter
import csv
import requests
import random

In [274]:
## Orthogroups tables: one without 70-15, one with 70-15
og_no_70_15, og_w_70_15 = 'Orthogroups.tsv', 'Orthogroups_w_70_15.tsv'

In [275]:
# Both tables are read the same way: tab-separated, first column as the index,
# and every cell kept as a (nullable) string.
og_no_70_15_df = pd.read_csv(
    og_no_70_15,
    sep='\t',
    index_col=0,
    dtype='string',
)
og_w_70_15_df = pd.read_csv(
    og_w_70_15,
    sep='\t',
    index_col=0,
    dtype='string',
)

In [276]:
# Flatten the table without 70-15 into a protein -> orthogroup lookup.
df_lol = og_no_70_15_df.values.tolist()

og_dict_no_70_15 = {}

for og, row in zip(og_no_70_15_df.index, df_lol):
    for cell in row:
        if pd.isnull(cell):
            # missing cell: nothing to record
            continue
        # a cell lists its proteins separated by ', '; each one maps to this row's orthogroup
        og_dict_no_70_15.update(dict.fromkeys(cell.split(', '), og))

In [277]:
# Flatten the table with 70-15 into a protein -> orthogroup lookup.
df_lol = og_w_70_15_df.values.tolist()

og_dict_w_70_15 = {}

for og, row in zip(og_w_70_15_df.index, df_lol):
    for cell in row:
        if pd.isnull(cell):
            # missing cell: nothing to record
            continue
        # a cell lists its proteins separated by ', '; each one maps to this row's orthogroup
        og_dict_w_70_15.update(dict.fromkeys(cell.split(', '), og))

In [278]:
# Invert protein -> orthogroup into orthogroup -> [proteins] (table without 70-15).
genes_per_og_no_70_15 = {}

for gene, og in og_dict_no_70_15.items():
    genes_per_og_no_70_15.setdefault(og, []).append(gene)

In [279]:
# Invert protein -> orthogroup into orthogroup -> [proteins] (table with 70-15).
genes_per_og_w_70_15 = {}

for gene, og in og_dict_w_70_15.items():
    genes_per_og_w_70_15.setdefault(og, []).append(gene)

In [280]:
## make translation dictionary
# Maps each orthogroup of the table with 70-15 onto the orthogroup of the table
# without 70-15 that holds most of its proteins. Proteins whose name contains
# 'MGG' are ignored (presumably the 70-15 proteins, absent from the other table).
translation_w_70_15_to_no_70_15 = {}

for og, proteins_in_og in genes_per_og_w_70_15.items():
    # a few proteins don't end up in ogs; skip exactly those instead of
    # swallowing every exception with a bare except
    ogs_no_70_15 = [
        og_dict_no_70_15[protein]
        for protein in proteins_in_og
        if 'MGG' not in protein and protein in og_dict_no_70_15
    ]
    if ogs_no_70_15:
        # majority vote; Counter.most_common resolves ties by first occurrence
        og_translated = Counter(ogs_no_70_15).most_common(1)[0][0]
        translation_w_70_15_to_no_70_15[og] = og_translated

In [318]:
## make translation dictionary
# Reverse direction: maps each orthogroup of the table without 70-15 onto the
# orthogroup of the table with 70-15 that holds most of its proteins.
translation_no_70_15_to_w_70_15 = {}

for og, proteins_in_og in genes_per_og_no_70_15.items():
    # a few proteins don't end up in ogs; skip exactly those instead of
    # swallowing every exception with a bare except
    ogs_w_70_15 = [
        og_dict_w_70_15[protein]
        for protein in proteins_in_og
        if 'MGG' not in protein and protein in og_dict_w_70_15
    ]
    # pick the majority here, not sure if this is actually the best way to do it
    if ogs_w_70_15:
        # Counter.most_common resolves ties by first occurrence
        og_translated = Counter(ogs_w_70_15).most_common(1)[0][0]
        translation_no_70_15_to_w_70_15[og] = og_translated

In [153]:
# Scan the MG8 protein FASTA: collect the id of every header line containing
# 'MGG_' and map the accession found at the end of the header to those ids.
mgg_translation_dict = {}
mggs_fasta = []

with open('Magnaporthe_oryzae.MG8.pep.all.fa') as file:
    file_reader = csv.reader(file, delimiter = '\t')
    for row in file_reader:
        # csv.reader yields [] for blank lines; skip them instead of failing on row[0]
        if not row or 'MGG_' not in row[0]:
            continue
        header = row[0]
        # characters 1..11 of the header, i.e. an 11-character id such as
        # 'MGG_17041T0' following the leading character
        mgg_translation = header[1:12]
        mggs_fasta.append(mgg_translation)
        # text after the last ':' minus its final character; presumably the
        # UniProt accession inside a trailing bracket -- TODO confirm header format
        uniprot_id = header.split(':')[-1][:-1]
        if ';' in uniprot_id: ## these ones just dont have a name
            continue
        mgg_translation_dict.setdefault(uniprot_id, []).append(mgg_translation)

In [154]:
# Collect every cluster member (second column, second '-'-separated field)
# that has no entry in mgg_translation_dict.
need_to_look_up = []

with open('mg8_clust_out.tsv') as file:
    file_reader = csv.reader(file, delimiter = '\t')
    for row in file_reader:
        cluster_member = row[1].split('-')[1]
        # explicit membership test instead of a bare except around an unused lookup
        if cluster_member not in mgg_translation_dict:
            need_to_look_up.append(cluster_member)

In [155]:
# How many cluster members have no entry in mgg_translation_dict yet
len(need_to_look_up)

880

In [156]:
## try to look up some of the ones that werent found
# For each id without a translation, fetch its UniProt FASTA record and take
# the MGG name from the header. Every id ends up either in mgg_translation_dict
# or in not_found.
not_found = []
for cluster_member in need_to_look_up:
    url = "https://rest.uniprot.org/uniprotkb/"+cluster_member+".fasta"
    try:
        # without a timeout a single stalled request would hang the whole loop
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        not_found.append(cluster_member)
        continue
    cluster_member_translated = ''
    if response.ok:
        for dat in response.text.split(' '):
            # presumably a 'GN=MGG_xxxxx' token; ignore tokens without '='
            if 'MGG_' in dat and '=' in dat:
                cluster_member_translated = dat.split('=')[1]+'T0'
    if cluster_member_translated and cluster_member_translated in mggs_fasta: ## make sure its actually in the proteome
        mgg_translation_dict[cluster_member] = [cluster_member_translated]
    else:
        # no MGG name in the record, or the name is not a proteome id
        not_found.append(cluster_member)

In [171]:
## what really matters is whether every gene has at least one uniprot id
# Distinct gene ids reachable through mgg_translation_dict, in first-seen order.
# dict.fromkeys de-duplicates in one pass instead of rescanning a growing list.
mggs = list(dict.fromkeys(
    gene
    for genes in mgg_translation_dict.values()
    for gene in genes
))

In [172]:
# Number of distinct gene ids reachable through mgg_translation_dict
len(mggs)

12755

In [173]:
# Number of ids collected from the FASTA headers (this list is not de-duplicated);
# compare with len(mggs)
len(mggs_fasta)

12755

In [174]:
# Sanity check: print every translated id that is not a proteome id
# (no output means mggs is a subset of mggs_fasta).
mggs_fasta_lookup = set(mggs_fasta)  # set membership instead of scanning the list for every id
for mgg in mggs:
    if mgg not in mggs_fasta_lookup:
        print(mgg)

In [170]:
# fill in the last 4 by hand
# NOTE(review): this cell's execution count (170) is lower than that of the
# checks placed above it (171-174), so it was run before them; in a
# top-to-bottom run those checks would not see these entries -- confirm order.
mgg_translation_dict.update({
    'Q9P4R4': ['MGG_17041T0'],
    'G5EH27': ['MGG_02612T0'],
    'G5EGX3': ['MGG_03016T0'],
    'G4N1P8': ['MGG_07514T0'],
})

In [252]:
# Read the clustering table (first column: cluster name, second column: member)
# and group translated members under their translated cluster name.
# Ids without a translation are kept as they are.
clusters_dict = {}

with open('mg8_clust_out_60.tsv') as file:
    file_reader = csv.reader(file, delimiter = '\t')
    for row in file_reader:
        cluster_name = row[0].split('-')[1]
        cluster_member = row[1].split('-')[1]
        if cluster_name in mgg_translation_dict:
            # a name may translate to several ids; the first one labels the cluster
            cluster_name_translated = mgg_translation_dict[cluster_name][0]
        else:
            cluster_name_translated = cluster_name
        cluster_member_translated_list = mgg_translation_dict.get(cluster_member, [cluster_member])
        # direct dict lookup instead of rebuilding list(clusters_dict.keys()) on every row
        members = clusters_dict.setdefault(cluster_name_translated, [])
        for cluster_member_translated in cluster_member_translated_list:
            if cluster_member_translated not in members: ## get rid of duplicates
                members.append(cluster_member_translated)

In [253]:
# Re-key the clusters by their position in clusters_dict and keep only members
# whose id contains 'MGG_'. Clusters left without such members are dropped, so
# the integer keys can have gaps.
clusters_dict_final = {}

for cluster_number, cluster_list in enumerate(clusters_dict.values()):
    # order-preserving de-duplication, without rebuilding the key list per protein
    mgg_members = list(dict.fromkeys(
        protein for protein in cluster_list if 'MGG_' in protein
    ))
    if mgg_members:
        clusters_dict_final[cluster_number] = mgg_members

In [254]:
# NOTE(review): this rebinds clusters_dict to the filtered, integer-keyed dict.
# Re-running the cell that builds clusters_dict_final after this line would
# enumerate the already-filtered dict and renumber the clusters.
clusters_dict = clusters_dict_final

In [256]:
## check to see if any of the clusters substantially overlap with kyungyong's

# The first four lines of the file are skipped and no header row is read,
# so columns are addressed by position.
ks_clusters = pd.read_csv(
    'Magnaporthe_Oryza_Structure_prediction_and_clustering_metadata.zenodo.csv',
    header=None,
    skiprows=4,
)

In [257]:
# Two lookups built from columns 0 and 2 of ks_clusters (presumably cluster id
# and gene name): cluster id -> [genes], and gene -> [cluster id].
ks_clusters_dict = {}
ks_clusters_membership_dict = {}

for cluster, gene in zip(ks_clusters[0], ks_clusters[2]):
    if not (cluster == cluster): # to deal with nas (NaN is not equal to itself)
        continue
    cluster_id = int(cluster)
    ks_clusters_dict.setdefault(cluster_id, []).append(gene)
    ks_clusters_membership_dict[gene] = [cluster_id]

In [258]:
# For every cluster, list the ks cluster membership of each of its genes that
# has one; genes unknown to ks_clusters_membership_dict are skipped.
foldseek_clusters_to_ks_clusters = {
    cluster: [
        ks_clusters_membership_dict[gene]
        for gene in genes
        if gene in ks_clusters_membership_dict
    ]
    for cluster, genes in clusters_dict.items()
}

In [332]:
# Show only the clusters that share at least one gene with a ks cluster.
for cluster, ks_hits in foldseek_clusters_to_ks_clusters.items():
    if not ks_hits:
        continue
    print(cluster)
    print(ks_hits)

5
[[698]]
8
[[47], [47]]
9
[[300]]
20
[[67], [67]]
25
[[824]]
33
[[11]]
37
[[31]]
51
[[458]]
63
[[5], [5], [5], [5]]
70
[[11], [11], [11], [11], [11]]
79
[[601]]
81
[[499]]
82
[[8], [8]]
83
[[1], [1], [1], [1], [1], [1], [1], [1]]
93
[[790]]
94
[[538]]
99
[[711]]
103
[[348]]
115
[[502]]
122
[[8]]
124
[[159]]
134
[[384]]
150
[[751]]
165
[[876]]
166
[[27]]
167
[[134]]
168
[[5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [157], [5], [5], [5], [5]]
173
[[5]]
174
[[13], [13], [13], [13], [13], [13], [13], [13], [13], [13], [13], [13], [13], [13], [13], [13]]
176
[[479]]
199
[[12], [12], [12], [12], [12], [12]]
200
[[833]]
203
[[194]]
204
[[750]]
207
[[817]]
209
[[334]]
210
[[195]]
211
[[870]]
222
[[529]]
237
[[312]]
256
[[744]]
273
[[2], [2], [2]]
284
[[6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6]]
289
[[66], [66], [66]]
322
[[776]]
323
[[568]]
330
[[681]]
333
[[221]]
347
[[322]]
377
[[385]]
383
[[2], [2]]
385
[[80]]
386
[[80]]
407
[[520]]
426
[[278]]
428
[[152], [1

In [284]:
# Cluster -> distinct orthogroups (table with 70-15) of its proteins, in
# first-seen order. Proteins that have no og assigned are skipped.
clusters_dict_70_15_ogs = {
    cluster: list(dict.fromkeys(
        og_dict_w_70_15[protein]
        for protein in proteins
        if protein in og_dict_w_70_15
    ))
    for cluster, proteins in clusters_dict.items()
}

In [290]:
# Cluster -> distinct translated orthogroups, in first-seen order.
# Orthogroups without an entry in translation_w_70_15_to_no_70_15 (og isnt in
# gladieux ogs) are skipped.
clusters_dict_gladieux_ogs = {
    cluster: list(dict.fromkeys(
        translation_w_70_15_to_no_70_15[og]
        for og in cluster_ogs
        if og in translation_w_70_15_to_no_70_15
    ))
    for cluster, cluster_ogs in clusters_dict_70_15_ogs.items()
}

In [313]:
# Orthogroup -> cluster lookup. An orthogroup listed under several clusters
# keeps only the last one visited.
# NOTE(review): this is a snapshot; it is not updated if
# clusters_dict_gladieux_ogs is modified afterwards.
clusters_dict_gladieux_ogs_flipped = {
    og: cluster
    for cluster, cluster_ogs in clusters_dict_gladieux_ogs.items()
    for og in cluster_ogs
}

In [301]:
## some ogs appear in two different clusters...
# Count, for every orthogroup, how many clusters list it.
ogs_duplicate_clusters = {}
ogs = [
    og
    for cluster_ogs in clusters_dict_gladieux_ogs.values()
    for og in cluster_ogs
]

c = Counter(ogs)

In [305]:
# Orthogroups listed under more than one cluster are assigned to a single,
# randomly chosen cluster and removed from all the others.
dedup_rng = random.Random(0)  # fixed seed: the random assignment is identical on every run

# orthogroup -> every cluster that currently lists it, gathered in one pass
# (no dependence on a Counter computed in another cell)
clusters_per_og = {}
for cluster, cluster_ogs in clusters_dict_gladieux_ogs.items():
    for og in cluster_ogs:
        clusters_per_og.setdefault(og, []).append(cluster)

for key, clusters in clusters_per_og.items():
    if len(clusters) > 1:
        # pick one cluster at random and remove the og from the rest
        choice = dedup_rng.choice(clusters)
        for cluster in clusters:
            if cluster != choice:
                clusters_dict_gladieux_ogs[cluster].remove(key)

In [292]:
# Keep the clusters with at least cluster_size_cutoff orthogroups and report
# how many there are and the size of the largest one (0 when none qualifies).
cluster_size_cutoff = 10

clusters_to_keep = [
    cluster
    for cluster, cluster_ogs in clusters_dict_gladieux_ogs.items()
    if len(cluster_ogs) >= cluster_size_cutoff
]

max_length = max(
    (len(clusters_dict_gladieux_ogs[cluster]) for cluster in clusters_to_keep),
    default=0,
)

print(len(clusters_to_keep))
print(max_length)

109
145


In [294]:
# First tab-separated field of every line in the lineage-differentiating PAV file.
with open('../pca_heat_map_phylogeny/lineage_differentiating_pavs.txt') as file:
    file_reader = csv.reader(file, delimiter = '\t')
    lineage_differentiating_ogs = [row[0] for row in file_reader]

In [335]:
# Every orthogroup listed under any cluster (one entry per listing, so an
# orthogroup still shared between clusters is counted once per cluster).
ogs_in_clusters = [
    og
    for cluster_ogs in clusters_dict_gladieux_ogs.values()
    for og in cluster_ogs
]

# Lineage-differentiating orthogroups that belong to some cluster; an explicit
# membership test replaces the bare except around an unused lookup.
pav_ogs_in_clusters = [
    og
    for og in lineage_differentiating_ogs
    if og in clusters_dict_gladieux_ogs_flipped
]

## number of ogs in clusters
total_number_of_ogs_in_clusters = len(ogs_in_clusters)
number_of_lineage_differentiating_ogs_in_clusters = len(pav_ogs_in_clusters)
ratio = number_of_lineage_differentiating_ogs_in_clusters/total_number_of_ogs_in_clusters
print(ratio)
print(len(pav_ogs_in_clusters))

0.02388311323405451
255


In [316]:
# Count, per cluster, how many lineage-differentiating orthogroups it contains.
lineage_differentiating_ogs_clusters_count = {}

# The orthogroup -> cluster lookup is rebuilt from the current contents of
# clusters_dict_gladieux_ogs, so the counts always agree with the cluster
# lists, whatever order the cells that edit those lists were run in.
og_to_current_cluster = {
    og: cluster
    for cluster, cluster_ogs in clusters_dict_gladieux_ogs.items()
    for og in cluster_ogs
}

for og in lineage_differentiating_ogs:
    if og not in og_to_current_cluster:
        ## i guess some ogs aren't in clusters
        print(og)
        continue
    cluster = og_to_current_cluster[og]
    lineage_differentiating_ogs_clusters_count[cluster] = (
        lineage_differentiating_ogs_clusters_count.get(cluster, 0) + 1
    )

OG0000101
OG0000122
OG0000155
OG0000210
OG0000226
OG0000240
OG0000295
OG0000433
OG0010345
OG0010397
OG0010609
OG0010815
OG0010844
OG0010851
OG0010855
OG0010887
OG0010893
OG0010912
OG0010915
OG0010921
OG0010924
OG0010927
OG0010928
OG0010944
OG0010947
OG0010954
OG0010960
OG0010980
OG0010993
OG0011003
OG0011006
OG0011038
OG0011050
OG0011051
OG0011065
OG0011071
OG0011078
OG0011091
OG0011099
OG0011117
OG0011158
OG0011166
OG0011171
OG0011177
OG0011178
OG0011196
OG0011210
OG0011223
OG0011234
OG0011243
OG0011245
OG0011246
OG0011248
OG0011277
OG0011281
OG0011282
OG0011292
OG0011294
OG0011308
OG0011309
OG0011317
OG0011337
OG0011338
OG0011340
OG0011352
OG0011353
OG0011354
OG0011356
OG0011367
OG0011369
OG0011386
OG0011393
OG0011410
OG0011418
OG0011419
OG0011436
OG0011457
OG0011458
OG0011465
OG0011468
OG0011469
OG0011476
OG0011478
OG0011479
OG0011486
OG0011488
OG0011500
OG0011502
OG0011536
OG0011549
OG0011559
OG0011564
OG0011565
OG0011572
OG0011579
OG0011583
OG0011587
OG0011598
OG0011603
OG0011607


In [322]:
# Look up the with-70-15 counterpart of one orthogroup.
# NOTE(review): an assignment displays nothing, so the list shown under this
# cell presumably comes from an earlier version of it -- confirm.
translated_og = translation_no_70_15_to_w_70_15['OG0013798']

['gene_10890_TN0050_4_NA', 'gene_10947_TN0057_4_NA']

In [331]:
# For each cluster with at least observed_cutoff lineage-differentiating
# orthogroups, print: cluster key, that count, cluster size, count / size.
observed_cutoff = 3

for cluster, observed in lineage_differentiating_ogs_clusters_count.items():
    if observed < observed_cutoff:
        continue
    print(cluster)
    print(observed)
    cluster_size = len(clusters_dict_gladieux_ogs[cluster])
    print(cluster_size)
    print(observed/cluster_size)

70
3
6
0.5
4985
5
144
0.034722222222222224
4482
3
119
0.025210084033613446
2579
3
14
0.21428571428571427
