<a href="https://colab.research.google.com/github/pierrelarmande/notebooks/blob/main/gene_gene_interaction_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import json
import csv
import pickle
from urllib.request import urlopen
import numpy as np
import missingno as msno
import re

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/Github/

Mounted at /content/gdrive
/content/gdrive/MyDrive/Github


In [3]:
# Connecting to the GIT repository
! git config --global user.email "plarmande@gmail.com"
! git config --global user.name "pierrelarmande"

username = 'pierrelarmande'
repo = 'gene-phenotype-NLP'
# To create a token : https://github.com/settings/tokens
git_token = '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'

url = "https://"+git_token+"@github.com/"+username+"/"+repo+".git"

In [5]:
# Clone if doesn't already exist
#!git clone {url}
%pwd
%cd $repo
#%cd $repo
#! git pull

/content/gdrive/MyDrive/Github/gene-phenotype-NLP/gene-phenotype-NLP


In [None]:
!cd data
!wget http://planttfdb.gao-lab.org/download/TF_list/Osj_TF_list.txt.gz
!gunzip Osj_TF_list.txt.gz
!ls

In [16]:
# IRIC data
data_filename = r'output/iric_data.pkl'
ids_filename = r'output/iric_identifiers.pkl'
gene_list = r'data/gene_list_final.txt'
gene_pair = r'data/3rep_union_network.txt'
TF_list = r'data/Osj_TF_list.txt'
iric = pd.read_pickle(data_filename)
iric_ids = pd.read_pickle(ids_filename)
gene_list = pd.read_csv(gene_list)
gene_pair =  pd.read_csv(gene_pair, names=['gene1','regulate','gene2'], header=None)
pd_TF_list = pd.read_csv(TF_list, sep="\t")

In [17]:
pd_TF_list.head()

Unnamed: 0,TF_ID,Gene_ID,Family
0,LOC_Os01g04750.1,LOC_Os01g04750,RAV
1,LOC_Os01g04800.1,LOC_Os01g04800,RAV
2,LOC_Os05g47650.1,LOC_Os05g47650,RAV
3,LOC_Os01g49830.1,LOC_Os01g49830,RAV
4,LOC_Os01g01290.1,LOC_Os01g01290,NF-YC


In [None]:
# creating a mapping from raprepname into iricname
mapping_rap_iric = {}

for idx, line in iric_ids.iterrows():
    if line['raprepName'] is None:
        continue

    if ',' in line['raprepName']:
        for rap in line['raprepName'].split(','):
            if rap in mapping_rap_iric:
                print(f"{rap} exist already and has as value : {mapping_rap_iric[rap]}")
            else:
                mapping_rap_iric[rap] = idx
    else:
        if line['raprepName'] in mapping_rap_iric:
            print(f"{line['raprepName']} exist already and has as value : {mapping_rap_iric[line['raprepName']]}")
        else:
            mapping_rap_iric[line['raprepName']] = idx

In [None]:
## Gene_Gene relations
from src.tools import stringdb_relations

df_gene_gene = stringdb_relations.get_gene_gene_relations()

In [None]:
df_gene_gene.head()

Unnamed: 0,gene1,gene2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,Os01g0100100,Os01g0867700,0,0,0,0,0,279,48,284
1,Os01g0100100,Os03g0276500,0,0,0,0,46,153,0,157
2,Os01g0100100,Os03g0185500,0,0,0,0,0,0,408,408
3,Os01g0100100,Os03g0165800,0,0,0,81,45,211,47,251
4,Os01g0100100,Os03g0819900,0,0,0,47,205,356,247,583


In [None]:
gene_pair.shape

(3163, 3)

In [None]:
# first merging method with search in both directions

t1 = gene_pair.assign(pair=list(map(frozenset, zip(gene_pair.gene1, gene_pair.gene2))))
t2 = df_gene_gene.assign(pair=list(map(frozenset, zip(df_gene_gene.gene1, df_gene_gene.gene2))))


#t3 = t1.merge(t2, on='pair')

In [None]:
# second merging method but cannot remove real duplicates
#merge_1 = gene_pair.merge(df_gene_gene, left_on = ['gene1', 'gene2'], right_on= ['gene1', 'gene2'], how='left')
#merge_2 = gene_pair.merge(df_gene_gene, left_on = ['gene1', 'gene2'], right_on= ['gene2', 'gene1'], how='left')
#final_df = pd.concat([merge_1, merge_2]).drop_duplicates()


In [None]:
#final_df.head()

Unnamed: 0,gene1,regulate,gene2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score,gene1_x,gene2_x,gene1_y,gene2_y
0,Os01g0106700,1,Os01g0877500,0.0,0.0,0.0,72.0,0.0,0.0,304.0,326.0,,,,
1,Os01g0106700,1,Os03g0103300,,,,,,,,,,,,
2,Os01g0106700,1,Os03g0188400,,,,,,,,,,,,
3,Os01g0106700,1,Os03g0434800,,,,,,,,,,,,
4,Os01g0106700,1,Os03g0666100,,,,,,,,,,,,


In [None]:
#final_df2=final_df.drop_duplicates(subset=['gene1','gene2'])

In [None]:
#final_df3 = final_df2.fillna(0)

In [None]:
#final_df3['validated'] = [True if x != 0 else False for x in final_df3['combined_score']]

In [None]:
#final_df3.shape

(3164, 16)

In [None]:
#final_df3[final_df3['validated']==True].shape

(38, 16)

In [None]:
#t3.head()

Unnamed: 0,gene1_x,regulate,gene2_x,pair,gene1_y,gene2_y,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,Os01g0106700,1,Os01g0877500,"(Os01g0106700, Os01g0877500)",Os01g0106700,Os01g0877500,0,0,0,72,0,0,304,326
1,Os01g0106700,1,Os01g0877500,"(Os01g0106700, Os01g0877500)",Os01g0877500,Os01g0106700,0,0,0,72,0,0,304,326
2,Os01g0106700,1,Os05g0393100,"(Os05g0393100, Os01g0106700)",Os01g0106700,Os05g0393100,0,0,0,48,0,336,0,340
3,Os01g0106700,1,Os05g0393100,"(Os05g0393100, Os01g0106700)",Os05g0393100,Os01g0106700,0,0,0,48,0,336,0,340
4,Os01g0106700,1,Os12g0634900,"(Os01g0106700, Os12g0634900)",Os01g0106700,Os12g0634900,0,0,0,53,109,388,182,521


In [None]:
#t3_bis= t3.drop_duplicates(subset=['pair'])

In [None]:
t4 = t1.merge(t2, on='pair', how='left').drop_duplicates(subset=['pair'])

In [None]:
t4.head()

Unnamed: 0,gene1_x,regulate,gene2_x,pair,gene1_y,gene2_y,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,Os01g0106700,1,Os01g0877500,"(Os01g0106700, Os01g0877500)",Os01g0106700,Os01g0877500,0.0,0.0,0.0,72.0,0.0,0.0,304.0,326.0
2,Os01g0106700,1,Os03g0103300,"(Os01g0106700, Os03g0103300)",,,,,,,,,,
3,Os01g0106700,1,Os03g0188400,"(Os03g0188400, Os01g0106700)",,,,,,,,,,
4,Os01g0106700,1,Os03g0434800,"(Os03g0434800, Os01g0106700)",,,,,,,,,,
5,Os01g0106700,1,Os03g0666100,"(Os03g0666100, Os01g0106700)",,,,,,,,,,


In [None]:
t4 = t4.fillna(0)

In [None]:
t4.head()

Unnamed: 0,gene1_x,regulate,gene2_x,pair,gene1_y,gene2_y,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,Os01g0106700,1,Os01g0877500,"(Os01g0106700, Os01g0877500)",Os01g0106700,Os01g0877500,0.0,0.0,0.0,72.0,0.0,0.0,304.0,326.0
2,Os01g0106700,1,Os03g0103300,"(Os01g0106700, Os03g0103300)",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Os01g0106700,1,Os03g0188400,"(Os03g0188400, Os01g0106700)",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Os01g0106700,1,Os03g0434800,"(Os03g0434800, Os01g0106700)",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Os01g0106700,1,Os03g0666100,"(Os03g0666100, Os01g0106700)",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
t4['validated'] = [True if x != 0 else False for x in t4['combined_score']]

In [None]:
t4.shape

(3030, 15)

In [None]:
t4[t4['validated']==True].shape

(35, 15)

In [None]:
t4.head()

Unnamed: 0,gene1_x,regulate,gene2_x,pair,gene1_y,gene2_y,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score,validated
0,Os01g0106700,1,Os01g0877500,"(Os01g0106700, Os01g0877500)",Os01g0106700,Os01g0877500,0.0,0.0,0.0,72.0,0.0,0.0,304.0,326.0,True
2,Os01g0106700,1,Os03g0103300,"(Os01g0106700, Os03g0103300)",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,Os01g0106700,1,Os03g0188400,"(Os03g0188400, Os01g0106700)",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,Os01g0106700,1,Os03g0434800,"(Os03g0434800, Os01g0106700)",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5,Os01g0106700,1,Os03g0666100,"(Os03g0666100, Os01g0106700)",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [None]:
#gene_validated =  gene_pair.merge(df_gene_gene, on=['gene1','gene2'], how = 'left')

In [None]:
#gene_validated.head()

Unnamed: 0,gene1,regulate,gene2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,Os01g0106700,1,Os01g0877500,0.0,0.0,0.0,72.0,0.0,0.0,304.0,326.0
1,Os01g0106700,1,Os03g0103300,,,,,,,,
2,Os01g0106700,1,Os03g0188400,,,,,,,,
3,Os01g0106700,1,Os03g0434800,,,,,,,,
4,Os01g0106700,1,Os03g0666100,,,,,,,,


In [None]:
#gene_validated = gene_validated.fillna(0)

In [None]:
#gene_validated.head()

Unnamed: 0,gene1,regulate,gene2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,Os01g0106700,1,Os01g0877500,0.0,0.0,0.0,72.0,0.0,0.0,304.0,326.0
1,Os01g0106700,1,Os03g0103300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Os01g0106700,1,Os03g0188400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Os01g0106700,1,Os03g0434800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Os01g0106700,1,Os03g0666100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#gene_validated['validated'] = [True if x != 0 else False for x in gene_validated['combined_score']]

In [None]:
#gene_validated.shape

(3163, 12)

In [None]:
#gene_validated.loc[gene_validated['combined_score'] >= 500]

Unnamed: 0,gene1,regulate,gene2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score,validated
15,Os01g0106700,1,Os12g0634900,0.0,0.0,0.0,53.0,109.0,388.0,182.0,521.0,True
689,Os01g0910900,1,Os01g0797600,0.0,0.0,0.0,0.0,0.0,0.0,618.0,618.0,True
1287,Os03g0103300,1,Os05g0553400,0.0,0.0,0.0,0.0,0.0,0.0,640.0,640.0,True
2812,Os10g0465700,1,Os01g0715400,0.0,0.0,0.0,0.0,0.0,979.0,638.0,992.0,True


In [None]:
#gene_validated[gene_validated['validated']==True].shape

(37, 12)

In [None]:
t5 = t4.rename(columns={'gene1_x': 'gene1', 'gene2_x': 'gene2'})
t5.to_csv('output/3rep_union_network_validated.csv', columns = ['gene1','regulate','gene2','combined_score','validated'], index = False)

In [None]:
gene_pair = r'data/3rep_mean_network.txt'
gene_pair =  pd.read_csv(gene_pair, names=['gene1','regulate','gene2'], header=None)

In [None]:
gene_pair.shape

(1333, 3)

In [None]:
df1 = gene_pair.assign(pair=list(map(frozenset, zip(gene_pair.gene1, gene_pair.gene2))))
df2 = df_gene_gene.assign(pair=list(map(frozenset, zip(df_gene_gene.gene1, df_gene_gene.gene2))))


df3 = df1.merge(df2, on='pair', how='left').drop_duplicates(subset=['pair'])

In [None]:
df3 = df3.fillna(0)

In [None]:
df3['validated'] = [True if x != 0 else False for x in df3['combined_score']]

In [None]:
df3[df3['validated']==True].shape

(20, 15)

In [None]:
df4 = df3.rename(columns={'gene1_x': 'gene1', 'gene2_x': 'gene2'})
df4.to_csv('output/3rep_mean_network_validated.csv', columns = ['gene1','regulate','gene2','combined_score','validated'], index = False)

In [None]:
#gene_validated =  gene_pair.merge(df_gene_gene, on=['gene1','gene2'], how = 'left')

In [None]:
#gene_validated = gene_validated.fillna(0)

In [None]:
#gene_validated.head()

Unnamed: 0,gene1,regulate,gene2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,Os12g0634900,1,Os07g0406800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Os12g0634900,1,Os03g0149300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Os12g0634900,1,Os02g0195500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Os12g0634900,1,Os01g0855200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Os12g0610600,1,Os01g0715400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#gene_validated['validated'] = [True if x != 0 else False for x in gene_validated['combined_score']]

In [None]:
#gene_validated[gene_validated['validated']==True].shape

(20, 12)

In [None]:
#gene_validated.to_csv('output/3rep_mean_network_validated.csv', columns = ['gene1','regulate','gene2','validated'], index = False)