In [None]:
# Set working directory.
# Every CSV below is opened with a relative path, so the notebook has to run
# from the directory that holds them. The optional XDTD_ANALYSIS_DIR
# environment variable overrides the original author's location, letting the
# notebook run on another machine without editing this cell.
import os
os.chdir(os.environ.get('XDTD_ANALYSIS_DIR', '/home/grads/sjw6257/xDTD/xDTD_analysis'))

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

### Extract Drug-Disease Pairs

In [None]:
# Load the KG2.8.0.1 score table, reading only the three columns this analysis
# uses instead of parsing the whole file and discarding the rest afterwards.
# All values are read as text.
pair_columns = ['drug_id', 'disease_id', 'disease_name']
df_2801 = pd.read_csv(
    'KG2.8.0.1_DrugDiseaseScore.csv',
    usecols=pair_columns,
    dtype='unicode',
    low_memory=False,
)
# usecols keeps the file's own column order, so pin the order explicitly.
df_2801 = df_2801[pair_columns]  # 805,897 pairs

In [None]:
# Load the KG2.8.3 score table, reading only the three columns this analysis
# uses instead of parsing the whole file and discarding the rest afterwards.
# All values are read as text.
pair_columns = ['drug_id', 'disease_id', 'disease_name']
df_283 = pd.read_csv(
    'KG2.8.3_DrugDiseaseScore.csv',
    usecols=pair_columns,
    dtype='unicode',
    low_memory=False,
)
# usecols keeps the file's own column order, so pin the order explicitly.
df_283 = df_283[pair_columns]  # 1,060,956 pairs

In [None]:
# Load the KG2.8.6 score table, reading only the three columns this analysis
# uses instead of parsing the whole file and discarding the rest afterwards.
# All values are read as text.
pair_columns = ['drug_id', 'disease_id', 'disease_name']
df_286 = pd.read_csv(
    'KG2.8.6_DrugDiseaseScore.csv',
    usecols=pair_columns,
    dtype='unicode',
    low_memory=False,
)
# usecols keeps the file's own column order, so pin the order explicitly.
df_286 = df_286[pair_columns]  # 2,172,813 pairs

### 2801 intersect 283?

In [None]:
# Inner join on the (drug_id, disease_id) key: only pairs found in both the
# KG2.8.0.1 and KG2.8.3 tables survive. The disease_name column exists on both
# sides, so each copy is tagged with its version suffix.
df_2801_intr_283 = df_2801.merge(
    df_283,
    on=['drug_id', 'disease_id'],
    how='inner',
    suffixes=('_2801', '_283'),
)
df_2801_intr_283  # 805897 pairs

In [None]:
# Size of the 2801/283 intersection relative to each source table.
# Row counts stand in for pair counts here, which presumes every
# (drug_id, disease_id) pair occurs once per table -- TODO confirm.
n_shared = len(df_2801_intr_283)

percent_match_1 = n_shared / len(df_2801) * 100
percent_match_2 = n_shared / len(df_283) * 100

print(f"\nPercentage of IDs in df_2801 that match: {percent_match_1:.2f}%")
print(f"\nPercentage of IDs in df_283 that match: {percent_match_2:.2f}%")

### 2801 intersect 286?

In [None]:
# Inner join on the (drug_id, disease_id) key: only pairs found in both the
# KG2.8.0.1 and KG2.8.6 tables survive. The disease_name column exists on both
# sides, so each copy is tagged with its version suffix.
df_2801_intr_286 = df_2801.merge(
    df_286,
    on=['drug_id', 'disease_id'],
    how='inner',
    suffixes=('_2801', '_286'),
)
df_2801_intr_286  # 64269 pairs

In [None]:
# Size of the 2801/286 intersection relative to each source table.
# Row counts stand in for pair counts here, which presumes every
# (drug_id, disease_id) pair occurs once per table -- TODO confirm.
n_shared = len(df_2801_intr_286)

percent_match_1 = n_shared / len(df_2801) * 100
percent_match_2 = n_shared / len(df_286) * 100

print(f"\nPercentage of IDs in df_2801 that match: {percent_match_1:.2f}%")
print(f"\nPercentage of IDs in df_286 that match: {percent_match_2:.2f}%")

### 283 intersect 286?

In [None]:
# Inner join on the (drug_id, disease_id) key: only pairs found in both the
# KG2.8.3 and KG2.8.6 tables survive. The disease_name column exists on both
# sides, so each copy is tagged with its version suffix.
df_283_intr_286 = df_283.merge(
    df_286,
    on=['drug_id', 'disease_id'],
    how='inner',
    suffixes=('_283', '_286'),
)
df_283_intr_286  # 84278 pairs

In [None]:
# Size of the 283/286 intersection relative to each source table.
# Row counts stand in for pair counts here, which presumes every
# (drug_id, disease_id) pair occurs once per table -- TODO confirm.
n_shared = len(df_283_intr_286)

percent_match_1 = n_shared / len(df_283) * 100
percent_match_2 = n_shared / len(df_286) * 100

print(f"\nPercentage of IDs in df_283 that match: {percent_match_1:.2f}%")
print(f"\nPercentage of IDs in df_286 that match: {percent_match_2:.2f}%")

### Pairs present in all three KG versions

In [None]:
# Drug-disease pairs present in ALL THREE of KG2.8.0.1, KG2.8.3 and KG2.8.6.
#
# Each table's disease_name column is renamed to its version tag *before* the
# merge, so the result columns are named explicitly. The earlier form passed a
# three-element `suffixes` tuple (merge takes exactly two) that was never
# applied, then relabelled the columns by position.
pair_keys = ['drug_id', 'disease_id']
df = (
    df_2801.rename(columns={'disease_name': '2801'})
    .merge(df_283.rename(columns={'disease_name': '283'}), on=pair_keys)
    .merge(df_286.rename(columns={'disease_name': '286'}), on=pair_keys)
)
# Resulting columns: drug_id, disease_id, 2801, 283, 286

df_all = df[pair_keys]
df_all  # 64269 pairs

In [None]:
# Share of each KG version's pairs that survive in the three-way intersection.
dataframes = {'df_2801': df_2801, 'df_283': df_283, 'df_286': df_286}
for name, frame in dataframes.items():
    # The loop variable is `frame`, not `df`: reusing `df` would silently
    # replace the merged three-way table with the last source table and make
    # the notebook depend on cell execution order.
    percent_match = len(df_all) / len(frame) * 100
    print(f"\nPercentage of matching IDs in {name}: {percent_match:.2f}%")

### Venn Diagram 

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

# Row counts of each version's table and of the pairwise / three-way intersections.
A, B, C = len(df_2801), len(df_283), len(df_286)
AB, AC, BC, ABC = len(df_2801_intr_283), len(df_2801_intr_286), len(df_283_intr_286), len(df_all)

# Exclusive size of every Venn region, by inclusion-exclusion. Keys are venn3
# region ids: one bit per set (KG2.8.0.1, KG2.8.3, KG2.8.6), '1' = inside.
labels = {'100': A - AB - AC + ABC, '010': B - AB - BC + ABC, '001': C - AC - BC + ABC,
          '110': AB - ABC, '101': AC - ABC, '011': BC - ABC, '111': ABC}

# venn3 takes the *exclusive* region sizes in the order
# (Abc, aBc, ABc, abC, AbC, aBC, ABC). Passing the raw set and intersection
# totals instead draws regions whose areas do not match the counts shown.
region_order = ('100', '010', '110', '001', '101', '011', '111')
region_sizes = tuple(labels[region_id] for region_id in region_order)

# Create the Venn diagram. Each region is labelled with its size by venn3
# itself, so no manual label override is needed (and none can hit the missing
# label object of an empty region).
fig, ax = plt.subplots(figsize=(8, 8))
venn_diagram = venn3(subsets=region_sizes, set_labels=('KG2.8.0.1', 'KG2.8.3', 'KG2.8.6'), ax=ax)

ax.set_title("Three-Way Venn Diagram: Drug-Disease Pair")
plt.show()