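# Download OLIDA Excel files into the Jupyter notebook data folder
On the OLIDA website, download the Excel exports for "Genes", "Diseases", and "Gene Combinations", then upload them to the notebook's data folder. The code below reads them as `../data/Gene.xlsx`, `../data/Disease.xlsx`, and `../data/GeneCombination.xlsx`.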

# Import modules

In [1]:
import pandas as pd

# Get Excel data

In [2]:
# Load the gene, disease and gene-combination exports in one pass.
# skiprows=1 skips the first row of every sheet, so the second row supplies
# the column headers (presumably the first row is a title line -- verify
# against the downloaded files).
df_gene, df_disease, df_combo = (
    pd.read_excel(excel_path, skiprows=1)
    for excel_path in (
        '../data/Gene.xlsx',
        '../data/Disease.xlsx',
        '../data/GeneCombination.xlsx',
    )
)

# Prepare dataframes

In [3]:
# Prepare each frame for exploding and merging: rename the columns that the
# later cells refer to, then drop the columns that are not needed.
# Each frame is rebuilt with a method chain instead of inplace=True, so the
# objects returned by read_excel are never mutated.
# NOTE: drop() raises KeyError when a listed column is absent (pandas default
# errors='raise'), so this cell expects freshly loaded frames.

# Gene table: no renames, only remove the unused annotation columns.
df_gene = df_gene.drop(columns=[
    'Ensembl Id',
    'Entrez Id',
    'Chromosome',
    'Uniprot Accession Number',
    'Gene Ontology Molecular Function',
    'Essential In Mouse',
    'Pathway',
    'Variants',
    'Diseases',
])

# Disease table: 'Disease Name' becomes 'Diseases'.
df_disease = (
    df_disease
    .rename(columns={'Disease Name': 'Diseases'})
    .drop(columns=['Orphanet ID', 'ICD-10 Category', 'Omim Id'])
)

# Combination table: both renames in a single call; 'Common Pathways' is
# listed once (it was duplicated in the drop list before).
df_combo = (
    df_combo
    .rename(columns={
        'Oligogenic variant combinations': 'Combinations',
        'Genes': 'Partner Genes',
    })
    .drop(columns=[
        'Entry Id',
        'Genes Relationship',
        'Protein Interactions',
        'Common Pathways',
        'GENEmeta',
    ])
)

# Define function

In [4]:
def combo_explode(df):
    """Return a copy of ``df`` with one row per entry of its 'Combinations' column.

    Every 'Combinations' cell is read as a list of identifiers separated by
    ';' and/or whitespace (e.g. ``"OLI001; OLI002"``). The cell is split into
    a list, the frame is exploded so that each identifier gets its own row,
    and the result is sorted by 'Combinations'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued 'Combinations' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``. Rows that originate from
        the same input row keep that row's index label. Missing
        'Combinations' cells stay as a single row with a missing value and
        are sorted last.

    Raises
    ------
    KeyError
        If ``df`` has no 'Combinations' column.
    """
    # Turn ';' into a space (rather than deleting it) so "A;B" still yields
    # two identifiers; .str.split() with no pattern splits on any whitespace
    # run and leaves missing cells as NaN instead of raising.
    combos = df['Combinations'].str.replace(';', ' ', regex=False).str.split()
    # assign() builds a new frame, so the caller's column is left untouched.
    newdf = df.assign(Combinations=combos).explode('Combinations')
    return newdf.sort_values("Combinations")

# Execute function

In [5]:
# Run combo_explode over the gene, disease and combination frames (in that
# order) and rebind each name to its exploded, sorted result.
df_gene, df_disease, df_combo = map(
    combo_explode,
    (df_gene, df_disease, df_combo),
)

In [6]:
# Left-join on "Combinations" in two steps, keeping every gene row:
#   1. genes  + diseases     -> mergedf
#   2. result + combinations -> mergedf2
mergedf = df_gene.merge(
    right=df_disease,
    how="left",
    on="Combinations",
)
mergedf2 = mergedf.merge(
    right=df_combo,
    how="left",
    on="Combinations",
)

# Display the final merged frame as the cell output
mergedf2

Unnamed: 0,Gene Name,Combinations,Diseases,ICD-10 ID,Partner Genes
0,CPO,OLI001,Hereditary coproporphyria,E80.2,ALAD; CPO
1,ALAD,OLI001,Hereditary coproporphyria,E80.2,ALAD; CPO
2,NOTCH3,OLI002,Idiopathic pulmonary arterial hypertension,I27.0,BMPR2; NOTCH3
3,BMPR2,OLI002,Idiopathic pulmonary arterial hypertension,I27.0,BMPR2; NOTCH3
4,NOTCH3,OLI003,Idiopathic pulmonary arterial hypertension,I27.0,BMPR2; NOTCH3
...,...,...,...,...,...
4003,AR,OLI998,"46,XY disorder of sex development",N.A.,AR; MYH6; NR5A1
4004,SRD5A2,OLI999,Androgen insensitivity syndrome,E34.5,AR; SRD5A2
4005,SRD5A2,OLI999,"46,XY disorder of sex development",N.A.,AR; SRD5A2
4006,AR,OLI999,Androgen insensitivity syndrome,E34.5,AR; SRD5A2


# Save to local file

In [7]:
# Write the merged table to CSV; index=False leaves the row index out of the file
mergedf2.to_csv(
    path_or_buf='../data/olidaexcel.csv',
    index=False,
)