In [3]:
import json
import pandas as pd

# Load the OLIDA API response.
# JSON is UTF-8 by specification, so the encoding is passed explicitly instead
# of relying on the platform default (cp1252 on Windows), which can mis-decode
# or reject non-ASCII text.
with open("C:/Users/maliz/thesa/UKbiobank/data/response_1759850491173.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Fields copied verbatim from attributes.scores, in output column order.
SCORE_FIELDS = [
    "familial_evidence",  # (1, 2, 3, )
    # Genetic evidence - Statistical Manual Score (STATmanual) (1 or 2)
    "statistical_evidence_manual",
    "statistical_evidence_knowledge",
    "statistical_evidence_metascore",
    # Functional evidence - Gene combination Manual Score (GENEmanual) (0-3)
    "functional_gene_evidence_manual",
    "functional_gene_evidence_manual_harmonized",
    "functional_gene_evidence_knowledge",
    "functional_gene_evidence_metascore",
    "functional_variant_evidence_manual",
    "functional_variant_evidence_knowledge",
    "functional_variant_evidence_metascore",
    # Functional evidence - The Aggregated Functional Manual Score (FUNmanual) (0-3)
    "functional_final_manual",
    "functional_final_metascore",
    # The final manual curation confidence score (FINALmanual)
    "final_manual",
    "final_metascore",
]

# Full column layout of the resulting frame; passed to the constructor so the
# schema is stable even when the response contains no entries.
OLIDA_COLUMNS = [
    "OLIDA_ID",
    "genes_id",
    *SCORE_FIELDS,
    "omim_id",
    "oligogenic_effect",
    "status",
    "ethnicity",
]

# One flat record per entry in the response.
rows = []

# `x.get(key) or {}` (rather than `x.get(key, {})`) also covers keys that are
# present but set to JSON null, which would otherwise raise on the next access.
for item in data.get("data") or []:
    attr = item.get("attributes") or {}
    scores = attr.get("scores") or {}

    # Only the gene-combination id is available here; the actual gene list
    # would have to be fetched from another endpoint.
    relationships = item.get("relationships") or {}
    genes_data = (relationships.get("gene_combination") or {}).get("data")
    genes = genes_data.get("id") if genes_data else None

    row = {"OLIDA_ID": item.get("id"), "genes_id": genes}
    row.update((field, scores.get(field)) for field in SCORE_FIELDS)
    # OMIM ids are flattened to one comma-separated string; str() guards
    # against numeric ids, and a null/missing list becomes "".
    row["omim_id"] = ",".join(str(omim) for omim in (attr.get("omim_id") or []))
    row["oligogenic_effect"] = attr.get("oligogenic_effect")
    row["status"] = attr.get("status")
    row["ethnicity"] = attr.get("ethnicity")

    rows.append(row)

# Create DataFrame
df_olida_with_scores = pd.DataFrame(rows, columns=OLIDA_COLUMNS)


import pandas as pd
# Gene-combination table exported from OLIDA; header=1 takes the second line
# of the file as the header row.
df = pd.read_csv("C:/Users/maliz/thesa/UKbiobank/data/GeneCombination.csv" , header = 1 )


# Step 1: number of genes per row (";"-separated list), computed once and reused
num_genes = df["Genes"].str.count(";") + 1
max_genes = num_genes.max()
print("Maximum number of genes in a row:", max_genes)

# Step 2: split the 'Genes' column into separate columns.
# Entries are written as "A; B", so each piece is stripped; otherwise every
# symbol after the first keeps a leading space.
gene_cols = df["Genes"].str.split(";", expand=True).apply(lambda col: col.str.strip())

# Step 3: rename the columns. The count comes from the split result itself so
# the naming cannot disagree with the data (max_genes is a float, and unusable
# in range(), whenever 'Genes' has a missing value).
gene_cols.columns = [f"symbol_{i+1}" for i in range(gene_cols.shape[1])]

# Step 4: combine back to the original df
df = pd.concat([df, gene_cols], axis=1)
df["num_genes"] = num_genes

# Count how many rows have each number of genes
gene_counts = df["num_genes"].value_counts().sort_index()

print(gene_counts)

# Keep only digenic combinations and the columns used downstream.
two_gene_columns = ['Entry Id', 'Genes', 'Genes Relationship', 'Protein Interactions',
       'Common Pathways', 'GENEmeta', 'Oligogenic variant combinations','symbol_1', 'symbol_2']
df_two_genes = df.loc[df["num_genes"] == 2, two_gene_columns].copy()


# The ";"-separated list of OLIDA variant-combination IDs for each gene pair.
olida_field = df_two_genes["Oligogenic variant combinations"]

# Step 1: number of OLIDA IDs per row, and the largest such count
df_two_genes["num_olida"] = olida_field.str.count(";").add(1)
max_olida = df_two_genes["num_olida"].max()
print("Maximum OLIDA IDs in a row:", max_olida)

# Step 2: one column per OLIDA ID, named OLIDA_1, OLIDA_2, ...
# (values are not stripped here, so IDs after the first keep a leading space)
olida_cols = olida_field.str.split(";", expand=True)
olida_cols.columns = ["OLIDA_" + str(position + 1) for position in range(len(olida_cols.columns))]

# Step 3: append the ID columns to the pair table
df_two_genes = pd.concat([df_two_genes, olida_cols], axis=1)

# Step 4: how many rows have each number of OLIDA IDs
# (num_olida is unchanged by the concat, so it is not recomputed)
row_counts = df_two_genes["num_olida"].value_counts().sort_index()

print(row_counts)


# Work on a copy so this cell can be re-run without stacking score columns
# onto df_two_genes.
df_two_genes_ = df_two_genes.copy()

# The OLIDA_1, OLIDA_2, ... ID columns produced by the split step
olida_cols = [col for col in df_two_genes_.columns if col.startswith("OLIDA_")]

# Lookup: OLIDA_ID -> final manual curation score (final_manual / FINALmanual)
score_map = dict(zip(df_olida_with_scores["OLIDA_ID"], df_olida_with_scores["final_manual"]))

# Add one "<OLIDA column>_score" column per ID column. IDs are stripped before
# the lookup because the split leaves a leading space; missing IDs become the
# strings "None"/"nan", which are not in the map and therefore give NaN.
for col in olida_cols:
    df_two_genes_[col + "_score"] = df_two_genes_[col].astype(str).str.strip().map(score_map)

# Normalise the two gene symbols (removes the space left by the ";" split)
df_two_genes_['symbol_1'] = df_two_genes_['symbol_1'].astype(str).str.strip()
df_two_genes_['symbol_2'] = df_two_genes_['symbol_2'].astype(str).str.strip()

# Show first few rows. This must be the last expression of the cell to be
# displayed; it previously sat mid-cell where its result was discarded.
df_two_genes_.head()


Maximum number of genes in a row: 17
num_genes
2     881
3     252
4      78
5      42
6      11
7       4
8       6
9       1
10      1
14      2
17      1
Name: count, dtype: int64
Maximum OLIDA IDs in a row: 37
num_olida
1     691
2      94
3      41
4      17
5      14
6       8
7       1
8       5
9       1
10      1
11      3
13      2
15      1
20      1
37      1
Name: count, dtype: int64


In [15]:
import pandas as pd
from biomart import BiomartServer
import io
import time

def annotate_and_filter_diff_chr(df, batch_size=100, sleep_time=0.5):
    """
    Annotate gene pairs with Ensembl gene IDs and chromosomes using BioMart
    (queried in batches), then keep only pairs where chr_1 != chr_2.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame with columns ['symbol_1', 'symbol_2'] holding gene
        symbols. Columns that are entirely empty are dropped first.
    batch_size : int, optional
        Number of genes per BioMart query (default is 100).
    sleep_time : float, optional
        Pause between queries in seconds (default is 0.5).

    Returns
    -------
    pd.DataFrame
        The input rows with 'gene_1', 'chr_1', 'gene_2', 'chr_2' added and
        same-chromosome pairs removed. Rows whose symbol could not be
        annotated have NaN in those columns and are KEPT (NaN != NaN is
        True), so callers should check for and resolve missing annotations.

    Raises
    ------
    ValueError
        If no batch returned any annotation.
    """
    df_filtered_nonempty = df.dropna(axis=1, how='all')
    # Connect to BioMart
    print("üîó Connecting to BioMart...")
    server = BiomartServer("http://www.ensembl.org/biomart")
    dataset = server.datasets['hsapiens_gene_ensembl']

    # Extract unique gene symbols. Missing values are removed: a NaN in the
    # filter list would make the whole batch fail inside the try below.
    symbols = pd.unique(df_filtered_nonempty[['symbol_1', 'symbol_2']].values.ravel())
    genes = [symbol for symbol in symbols if pd.notna(symbol)]
    print(f"üß¨ Found {len(genes)} unique genes to annotate")

    # Query BioMart in batches
    annotation_list = []
    for i in range(0, len(genes), batch_size):
        batch_genes = genes[i:i + batch_size]
        print(f"üîπ Querying genes {i + 1}‚Äì{i + len(batch_genes)}...")
        try:
            response = dataset.search({
                'filters': {'hgnc_symbol': batch_genes},
                'attributes': ['ensembl_gene_id', 'hgnc_symbol', 'chromosome_name']
            })
            response_text = response.text
            # dtype=str is essential: a batch whose chromosomes are all
            # numeric would otherwise be parsed as integers, and the string
            # based isin() filter below would silently discard those genes.
            tmp_df = pd.read_csv(io.StringIO(response_text), sep='\t', header=None, dtype=str)
            annotation_list.append(tmp_df)
        except Exception as e:
            # Best effort: a failed batch is reported and skipped; its genes
            # simply end up without annotation.
            print(f"‚ö†Ô∏è Error on batch {i + 1}‚Äì{i + len(batch_genes)}: {e}")
        time.sleep(sleep_time)

    # Combine all batches
    if not annotation_list:
        raise ValueError("No annotations retrieved from BioMart.")
    annotation_df = pd.concat(annotation_list, ignore_index=True)
    annotation_df.columns = ['gene', 'symbol', 'chr']

    # Keep only valid chromosomes (autosomes, X and Y); other chromosome
    # names returned by BioMart are discarded.
    valid_chromosomes = [str(i) for i in range(1, 23)] + ['X', 'Y']
    annotation_df = annotation_df[annotation_df['chr'].isin(valid_chromosomes)]

    # NOTE(review): a symbol that still maps to more than one Ensembl ID at
    # this point will duplicate its rows in the merges below.

    # Merge annotation for symbol_1
    df_annot = df_filtered_nonempty.merge(
        annotation_df.rename(columns={
            'symbol': 'symbol_1',
            'gene': 'gene_1',
            'chr': 'chr_1'
        }),
        on='symbol_1', how='left'
    )

    # Merge annotation for symbol_2
    df_annot = df_annot.merge(
        annotation_df.rename(columns={
            'symbol': 'symbol_2',
            'gene': 'gene_2',
            'chr': 'chr_2'
        }),
        on='symbol_2', how='left'
    )

    # Keep only pairs from different chromosomes
    df_annot = df_annot[df_annot['chr_1'] != df_annot['chr_2']]

    print(f"‚úÖ Finished: {df_annot.shape[0]} pairs remain after filtering.")
    return df_annot


## SCORE 1

In [11]:
import pandas as pd
import numpy as np

# All per-OLIDA-ID score columns (OLIDA_<n>_score)
olida_score_cols = [name for name in df_two_genes_.columns if name.endswith('_score')]

# Score-1 group: the pair has at least one combination scored exactly 1
# and no combination scored above 1.
score_table = df_two_genes_[olida_score_cols]
has_score_one = score_table.eq(1).any(axis=1)
has_higher_score = score_table.gt(1).any(axis=1)
df_filtered_score_1 = df_two_genes_[has_score_one & ~has_higher_score]

print(df_filtered_score_1.shape)


(115, 84)


## SCORE 0

In [17]:

# Score-0 group: every score of the pair is either below 1 or missing.
# Pairs with no score at all therefore land in this group too.
zero_group_scores = df_two_genes_[olida_score_cols]
below_one_or_missing = zero_group_scores.lt(1) | zero_group_scores.isna()
df_filtered_0 = df_two_genes_.loc[below_one_or_missing.all(axis=1)]
df_filtered_0.shape
#

(684, 84)

In [16]:
df_annot = annotate_and_filter_diff_chr(df_filtered_0)
df_annot

üîó Connecting to BioMart...
üß¨ Found 604 unique genes to annotate
üîπ Querying genes 1‚Äì100...
üîπ Querying genes 101‚Äì200...
üîπ Querying genes 201‚Äì300...
üîπ Querying genes 301‚Äì400...
üîπ Querying genes 401‚Äì500...
üîπ Querying genes 501‚Äì600...
üîπ Querying genes 601‚Äì604...
‚úÖ Finished: 654 pairs remain after filtering.


Unnamed: 0,Entry Id,Genes,Genes Relationship,Protein Interactions,Common Pathways,GENEmeta,Oligogenic variant combinations,symbol_1,symbol_2,num_olida,...,OLIDA_10_score,OLIDA_11_score,OLIDA_12_score,OLIDA_13_score,OLIDA_14_score,OLIDA_15_score,gene_1,chr_1,gene_2,chr_2
0,2,BMPR2; NOTCH3,Involved in the same disease;Same pathway,N.A.,MicroRNAs in cancer;Signal Transduction,1,OLI002; OLI003,BMPR2,NOTCH3,2,...,,,,,,,ENSG00000204217,2,ENSG00000074181,19
1,3,ABCC8; NOTCH3,Involved in the same disease;Same pathway,N.A.,Disease,1,OLI004,ABCC8,NOTCH3,1,...,,,,,,,ENSG00000006071,11,ENSG00000074181,19
2,4,ABCC8; SARS2,Involved in the same disease,N.A.,N.A.,0,OLI005,ABCC8,SARS2,1,...,,,,,,,ENSG00000006071,11,ENSG00000104835,19
3,5,SMAD1; TBX4,Involved in the same disease,N.A.,N.A.,0,OLI006,SMAD1,TBX4,1,...,,,,,,,ENSG00000170365,4,ENSG00000121075,17
4,12,ANO5; COL6A2,Involved in the same disease;Relevant pathways...,N.A.,N.A.,1,OLI016,ANO5,COL6A2,1,...,,,,,,,ENSG00000171714,11,ENSG00000142173,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680,1272,LMNA; TTN,Indirectly interacting,N.A.,Dilated cardiomyopathy;Hypertrophic cardiomyop...,1,OLI1797,LMNA,TTN,1,...,,,,,,,ENSG00000160789,1,ENSG00000155657,2
681,1273,FLNC; TTN,Involved in the same disease;Indirectly intera...,N.A.,N.A.,1,OLI1798,FLNC,TTN,1,...,,,,,,,ENSG00000128591,7,ENSG00000155657,2
682,1274,TNNT2; TTN,Involved in the same disease;Indirectly intera...,N.A.,Muscle contraction;Dilated cardiomyopathy;Hype...,1,OLI1799,TNNT2,TTN,1,...,,,,,,,ENSG00000118194,1,ENSG00000155657,2
683,1275,DSC2; TNNT2,N.A.,N.A.,N.A.,0,OLI1800,DSC2,TNNT2,1,...,,,,,,,ENSG00000134755,18,ENSG00000118194,1


## SCORE 2+3

In [12]:
import pandas as pd
import numpy as np

# All per-OLIDA-ID score columns (OLIDA_<n>_score)
olida_score_cols = [name for name in df_two_genes_.columns if name.endswith('_score')]

# Score-2+3 group: the pair has at least one combination scored above 1.
any_above_one = df_two_genes_[olida_score_cols].gt(1).any(axis=1)
df_filtered_2_3 = df_two_genes_[any_above_one]

# Same rows, without the columns that are empty for every row in this group.
df_filtered_2_3_nonempty = df_filtered_2_3.dropna(axis=1, how='all')


print(df_filtered_2_3.shape)

(82, 84)


In [195]:
df_annot[df_annot.gene_1.isnull()].symbol_1.unique()

array(['GBA', 'CCDC103'], dtype=object)

In [193]:
df_annot[df_annot.gene_2.isnull()].symbol_2.unique()

array([], dtype=object)

In [197]:
# Manual annotation for the two symbols BioMart did not resolve.
# GBA is ENSG00000177628 on chromosome 1. The value used here before,
# ENSG00000105948 / chr 7, is the Ensembl ID this notebook assigns to TTC26,
# i.e. a copy-paste slip that exported GBA pairs under the wrong gene.
df_annot.loc[df_annot['symbol_1'] == 'GBA', ['gene_1', 'chr_1']] = ['ENSG00000177628', '1']
df_annot.loc[df_annot['symbol_1'] == 'CCDC103', ['gene_1', 'chr_1']] = ['ENSG00000167131', '17']

# The different-chromosome filter ran while chr_1 was still missing for these
# rows, so it is re-applied now that the chromosomes are known. Rows with a
# missing chromosome are left untouched.
both_chr_known = df_annot['chr_1'].notna() & df_annot['chr_2'].notna()
same_chr = both_chr_known & (df_annot['chr_1'].astype(str) == df_annot['chr_2'].astype(str))
df_annot = df_annot[~same_chr]

export_cols = ['symbol_1', 'gene_1', 'chr_1', 'symbol_2', 'gene_2', 'chr_2']
df_annot[export_cols].to_csv("C:/Users/maliz/thesa/UKbiobank/data/groups/olida_pairs_filtered_by_score_0_original.csv", index = False)

In [13]:

pd.read_csv("C:/Users/maliz/thesa/UKbiobank/data/groups/olida_pairs_filtered_by_score_0.csv")

Unnamed: 0,symbol_1,gene_1,chr_1,symbol_2,gene_2,chr_2
0,BMPR2,ENSG00000204217,2,NOTCH3,ENSG00000074181,19
1,ABCC8,ENSG00000006071,11,NOTCH3,ENSG00000074181,19
2,ABCC8,ENSG00000006071,11,SARS2,ENSG00000104835,19
3,SMAD1,ENSG00000170365,4,TBX4,ENSG00000121075,17
4,ANO5,ENSG00000171714,11,COL6A2,ENSG00000142173,21
...,...,...,...,...,...,...
551,ELANE,ENSG00000197561,19,HAX1,ENSG00000143575,1
552,MYH7,ENSG00000092054,14,TPM1,ENSG00000140416,15
553,FLNC,ENSG00000128591,7,TTN,ENSG00000155657,2
554,TNNT2,ENSG00000118194,1,TTN,ENSG00000155657,2


In [9]:
82+684+115

881

In [157]:
# NOTE(review): `df_filtered_nonempty` is not assigned in any cell visible in
# this notebook, so this cell fails on a fresh kernel. It is presumably the
# annotated score-1 pair table — confirm and restore the cell that defines it.
# Manually fill the Ensembl ID and chromosome for rows whose second gene is
# SLC9A3R1.
df_filtered_nonempty.loc[df_filtered_nonempty['symbol_2'] == 'SLC9A3R1', ['gene_2', 'chr_2']] = ['ENSG00000109062', '17']


In [158]:
df_filtered_nonempty[df_filtered_nonempty.gene_1.isnull()]

Unnamed: 0,Entry Id,Genes,Genes Relationship,Protein Interactions,Common Pathways,GENEmeta,Oligogenic variant combinations,symbol_1,symbol_2,num_olida,...,OLIDA_8_score,OLIDA_9_score,OLIDA_10_score,OLIDA_11_score,OLIDA_12_score,OLIDA_13_score,gene_1,chr_1,gene_2,chr_2


In [159]:
df_filtered_nonempty.loc[df_filtered_nonempty['symbol_2'] == 'TTC26', ['gene_2', 'chr_2']] = ['ENSG00000105948', '7']


In [127]:
 df_filtered_nonempty[['symbol_1','gene_1', 'chr_1' , 'symbol_2' , 'gene_2',	'chr_2']].to_csv("C:/Users/maliz/thesa/UKbiobank/data/groups/olida_pairs_filtered_by_score_1_original.csv", index = False)

In [114]:
# Drop OLIDA columns that are all empty

# If you want, check which columns remain
df_filtered_2_3_nonempty.shape


(82, 84)

In [178]:
684+82+115

881

In [115]:
import pandas as pd
from biomart import BiomartServer
import io

# Connect to BioMart
server = BiomartServer("http://www.ensembl.org/biomart")
dataset = server.datasets['hsapiens_gene_ensembl']
# Unique gene symbols from both sides of the pairs; missing values are
# removed so they are not sent as a filter value.
symbols = pd.unique(df_filtered_2_3_nonempty[['symbol_1' ,'symbol_2']].values.ravel())
genes = [symbol for symbol in symbols if pd.notna(symbol)]

# Perform the query
response = dataset.search({
    'filters': {
        'hgnc_symbol': genes
    },
    'attributes': [
        'ensembl_gene_id',
        'hgnc_symbol',
        'chromosome_name',
    ]
})

# dataset.search returns a response object; the tab-separated table is in .text
response_text = response.text

# Parse with pandas. dtype=str keeps chromosome names as strings even when
# every returned chromosome is numeric; otherwise they are parsed as integers
# and a later isin() against string chromosome names matches nothing.
annotation_df = pd.read_csv(io.StringIO(response_text), sep='\t', header=None, dtype=str)

# Add column names (same order as the requested attributes)
annotation_df.columns = [
    'gene', 'symbol', 'chr'
]


# Filter the annotation_df
#annotation_df = annotation_df[annotation_df['chr'].isin(valid_chromosomes)]


In [116]:
# Chromosome names kept for the analysis: autosomes 1-22 plus X and Y.
valid_chromosomes = [*map(str, range(1, 23)), 'X', 'Y']

# Discard annotation rows located on any other chromosome name.
on_valid_chromosome = annotation_df['chr'].isin(valid_chromosomes)
annotation_df = annotation_df.loc[on_valid_chromosome]
annotation_df.shape

(148, 3)

In [117]:
# Attach Ensembl ID and chromosome to each side of the pair.
# The cell rebinds its own input, so annotation columns left over from a
# previous run are dropped first; without this a re-run would produce
# gene_1_x / gene_1_y columns and break the cells that read gene_1.
annotation_cols = ['gene_1', 'chr_1', 'gene_2', 'chr_2']
pairs_2_3 = df_filtered_2_3_nonempty.drop(columns=annotation_cols, errors='ignore')

# Left merges keep every pair; symbols missing from annotation_df get NaN.
for side in ('1', '2'):
    pairs_2_3 = pairs_2_3.merge(
        annotation_df.rename(columns={
            'symbol': f'symbol_{side}',
            'gene': f'gene_{side}',
            'chr': f'chr_{side}'
        }),
        on=f'symbol_{side}',
        how='left'
    )

df_filtered_2_3_nonempty = pairs_2_3

df_filtered_2_3_nonempty

Unnamed: 0,Entry Id,Genes,Genes Relationship,Protein Interactions,Common Pathways,GENEmeta,Oligogenic variant combinations,symbol_1,symbol_2,num_olida,...,OLIDA_32_score,OLIDA_33_score,OLIDA_34_score,OLIDA_35_score,OLIDA_36_score,OLIDA_37_score,gene_1,chr_1,gene_2,chr_2
0,1,ALAD; CPO,Involved in the same disease;Same pathway;Rele...,N.A.,N.A.,2,OLI001,ALAD,CPO,1,...,,,,,,,ENSG00000148218,9,ENSG00000144410,2
1,6,ADD3; KAT2B,Affecting the same tissue;Relevant pathways fo...,N.A.,N.A.,3,OLI007,ADD3,KAT2B,1,...,,,,,,,ENSG00000148700,10,ENSG00000114166,3
2,7,SLC34A1; SLC34A3,Involved in the same disease;Affecting the sam...,N.A.,Sodium-coupled phosphate cotransporters;Transp...,3,OLI008; OLI1790; OLI1791,SLC34A1,SLC34A3,3,...,,,,,,,ENSG00000131183,5,ENSG00000198569,9
3,8,CYP1B1; TEK,Involved in the same disease;Directly interact...,N.A.,N.A.,3,OLI009; OLI010; OLI011; OLI012,CYP1B1,TEK,4,...,,,,,,,ENSG00000138061,2,ENSG00000120156,9
4,10,PROKR2; WDR11,Involved in the same disease;Relevant pathways...,N.A.,N.A.,1,OLI014; OLI1164,PROKR2,WDR11,2,...,,,,,,,ENSG00000101292,20,ENSG00000120008,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,1248,IFT172; TTC21B,Monogenic experiments only;Relevant pathways f...,N.A.,Organelle biogenesis and maintenance;Hedgehog ...,2,OLI1741,IFT172,TTC21B,1,...,,,,,,,ENSG00000138002,2,ENSG00000123607,2
78,1259,NLRP3; NOD2,Monogenic experiments only;Involved in the sam...,N.A.,NOD-like receptor signaling pathway;Immune Sys...,2,OLI1768; OLI1769; OLI1770; OLI1771; OLI1772,NLRP3,NOD2,5,...,,,,,,,ENSG00000162711,1,ENSG00000167207,16
79,1270,EFTUD2; EYA3,Involved in the same disease;Relevant pathways...,N.A.,N.A.,2,OLI1795,EFTUD2,EYA3,1,...,,,,,,,ENSG00000108883,17,ENSG00000158161,1
80,1276,AMHR2; GNRH1,Monogenic experiments only;Affecting the same ...,N.A.,Signal Transduction,2,OLI1801,AMHR2,GNRH1,1,...,,,,,,,ENSG00000135409,12,ENSG00000147437,8


In [118]:
df_filtered_2_3_nonempty[df_filtered_2_3_nonempty.gene_1.isnull()]

Unnamed: 0,Entry Id,Genes,Genes Relationship,Protein Interactions,Common Pathways,GENEmeta,Oligogenic variant combinations,symbol_1,symbol_2,num_olida,...,OLIDA_32_score,OLIDA_33_score,OLIDA_34_score,OLIDA_35_score,OLIDA_36_score,OLIDA_37_score,gene_1,chr_1,gene_2,chr_2
11,35,NOT_HGNC 4q35; SMCHD1,Involved in the same disease;Indirectly intera...,N.A.,N.A.,2,OLI051,NOT_HGNC 4q35,SMCHD1,1,...,,,,,,,,,ENSG00000101596,18


In [119]:
df_filtered_2_3_nonempty[df_filtered_2_3_nonempty.gene_2.isnull()]

Unnamed: 0,Entry Id,Genes,Genes Relationship,Protein Interactions,Common Pathways,GENEmeta,Oligogenic variant combinations,symbol_1,symbol_2,num_olida,...,OLIDA_32_score,OLIDA_33_score,OLIDA_34_score,OLIDA_35_score,OLIDA_36_score,OLIDA_37_score,gene_1,chr_1,gene_2,chr_2


In [120]:
# Remove pairs whose first gene has no Ensembl annotation.
df_filtered_2_3_nonempty = df_filtered_2_3_nonempty.dropna(subset=['gene_1'])

In [121]:
 df_filtered_2_3_nonempty[['symbol_1','gene_1', 'chr_1' , 'symbol_2' , 'gene_2',	'chr_2']].to_csv("C:/Users/maliz/thesa/UKbiobank/data/groups/olida_pairs_filtered_by_score_2_3_original.csv", index = False)

In [122]:
df_filtered_2_3_nonempty_tmp =  df_filtered_2_3_nonempty[['symbol_1','gene_1', 'chr_1' , 'symbol_2' , 'gene_2',	'chr_2']]

In [123]:
df_filtered_nonempty_tmp =  df_filtered_nonempty[['symbol_1','gene_1', 'chr_1' , 'symbol_2' , 'gene_2',	'chr_2']]

In [125]:
df_filtered_nonempty_tmp

Unnamed: 0,symbol_1,gene_1,chr_1,symbol_2,gene_2,chr_2
0,GJB2,ENSG00000165474,13,TMPRSS3,ENSG00000160183,21
1,ATP6V0A4,ENSG00000105929,7,ATP6V1B1,ENSG00000116039,2
2,RNF216,ENSG00000011275,7,SRA1,ENSG00000213523,5
3,FGFR1,ENSG00000077782,8,POLR3A,ENSG00000148606,10
4,SQSTM1,ENSG00000161011,5,TIA1,ENSG00000116001,2
...,...,...,...,...,...,...
110,COL6A3,ENSG00000163359,2,DMP1,ENSG00000152592,4
111,MYO7A,ENSG00000137474,11,SOX8,ENSG00000005513,16
112,SLC34A3,ENSG00000198569,9,SLC9A3R1,ENSG00000109062,17
113,LTBP2,ENSG00000119681,14,MYOC,ENSG00000034971,1


In [126]:
import pandas as pd

# Example: normalize gene pairs so (A, B) and (B, A) become the same
def normalize_pairs(df):
    """
    Return a copy of ``df`` in which each row's ('gene_1', 'gene_2') values
    are put in sorted order, so that (A, B) and (B, A) become the same pair.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with 'gene_1' and 'gene_2' columns holding mutually comparable
        values (e.g. Ensembl ID strings). It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with 'gene_1' <= 'gene_2' in every row. Only these two
        columns are reordered; any other per-gene columns (symbol_*, chr_*)
        are left as they were and may no longer line up with gene_1/gene_2.
    """
    df_normalized = df.copy()
    # Sort each pair in plain Python; unlike a row-wise DataFrame.apply this
    # also works on an empty frame and avoids per-row Series construction.
    ordered = [sorted(pair) for pair in zip(df['gene_1'], df['gene_2'])]
    # Lists are assigned positionally, so the original index is irrelevant.
    df_normalized['gene_1'] = [pair[0] for pair in ordered]
    df_normalized['gene_2'] = [pair[1] for pair in ordered]
    return df_normalized

# Put both pair tables into order-independent form
df1_norm = normalize_pairs(df_filtered_nonempty_tmp)
df2_norm = normalize_pairs(df_filtered_2_3_nonempty_tmp)

# Pairs present in both tables: inner-join on the normalized pair, keep only
# the two key columns, and list each pair once.
pair_key = ['gene_1', 'gene_2']
common_pairs = (
    df1_norm.merge(df2_norm, on=pair_key)
    .loc[:, pair_key]
    .drop_duplicates()
)
common_pairs

Unnamed: 0,gene_1,gene_2


In [46]:
# version_with_all_couples.loc[version_with_all_couples['symbol_1'] == 'GBA', ['gene_1', 'chr_1']] = ['ENSG00000177628', '1']


In [128]:
df_filtered_nonempty_tmp.shape

(106, 6)

In [56]:
# version_with_all_couples.loc[version_with_all_couples['symbol_2'] == 'TTC26', ['gene_2', 'chr_2']] = ['ENSG00000105948', '7']
# version_with_all_couples.loc[version_with_all_couples['symbol_2'] == 'SLC9A3R1', ['gene_2', 'chr_2']] = ['ENSG00000109062', '17']



In [76]:
df_permuted[df_permuted.chr_1 ==df_permuted.chr_2 ]

Unnamed: 0.1,Unnamed: 0,symbol_1,symbol_2,gene_1,chr_1,gene_2,chr_2,symbol_pair,gene_pair


In [78]:
df_permuted

Unnamed: 0.1,Unnamed: 0,symbol_1,symbol_2,gene_1,chr_1,gene_2,chr_2,symbol_pair,gene_pair
0,0,ALAD,CPO,ENSG00000148218,9,ENSG00000132781,1,"frozenset({'ALAD', 'CPO'})","frozenset({'ENSG00000148218', 'ENSG00000144410'})"
1,1,BMPR2,NOTCH3,ENSG00000204217,2,ENSG00000107249,9,"frozenset({'NOTCH3', 'BMPR2'})","frozenset({'ENSG00000204217', 'ENSG00000074181'})"
2,2,ABCC8,NOTCH3,ENSG00000006071,11,ENSG00000198838,15,"frozenset({'NOTCH3', 'ABCC8'})","frozenset({'ENSG00000074181', 'ENSG00000006071'})"
3,3,ABCC8,SARS2,ENSG00000006071,11,ENSG00000132600,16,"frozenset({'ABCC8', 'SARS2'})","frozenset({'ENSG00000006071', 'ENSG00000104835'})"
4,4,SMAD1,TBX4,ENSG00000170365,4,ENSG00000184156,8,"frozenset({'SMAD1', 'TBX4'})","frozenset({'ENSG00000121075', 'ENSG00000170365'})"
...,...,...,...,...,...,...,...,...,...
703,875,TNNT2,TTN,ENSG00000118194,1,ENSG00000111319,12,"frozenset({'TNNT2', 'TTN'})","frozenset({'ENSG00000155657', 'ENSG00000118194'})"
704,876,DSC2,TNNT2,ENSG00000134755,18,ENSG00000164050,3,"frozenset({'DSC2', 'TNNT2'})","frozenset({'ENSG00000134755', 'ENSG00000118194'})"
705,877,AMHR2,GNRH1,ENSG00000135409,12,ENSG00000120948,1,"frozenset({'GNRH1', 'AMHR2'})","frozenset({'ENSG00000147437', 'ENSG00000135409'})"
706,878,MYH11,MYO5A,ENSG00000133392,16,ENSG00000137601,4,"frozenset({'MYH11', 'MYO5A'})","frozenset({'ENSG00000133392', 'ENSG00000197535'})"


In [198]:
import pandas as pd

# LoF results for the score-0 group, and the full score-0 pair list
df = pd.read_csv("C:/Users/maliz/thesa/UKbiobank/data/olida_result/olida_lof_0_results.csv")
df2 = pd.read_csv("C:/Users/maliz/thesa/UKbiobank/data/groups/olida_pairs_filtered_by_score_0_original.csv")

# Every gene ID that occurs on either side of a result row
list_gene = [*df["gene_1"].unique(), *df["gene_2"].unique()]

# Keep the pairs whose two genes both occur somewhere in the results.
# NOTE(review): this is a gene-level check, not a pair-level one; a pair is
# kept even if its genes only appear in different result rows. Confirm that
# is the intended filter.
first_in_results = df2["gene_1"].isin(list_gene)
second_in_results = df2["gene_2"].isin(list_gene)
tmp = df2.loc[first_in_results & second_in_results]

In [199]:
tmp

Unnamed: 0,symbol_1,gene_1,chr_1,symbol_2,gene_2,chr_2
0,BMPR2,ENSG00000204217,2,NOTCH3,ENSG00000074181,19
1,ABCC8,ENSG00000006071,11,NOTCH3,ENSG00000074181,19
2,ABCC8,ENSG00000006071,11,SARS2,ENSG00000104835,19
3,SMAD1,ENSG00000170365,4,TBX4,ENSG00000121075,17
4,ANO5,ENSG00000171714,11,COL6A2,ENSG00000142173,21
...,...,...,...,...,...,...
647,ELANE,ENSG00000197561,19,HAX1,ENSG00000143575,1
648,MYH7,ENSG00000092054,14,TPM1,ENSG00000140416,15
650,FLNC,ENSG00000128591,7,TTN,ENSG00000155657,2
651,TNNT2,ENSG00000118194,1,TTN,ENSG00000155657,2


In [200]:
tmp.to_csv("C:/Users/maliz/thesa/UKbiobank/data/groups/olida_pairs_filtered_by_score_0.csv", index =False)

In [1]:
import pandas as pd

In [6]:
df_0 = pd.read_csv("C:/Users/maliz/thesa/UKbiobank/data/olida_result/olida_lof_0_results.csv")

In [7]:
df_0

Unnamed: 0,gene_1,gene_2,both_lof,lof_1,lof_2,expected_both_lof,p-value_lof
0,ENSG00000003393,ENSG00000142168,0.0,332,15,0.012928,0.987155
1,ENSG00000003393,ENSG00000147894,0.0,332,73,0.062916,0.939023
2,ENSG00000006071,ENSG00000074181,1.0,259,174,0.116990,0.993668
3,ENSG00000006071,ENSG00000104835,0.0,259,288,0.193638,0.823956
4,ENSG00000006283,ENSG00000184156,0.0,104,85,0.022948,0.977313
...,...,...,...,...,...,...,...
551,ENSG00000277893,ENSG00000101292,0.0,144,325,0.121491,0.885599
552,ENSG00000277893,ENSG00000112964,0.0,144,151,0.056447,0.945117
553,ENSG00000277893,ENSG00000136931,0.0,144,2,0.000748,0.999253
554,ENSG00000277893,ENSG00000144554,0.0,144,468,0.174947,0.839501


In [9]:
df_1 = pd.read_csv("C:/Users/maliz/thesa/UKbiobank/data/olida_result/olida_lof_full_version_1_results.csv")

In [11]:
df_1[['gene_1' , 'gene_2']]

Unnamed: 0,gene_1,gene_2
0,ENSG00000008710,ENSG00000170927
1,ENSG00000065618,ENSG00000054967
2,ENSG00000065618,ENSG00000110723
3,ENSG00000092529,ENSG00000175084
4,ENSG00000096996,ENSG00000115415
...,...,...
85,ENSG00000188910,ENSG00000165474
86,ENSG00000197535,ENSG00000100345
87,ENSG00000197912,ENSG00000141837
88,ENSG00000198838,ENSG00000189056


In [13]:
df_0[['gene_1' , 'gene_2']]

Unnamed: 0,gene_1,gene_2
0,ENSG00000003393,ENSG00000142168
1,ENSG00000003393,ENSG00000147894
2,ENSG00000006071,ENSG00000074181
3,ENSG00000006071,ENSG00000104835
4,ENSG00000006283,ENSG00000184156
...,...,...
551,ENSG00000277893,ENSG00000101292
552,ENSG00000277893,ENSG00000112964
553,ENSG00000277893,ENSG00000136931
554,ENSG00000277893,ENSG00000144554


In [15]:
import pandas as pd
import numpy as np
# Reduce both result tables to the two gene columns
pair_cols = ['gene_1', 'gene_2']
sorted_cols = ['gene_1_sorted', 'gene_2_sorted']
df_0 = df_0[pair_cols].copy()
df_1 = df_1[pair_cols].copy()

# Sort each pair so that (A, B) and (B, A) become identical.
# The sorted values are wrapped with the frame's own index: column assignment
# from a DataFrame aligns on index labels, so a default 0..n-1 index would
# silently produce NaN whenever the frame's index is anything else.
for frame in (df_0, df_1):
    frame[sorted_cols] = pd.DataFrame(
        np.sort(frame[pair_cols].to_numpy(), axis=1),
        index=frame.index,
        columns=sorted_cols,
    )

# Merge to find mutual (unordered) pairs between score group 0 and group 1
mutual_pairs = pd.merge(
    df_0[sorted_cols],
    df_1[sorted_cols],
    on=sorted_cols
).drop_duplicates()

print(mutual_pairs)


Empty DataFrame
Columns: [gene_1_sorted, gene_2_sorted]
Index: []


In [16]:
df_2_3 = pd.read_csv("C:/Users/maliz/thesa/UKbiobank/data/olida_result/olida_lof_results.csv")

In [17]:
df_2_3.shape

(63, 7)

In [18]:
import pandas as pd
import numpy as np
# Reduce both result tables to the two gene columns
pair_cols = ['gene_1', 'gene_2']
sorted_cols = ['gene_1_sorted', 'gene_2_sorted']
df_0 = df_0[pair_cols].copy()
df_2_3 = df_2_3[pair_cols].copy()

# Sort each pair so that (A, B) and (B, A) become identical.
# The sorted values are wrapped with the frame's own index: column assignment
# from a DataFrame aligns on index labels, so a default 0..n-1 index would
# silently produce NaN whenever the frame's index is anything else.
for frame in (df_0, df_2_3):
    frame[sorted_cols] = pd.DataFrame(
        np.sort(frame[pair_cols].to_numpy(), axis=1),
        index=frame.index,
        columns=sorted_cols,
    )

# Merge to find mutual (unordered) pairs between score group 0 and group 2+3
mutual_pairs = pd.merge(
    df_0[sorted_cols],
    df_2_3[sorted_cols],
    on=sorted_cols
).drop_duplicates()

print(mutual_pairs)


Empty DataFrame
Columns: [gene_1_sorted, gene_2_sorted]
Index: []


In [19]:
import pandas as pd
import numpy as np
# Reduce both result tables to the two gene columns
pair_cols = ['gene_1', 'gene_2']
sorted_cols = ['gene_1_sorted', 'gene_2_sorted']
df_1 = df_1[pair_cols].copy()
df_2_3 = df_2_3[pair_cols].copy()

# Sort each pair so that (A, B) and (B, A) become identical.
# The sorted values are wrapped with the frame's own index: column assignment
# from a DataFrame aligns on index labels, so a default 0..n-1 index would
# silently produce NaN whenever the frame's index is anything else.
for frame in (df_1, df_2_3):
    frame[sorted_cols] = pd.DataFrame(
        np.sort(frame[pair_cols].to_numpy(), axis=1),
        index=frame.index,
        columns=sorted_cols,
    )

# Merge to find mutual (unordered) pairs between score group 1 and group 2+3
mutual_pairs = pd.merge(
    df_1[sorted_cols],
    df_2_3[sorted_cols],
    on=sorted_cols
).drop_duplicates()

print(mutual_pairs)


Empty DataFrame
Columns: [gene_1_sorted, gene_2_sorted]
Index: []
