In [3]:
import pandas as pd
from itertools import combinations

# Supplementary table S5 ("Pralogs Data" sheet exported as CSV).
# NOTE(review): absolute path on one machine - point this at your own copy of the file.
PARALOGS_TABLE_CSV = 'C:/Users/maliz/thesa/UKbiobank/data/iyac134_supplemental_table_s5.xlsx - Pralogs Data.csv'
# A paralog is kept only when its correlation with the main gene is strictly above this value.
MIN_CORRELATION = 0.1
# Column layout of the resulting pair table.
PAIR_COLUMNS = ['symbol_1', 'symbol_2', 'Paralogs names', 'Correlation']


def _parse_correlation(raw):
    """Turn one token of the comma-separated correlation list into a float.

    Returns None for an empty token or for 'NA' (any letter case).
    """
    token = raw.strip()
    if token == '' or token.upper() == 'NA':
        return None
    return float(token)


df_data = pd.read_csv(PARALOGS_TABLE_CSV)
df_data = df_data[df_data["Suggested mechanism"] == 'redundancy']

# A row missing the gene, the paralog list or the correlation list cannot yield a pair;
# without this guard a NaN cell would crash on `.strip()` / `.split()` below.
pair_source = df_data.dropna(
    subset=['Tissue-specific gene', 'Paralogs names', 'Paralogs correlations']
)

output_rows = []
for _, row in pair_source.iterrows():
    main_gene = str(row['Tissue-specific gene']).strip()
    original_paralogs = row['Paralogs names']
    paralogs = [p.strip() for p in str(original_paralogs).split(',')]
    correlations = [
        _parse_correlation(c) for c in str(row['Paralogs correlations']).split(',')
    ]

    # Names and correlations are matched by position; zip stops at the shorter list.
    for p, corr in zip(paralogs, correlations):
        if corr is not None and corr > MIN_CORRELATION:
            # Sort the two symbols so (A, B) and (B, A) land in the same columns.
            pair = tuple(sorted([main_gene, p]))
            output_rows.append({
                'symbol_1': pair[0],
                'symbol_2': pair[1],
                'Paralogs names': original_paralogs,
                'Correlation': corr
            })

# Remove duplicates. Passing `columns` keeps the frame well-formed (and
# drop_duplicates from raising KeyError) when no pair passes the filter.
unique_pairs_df = pd.DataFrame(output_rows, columns=PAIR_COLUMNS).drop_duplicates(
    subset=['symbol_1', 'symbol_2', 'Paralogs names']
)

# This cell does not write any file, so report what was built instead of pointing at a CSV.
print(f'Built {len(unique_pairs_df)} unique gene-paralog pairs')

# Standard chromosome names (1-22, X, Y); used further down to filter the BioMart annotation.
valid_chromosomes = [str(i) for i in range(1, 23)] + ['X', 'Y']
df = unique_pairs_df.copy()


Done! Check gene_paralog_pairs_with_original.csv


In [2]:
import pandas as pd
from biomart import BiomartServer
import io

# Connect to the Ensembl BioMart human gene dataset
server = BiomartServer("http://www.ensembl.org/biomart")
dataset = server.datasets['hsapiens_gene_ensembl']
df = unique_pairs_df.copy()
# Every distinct symbol that appears on either side of a pair
genes = list(pd.unique(df[['symbol_1', 'symbol_2']].values.ravel()))

# Ask BioMart for the Ensembl gene ID and chromosome of each symbol
response = dataset.search({
    'filters': {
        'hgnc_symbol': genes
    },
    'attributes': [
        'ensembl_gene_id',
        'hgnc_symbol',
        'chromosome_name',
    ]
})

# `response` is a `requests.Response`; its text is parsed below as header-less TSV
response_text = response.text

# Parse with pandas. Everything is read as `str`: with dtype inference a `chr`
# column holding only numeric chromosome names would become integers and the
# string-based `isin(valid_chromosomes)` filter below would silently match nothing.
annotation_df = pd.read_csv(
    io.StringIO(response_text),
    sep='\t',
    header=None,
    names=['gene', 'symbol', 'chr'],
    dtype=str,
)

# Keep only rows located on a standard chromosome (drops any other chromosome name)
annotation_df = annotation_df[annotation_df['chr'].isin(valid_chromosomes)]


In [3]:

# Hand-curated annotation: symbol -> (Ensembl gene ID, chromosome).
# Only symbols absent from `annotation_df` are taken from this table.
# NOTE(review): verify the ADSS entry against Ensembl - the gene is commonly
# placed on chromosome 1; TODO confirm both the ID and the chromosome.
symbol_to_gene_chr = {
    'ADSS': ('ENSG00000267312', '17'),
    'ADSSL1': ('ENSG00000185100', '14'),
    'GCNT6': ('ENSG00000205318', '6'),
    'XYLT1': ('ENSG00000103489', '16'),
    'GCNT7': ('ENSG00000124091', '20'),
    'XYLT2': ('ENSG00000015532', '17'),
    'CXorf23': ('ENSG00000173681', 'X'),
    'THRAP3': ('ENSG00000054118', '1'),
    'MYCL1': ('ENSG00000116990', '1'),
    'MYCN': ('ENSG00000134323', '2'),
    'H1FX': ('ENSG00000184897', '3'),
    'HIST1H1A': ('ENSG00000124610', '6'),
    'HIST1H1B': ('ENSG00000184357', '6'),
    'HIST1H1C': ('ENSG00000187837', '6'),
    'HIST1H1D': ('ENSG00000124575', '6'),
    'H1F0': ('ENSG00000189060', '22'),
    'H2AFV': ('ENSG00000105968', '7'),
    'H2AFZ': ('ENSG00000164032', '4'),
    'HIST1H1E': ('ENSG00000168298', '6'),
    'HIST1H1T': ('ENSG00000187475', '6'),
    'EPT1': ('ENSG00000138018', '2'),
}

# Symbols the annotation table already covers
existing_symbols = set(annotation_df['symbol'])

# One record per curated symbol that the annotation table lacks
new_rows = [
    {'gene': gene_id, 'symbol': symbol, 'chr': chrom}
    for symbol, (gene_id, chrom) in symbol_to_gene_chr.items()
    if symbol not in existing_symbols
]

# Append the curated records to the annotation table
new_df = pd.DataFrame(new_rows)
annotation_df = pd.concat([annotation_df, new_df], ignore_index=True)


In [4]:
# Number of annotation rows per symbol
symbol_counts = annotation_df['symbol'].value_counts()

# Symbols with more than one row; any such symbol would multiply rows
# when the annotation is merged onto the pair table by symbol.
symbols_more_than_once = symbol_counts.loc[symbol_counts.gt(1)]
num_symbols_more_than_once = symbols_more_than_once.shape[0]

print("Number of unique symbols appearing more than once:", num_symbols_more_than_once)


Number of unique symbols appearing more than once: 0


In [5]:
# Show the symbols that occur more than once in the annotation table (an empty Series when there are none).
symbols_more_than_once

Series([], Name: count, dtype: int64)

In [6]:
# Attach the Ensembl gene ID and chromosome to each member of the pair.
# The same annotation table is merged twice, once per side, with its columns
# renamed to symbol_<side> / gene_<side> / chr_<side>.
for side in ('1', '2'):
    side_annotation = annotation_df.rename(columns={
        'symbol': f'symbol_{side}',
        'gene': f'gene_{side}',
        'chr': f'chr_{side}'
    })
    df = df.merge(
        side_annotation,
        on=f'symbol_{side}',
        # Left join: a pair whose symbol has no annotation is kept, with NaN in gene_/chr_.
        how='left',
        # Fail loudly if a symbol has several annotation rows instead of
        # silently duplicating pair rows.
        validate='many_to_one'
    )

df

Unnamed: 0,symbol_1,symbol_2,Paralogs names,Correlation,gene_1,chr_1,gene_2,chr_2
0,CCND2,CCND3,"CCND2, CCND1",0.092884,ENSG00000118971,12,ENSG00000112576,6
1,CCND1,CCND3,"CCND2, CCND1",0.266517,ENSG00000110092,11,ENSG00000112576,6
2,ATP1B1,ATP1B3,"ATP1B1, ATP1B4, ATP1B2, ATP4B",0.403591,ENSG00000143153,1,ENSG00000069849,3
3,ATP1B3,ATP1B4,"ATP1B1, ATP1B4, ATP1B2, ATP4B",-0.074083,ENSG00000069849,3,ENSG00000101892,X
4,ATP1B2,ATP1B3,"ATP1B1, ATP1B4, ATP1B2, ATP4B",0.033656,ENSG00000129244,17,ENSG00000069849,3
...,...,...,...,...,...,...,...,...
422,GCNT4,XYLT2,"GCNT1, GCNT2, GCNT4, GCNT6, GCNT7, GCNT3, XYLT1",-0.089296,ENSG00000176928,5,ENSG00000015532,17
423,GCNT6,XYLT2,"GCNT1, GCNT2, GCNT4, GCNT6, GCNT7, GCNT3, XYLT1",,ENSG00000205318,6,ENSG00000015532,17
424,GCNT7,XYLT2,"GCNT1, GCNT2, GCNT4, GCNT6, GCNT7, GCNT3, XYLT1",0.048416,ENSG00000124091,20,ENSG00000015532,17
425,GCNT3,XYLT2,"GCNT1, GCNT2, GCNT4, GCNT6, GCNT7, GCNT3, XYLT1",-0.066620,ENSG00000140297,15,ENSG00000015532,17


In [7]:
# Discard every pair in which either gene lies on chromosome X or Y.
sex_chromosomes = ['X', 'Y']
on_sex_chromosome = df.chr_1.isin(sex_chromosomes) | df.chr_2.isin(sex_chromosomes)
df = df[~on_sex_chromosome]

In [8]:
# Keep only the pairs whose two genes are on different chromosomes.
df = df[df.chr_1 != df.chr_2]

In [9]:
# (rows, columns) of the pair table after the sex-chromosome and same-chromosome filters above.
df.shape

(363, 8)

In [10]:
# Order-independent key for each pair: the two Ensembl IDs sorted into a tuple,
# so (A, B) and (B, A) compare equal.
# `df` is a filtered slice at this point; `assign` builds a new frame instead of
# writing a column into that slice (which raises SettingWithCopyWarning), and the
# zip-based comprehension also copes with an empty frame.
df = df.assign(
    pair=[tuple(sorted(ids)) for ids in zip(df['gene_1'], df['gene_2'])]
)

# keep=False flags every row of a repeated pair, not only the later occurrences
duplicates = df[df.duplicated('pair', keep=False)]

print(duplicates)

    symbol_1 symbol_2                                     Paralogs names  \
0      CCND2    CCND3                                       CCND2, CCND1   
1      CCND1    CCND3                                       CCND2, CCND1   
10      CDK4     CDK6                                               CDK4   
32       MYC     MYCN                                        MYCN, MYCL1   
44      TUBB   TUBB4B  TUBB4B, TUBB8, TUBB4A, TUBB2B, TUBB3, TUBB2A, ...   
77    BCL2L1     MCL1       BCL2, BOK, BCL2L1, BAX, BCL2A1, BAK1, BCL2L2   
98      TUBB   TUBB4B  TUBB, TUBB8, TUBB4A, TUBB2B, TUBB3, TUBB2A, TU...   
118   BCL2L1     MCL1         BCL2, MCL1, BOK, BAX, BCL2A1, BAK1, BCL2L2   
150    CCND1    CCND3                                       CCND3, CCND2   
151    CCND1    CCND2                                       CCND3, CCND2   
187     CDK4     CDK6                                               CDK6   
212    CCND2    CCND3                                       CCND3, CCND1   
213    CCND1

In [11]:
# Keep the first row seen for each gene pair, then discard the helper key column.
df = df.drop_duplicates(subset=['pair'])
df = df.drop(columns=['pair'])


In [None]:
# Write the final pair table to disk, leaving out the row index.
df.to_csv("paralogs.csv", index=False)