In [5]:
###### Calculate Structure based properties using AlphaFold Structures (requires PDB file)

import csv
from Bio.PDB import PDBParser, DSSP
from Bio.PDB.Polypeptide import is_aa

# load your structure
p = PDBParser(QUIET=True)
structure = p.get_structure("X", "your_structure.pdb")

model = structure[0]  # first model in PDB

# run DSSP
dssp = DSSP(model, "../medha/AF-P04637-F1-model_v4.pdb")

# helper for polar/non-polar classification
nonpolar = set(["A","V","I","L","P","F","W","M"])
polar    = set(["G","S","T","Y","C","N","Q","H","K","R","D","E"])

rows = []

for key in dssp.keys():
    res_idx = key[1][1]      # numeric position
    aa      = dssp[key][1]   # one-letter residue name

    if not is_aa(aa):
        continue

    asa         = dssp[key][3]
    sec_str     = dssp[key][2]
    phi         = dssp[key][4]
    psi         = dssp[key][5]

    # approximate ASA % 
    # (DSSP originally outputs relative ASA normalized by Gly-X-Gly max)
    # Biopython stores this in dssp[key][3]
    asa_per     = dssp[key][3] / dssp[key][2] if dssp[key][2] else 0

    # hydrogen bonds
    hb_donor     = len(dssp[key][7])
    hb_acceptor  = len(dssp[key][8])

    # side-chain vs main-chain ASA
    sc_abs = dssp[key][3] - dssp[key][9]  # total ASA âˆ’ main chain ASA
    mc_abs = dssp[key][9]
    sc_real = sc_abs
    mc_real = mc_abs

    # polar vs nonpolar contributions (simple heuristic)
    if aa in nonpolar:
        nonpolar_abs  = asa
        polar_abs     = 0
    else:
        polar_abs     = asa
        nonpolar_abs  = 0

    polar_real     = polar_abs
    nonpolar_real  = nonpolar_abs

    # for contact count (approx simple count of atoms within cutoff)
    # if you want real contact count you can add a nearest neighbor search
    contact_count  = 0

    rows.append({
        "res": aa,
        "pos": res_idx,
        "ASA": asa,
        "SEC_STR": sec_str,
        "ASA_PER": asa_per,
        "contact_count": contact_count,
        "hb_donor": hb_donor,
        "hb_acceptor": hb_acceptor,
        "all_abs": asa,
        "all_real": asa,
        "sc_abs": sc_abs,
        "sc_real": sc_real,
        "mc_abs": mc_abs,
        "mc_real": mc_real,
        "Non_polar_abs": nonpolar_abs,
        "Non_polar_real": nonpolar_real,
        "polar_abs": polar_abs,
        "polar_real": polar_real,
        "ASA_PER_AVG": asa_per
    })

# write as CSV
with open("../tmp.csv", "w", newline="") as outf:
    writer = csv.DictWriter(outf, fieldnames=list(rows[0].keys()))
    writer.writeheader()
    writer.writerows(rows)

print("CSV written to dssp_output.csv")


[1;33mJupyter detected[0m[1;33m...[0m
[1;32m2[0m[1;32m channel Terms of Service accepted[0m
Retrieving notices: done
Channels:
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/medha/miniconda3

  added / updated specs:
    - keras


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    absl-py-2.3.1              |  py313hca03da5_0         433 KB
    conda-26.1.0               |  py313hca03da5_0         1.2 MB
    h5py-3.15.1                |  py313h2b875b3_1         1.1 MB
    hdf5-1.14.5                |       hd77251f_2         5.6 MB
    keras-3.13.2               |  py313h31cbfd0_0         3.7 MB
    ml_dtypes-0.5.4            |  py313h3f644e9_0         226 KB
    mpi-1.0                    |            mpich           5 KB
    mpi4py-4.0.3               |  py313h63ebc28_1

In [2]:
import random
import pandas as pd
import os
import numpy as np
# from keras.models import load_model
# from tensorflow.keras.layers import Dense
import glob
import pandas as pd
import numpy as np
import re, shutil, os, random
import multiprocessing as mp
# from Bio.PDB.ResidueDepth import ResidueDepth
# from Bio.PDB import PDBList, PDBParser, NeighborSearch
# from Bio.PDB.PDBParser import PDBParser
import urllib
# import networkx as nx
# from tensorflow.keras.models import Sequential, model_from_json


uniprot_seq = pd.read_excel("../medha/whole_proteome_uniprot_id.xlsx",
                            engine="openpyxl")
uniprot_seq.rename({"From":"uniprot_ids"}, axis=1, inplace =True)
uniprot_seq['gene'] = uniprot_seq["Entry Name"].apply(lambda x: x.split('_')[0])

physicochem_properties_normalized = pd.read_csv('../medha/49_properties_normalizedValues.csv')
physicochem_properties_actual = pd.read_csv('../medha/49_properties_numerical_Values.csv')
new_463 = pd.read_csv('../medha/463_unique_numerical_properties.csv')

mutation_matrices = pd.read_csv('../medha/aaindex_square_diagonal_properties.csv',
                               keep_default_na=False)
                               
# overall_dataset = pd.read_csv('../medha/overall_dataset.csv')                      
mutation_matrices.set_index('wild_mut', inplace=True)

# list_proteins = [i.split('/')[-1].split('.')[0] for i in glob.glob('./data/structure-based-features/dssp/*')]
input1 = ['P04637', 'R175D', 'P04637', 'R175S']
# input1 = form.getfirst("input_seq","0").split()
dataset = pd.DataFrame()
for i in range(0, len(input1)-1,2):
    uniprot_id = input1[i]
    mutation = input1[i+1]
    pos = int(mutation[1:-1])
    sequence = next(iter(uniprot_seq.loc[uniprot_seq['uniprot_ids'] == uniprot_id, 'Sequence']))
    gene = next(iter(uniprot_seq.loc[uniprot_seq['uniprot_ids'] == uniprot_id, 'gene']))
    # if uniprot_id not in list_proteins:
    #     print("The given protein and mutation does not belong to Cancer Gene Census!!")
    # Calculate sequence based features ###########
    dict_to_store = {}
    dict_to_store['UniProt ID'] = uniprot_id
    dict_to_store['Mutation'] = mutation
    #dataset = dataset.append(dict_to_store, ignore_index= True)
    dataset = pd.concat([dataset, pd.DataFrame([dict_to_store])], ignore_index=True)
     
    # parser = PDBParser()
    aa_dict = {"ALA": "A","ARG": "R","ASN": "N","ASP": "D","CYS": "C","GLN": "Q","GLU": "E","GLY": "G","HIS": "H","ILE": "I",
               "LEU": "L","LYS": "K","MET": "M","PHE": "F","PRO": "P","SER": "S","THR": "T","TRP": "W","TYR": "Y","VAL": "V"}
    motif_list = ['nM','Mc', 'n_M', 'M_c', 'n__M','M__c', 'tri']
    # print(mutation_matrices.head())

    physicochem_properties_actual1 = pd.concat([physicochem_properties_normalized, new_463[physicochem_properties_actual.columns]]).reset_index(drop =True)
    aacon_header = ['mut_pos','KABAT', 'JORES', 'SCHNEIDER', 'SHENKIN', 'GERSTEIN',
                    'TAYLOR_GAPS', 'TAYLOR_NO_GAPS', 'VELIBIL', 'KARLIN', 'ARMON',
                    'THOMPSON', 'NOT_LANCET', 'MIRNY', 'WILLIAMSON', 'LANDGRAF',
                    'SANDER', 'VALDAR', 'SMERFS']
    list_prop = []
    all_prop = pd.DataFrame()
    for j in range(len(dataset)):
    # def calls(i, all_prop = all_prop):
        uniprot_id = dataset['UniProt ID'][j]
        mutation = dataset['Mutation'][j]
        wild = mutation[0]
        mut = mutation[-1]
        wild_mut = wild+mut
        pos = int(mutation[1:-1])
        sequence = next(iter(uniprot_seq.loc[uniprot_seq['uniprot_ids'] == uniprot_id, 'Sequence']))
        if pos-7 < 0:
            window_13 = sequence[0:pos+6]
        elif pos+6>len(sequence):
            window_13 = sequence[pos-7:]
        else:
            window_13 = sequence[pos-7:pos+6]\

        gene = next(iter(uniprot_seq.loc[uniprot_seq['uniprot_ids'] == uniprot_id, 'gene']))
        if len(sequence) >= pos and wild == sequence[pos-1]:
            f4 = open(f"../medha/{uniprot_id}.pssm", "r").readlines()[2:]
            f_aacon = pd.read_csv(f"../medha/{uniprot_id}.csv", skiprows=1, names = aacon_header)
            f_disorder = pd.read_csv(f"../medha/{uniprot_id}_IUPred.csv")
            f_residue_depth = pd.read_csv(f"../medha/{uniprot_id}_residue_depth.csv")
            f_plDDT = pd.read_csv(f"../medha/{uniprot_id}_plDDT.out")
            f_dssp = pd.read_csv(f"../medha/{uniprot_id}.out")
            df_network1 = pd.read_csv(f"../medha/{uniprot_id}_network.csv")
            contact_type = [uniprot_id+'_aroaro.csv', uniprot_id+'_arosul.csv', uniprot_id+'_cationpi.csv',
                    uniprot_id+'_disulphide.csv', uniprot_id+'_hbond_main_main.csv', uniprot_id+'_hbond_main_side.csv',
                    uniprot_id+'_hbond_side_side.csv', uniprot_id+'_hydrophobic.csv', uniprot_id+'_ionic.csv']

            dict_prop = dict()
            dict_prop['Wild'] = wild
            dict_prop['Mut'] = mutation[-1]
            dict_prop['Pos'] = pos
            # dict_prop['Class'] = dataset['Class'][i]
            dict_prop['UniProt ID'] = uniprot_id
            dict_prop['Gene Name'] = gene
            dict_prop['Mutation'] = mutation

            a = sum(physicochem_properties_normalized[window_13[k]] for k in range(len(window_13)))/sum(c.isalpha() for c in window_13)
            b = sum(physicochem_properties_actual1[window_13[k]] for k in range(len(window_13)))/sum(c.isalpha() for c in window_13)
            j = 0
            for property1 in physicochem_properties_normalized['index']:
                dict_prop[property1+'_normalize_site_value'] = physicochem_properties_normalized.loc[physicochem_properties_normalized['index'] == property1][wild].tolist()[0]
                dict_prop[property1+'_normalize'] = a[j]
                dict_prop[property1+'_normalize_diff'] = a[j]- physicochem_properties_normalized.loc[physicochem_properties_normalized['index'] == property1][mut].tolist()[0] + physicochem_properties_normalized.loc[physicochem_properties_normalized['index'] == property1][wild].tolist()[0]

                if property1 == "pK'":
                    dict_prop[property1+'_numeric_site_value'] = physicochem_properties_actual1.loc[physicochem_properties_actual1['index'] == "pK'"][wild].tolist()[0]
                    dict_prop[property1+'_numeric_values'] =  b[j]
                    dict_prop[property1+'_numeric_diff'] = b[j]-physicochem_properties_actual1.loc[physicochem_properties_actual1['index'] == "pK'"][mut].tolist()[0]-physicochem_properties_actual1.loc[physicochem_properties_actual1['index'] == "pK'"][wild].tolist()[0]
                else:
                    dict_prop[property1+'_numeric_site_value'] = physicochem_properties_actual1.loc[physicochem_properties_actual1['index'] == "pK'"][wild].tolist()[0]
                    dict_prop[property1+'_numeric_values'] =  b[j]
                    dict_prop[property1+'_numeric_diff'] = b[j]-physicochem_properties_actual1.loc[physicochem_properties_actual1['index'] == "pK'"][mut].tolist()[0]-physicochem_properties_actual1.loc[physicochem_properties_actual1['index'] == "pK'"][wild].tolist()[0]
        #         dict_prop[property1+'_diff'] = physicochem_properties.loc[physicochem_properties['index'] == property1][wild_mut[-1]].tolist()[0] - physicochem_properties.loc[physicochem_properties['index'] == property1][wild_mut[0]].tolist()[0]
                j += 1
            sul_c = len(re.findall('[CM]', window_13))
            pos_c = len(re.findall('[KRH]', window_13))
            aliphatic = len(re.findall('[GALIV]', window_13))
            arom = len(re.findall('[YFW]', window_13))
            neg_c = len(re.findall('[DE]', window_13))
            polar = len(re.findall('[NQSTP]', window_13))
            dict_prop['neg_charge'] = neg_c
            dict_prop['polar'] = polar
            dict_prop['aromatic'] = arom
            dict_prop['S_containing'] = sul_c
            dict_prop['aliphatic'] = aliphatic

            n_ter_wild = wild+sequence[pos-2]
            n_ter_mut = mut+sequence[pos-2]
            if pos >= len(sequence):
                c_ter_wild = wild + '-'
                c_ter_mut = mut + '-'
            else:
                c_ter_wild = wild + sequence[pos]
                c_ter_mut = mut+sequence[pos]
            property_n = mutation_matrices.loc[n_ter_mut]- mutation_matrices.loc[n_ter_wild]
            try:
                property_c = mutation_matrices.loc[c_ter_mut]- mutation_matrices.loc[c_ter_wild]
            except KeyError:
                property_c = mutation_matrices.loc['AG'] - mutation_matrices.loc['AG']
            #print(property_n, property_c, i)
            for mut_prop in property_n.index:
                dict_prop[mut_prop] = property_n[mut_prop]
                dict_prop[mut_prop.lower()] = property_c[mut_prop]
    #         data_with_seq1 = pd.concat([data_with_seq,df_mutation_matrices_c,df_mutation_matrices_n], axis=1)
    ########    Odds ratio based features  #######################################
            if pos>=len(sequence):
                n_ter = '-'
                dipep_n = n_ter+wild
                gap2_n = n_ter+wild
            else:
                n_ter = sequence[pos-2]
                dipep_n = n_ter+wild
                gap2_n = sequence[pos-4]+"**"+wild

            if pos>=len(sequence):
                n_ter_gap = '-'
                dipep_gap_n = n_ter_gap+"*"+wild
            else:
                n_ter_gap = sequence[pos-3]
                dipep_gap_n = n_ter_gap+"*"+wild

            if pos+1 >= len(sequence):
                    c_ter_gap = '-'
                    dipep_gap_c = wild + c_ter_gap
                    gap2_c = wild + c_ter_gap
            else:
                c_ter_gap = sequence[pos+1]
                dipep_gap_c = wild +"*"+ c_ter_gap
        #        print(uni_id,gap2_c)
                try:
                    gap2_c = wild +"**"+ sequence[pos+2]
                except IndexError:
                    gap2_c = wild +"**"+c_ter_gap
            if pos >= len(sequence):
                c_ter = '-'
                dipep_c = wild+c_ter
            else:
                c_ter = sequence[pos]
                dipep_c = wild+c_ter
            tripep = n_ter+ wild+c_ter
            dict_prop['nM'] = dipep_n
            dict_prop['Mc']= dipep_c
            dict_prop['tri'] = tripep
            dict_prop['n_M']=dipep_gap_n
            dict_prop['M_c'] = dipep_gap_c
            dict_prop['n__M'] = gap2_n
            dict_prop['M__c'] = gap2_c
    #        print(df_out['nM'],df_out['Mc'],df_out['tri'],df_out['n_M'],df_out['M_c'],df_out['n__M'],df_out['M__c'])
        # =============================================================================
        # odds ratio
            motif_list = ['nM','Mc', 'n_M', 'M_c', 'n__M','M__c', 'tri']
            for motif in motif_list:
                # print(motif)
                x =  pd.read_excel('../medha/BRCA_odds_ratio.xlsx',sheet_name= motif)
                # print(motif, x.head())
                entry = dict_prop[motif]
                required_odd_ratio = x[x[motif] == entry]
                conditions = [(required_odd_ratio['odd_ratio'] >= 1.2) | (required_odd_ratio['odd_ratio'] == 'inf'),
                (required_odd_ratio['odd_ratio'] < 1.2) & (required_odd_ratio['odd_ratio'] >= 0.9),
                (required_odd_ratio['odd_ratio'] < 0.9)]
                condition_values = [1,3,2]
                dict_prop[motif+'_coded_odds'] = next(iter(np.select(conditions, condition_values)))
                dict_prop[motif+'_odds_ratio'] = next(iter(required_odd_ratio['odd_ratio']))
            # try
            xx_network = df_network1[(df_network1['Wild'] == wild) & (df_network1['pos'] == pos)]
            # print(xx_network)
            if len(xx_network)> 0:
                dict_prop['Degree_centrality'] = next(iter(xx_network['Degree centrality']))
                dict_prop['Closeness_centrality'] = next(iter(xx_network['Closeness centrality']))
                dict_prop['betweenness_centrality'] = next(iter(xx_network['Betweenness centrality']))
                dict_prop['eigenvector_centrality'] = next(iter(xx_network['Eigenvector centrality']))
            else:
                dict_prop['Degree_centrality'] = 0
                dict_prop['Closeness_centrality'] = 0
                dict_prop['betweenness_centrality'] = 0
                dict_prop['eigenvector_centrality'] = 0
            # except
            dict1 = {'A':f4[pos].strip().split()[2:][0], 'R':f4[pos].strip().split()[2:][1],
                     'N':f4[pos].strip().split()[2:][2], 'D':f4[pos].strip().split()[2:][3],
                     'C':f4[pos].strip().split()[2:][4], 'Q':f4[pos].strip().split()[2:][5],
                     'E':f4[pos].strip().split()[2:][6], 'G':f4[pos].strip().split()[2:][7],
                     'H':f4[pos].strip().split()[2:][8], 'I':f4[pos].strip().split()[2:][9],
                     'L':f4[pos].strip().split()[2:][10], 'K':f4[pos].strip().split()[2:][11],
                     'M':f4[pos].strip().split()[2:][12], 'F':f4[pos].strip().split()[2:][13],
                     'P':f4[pos].strip().split()[2:][14], 'S':f4[pos].strip().split()[2:][15],
                     'T':f4[pos].strip().split()[2:][16], 'W':f4[pos].strip().split()[2:][17],
                     'Y':f4[pos].strip().split()[2:][18], 'V':f4[pos].strip().split()[2:][19]}
            dict_prop['pssm_score1'] =  int(dict1[wild])
            #             dict_prop['pssm_score1'] =  dict1[site[0]]
            dict_prop['pssm_score2'] = sum([int(i) for i in dict1.values()])/20
            dict_prop['pssm_score3'] = int(dict1[mutation[-1]])-int(dict1[wild])
            cons= f4[pos].strip('\n')[90:].split()[21:22]
            if len(cons) ==0:
                dict_prop['conservation']= 0
            else:
                for element in cons:
                    dict_prop['conservation']= float(element)
            mm = f_aacon.iloc[pos-1]
            for item in aacon_header:
                dict_prop[item] = mm[item]
            dict_prop['disorder'] = next(iter(f_disorder[f_disorder['Pos'] == pos]['IUPRED SCORE']))
            if pos > len(f_dssp):
                dict_prop['sec_strc'] = random.choice('BEGHIST')
                dict_prop['ASA'] = np.average(f_dssp['ASA'])
                dict_prop['plDDT'] = np.average(f_plDDT['Avg. B-factor'])
                dict_prop['f_residue_depth'] = np.average(f_residue_depth['depth'])
            else:
                dict_prop['sec_strc'] = f_dssp.iloc[pos-1]['SS']
                dict_prop['ASA'] = f_dssp.iloc[pos-1]['ASA']
                dict_prop['plDDT'] = next(iter(f_plDDT[f_plDDT['Residue'] == pos]['Avg. B-factor']))
                dict_prop['f_residue_depth'] = next(iter(f_residue_depth[f_residue_depth['pos'] == pos]['depth']))
            for f_type in contact_type:
                df_bonds = pd.read_csv('../medha/structure_specific_interactions/'+f_type)
                df_bonds['Res1'] = df_bonds['RES1 '].apply(lambda x: aa_dict[x.strip()])
                df_bonds['Res2'] = df_bonds[' RES2 '].apply(lambda x: aa_dict[x.strip()])
                df_bonds.rename({' idRES1 ':'pos1', ' idRES2 ':'pos2'}, axis =1, inplace =True)
                df_bonds['mut_pos'] = df_bonds['Res1']+df_bonds['pos1'].astype(str)
                dict_prop[f_type.split('_')[-1].split('.')[0]] = len(df_bonds[df_bonds['pos1']== pos])
            
            #all_prop = all_prop.append(dict_prop, ignore_index =True)
            all_prop = pd.concat(
            [all_prop, pd.DataFrame([dict_prop])],
            ignore_index=True
            )
            all_prop.replace({'sec_strc':{'B':1, 'E':2, 'H':3, 'S':4, 'T':5, 'I':6, '-':0}}, inplace=True)

all_prop.to_csv('../complete_dataset_with_features.csv', index = False)
           

  warn("Workbook contains no default style, apply openpyxl's default")
  all_prop.replace({'sec_strc':{'B':1, 'E':2, 'H':3, 'S':4, 'T':5, 'I':6, '-':0}}, inplace=True)
  all_prop.replace({'sec_strc':{'B':1, 'E':2, 'H':3, 'S':4, 'T':5, 'I':6, '-':0}}, inplace=True)
  all_prop.replace({'sec_strc':{'B':1, 'E':2, 'H':3, 'S':4, 'T':5, 'I':6, '-':0}}, inplace=True)


In [3]:
"""
Feature generation script (cleaned + labeled)

Inputs expected (per UniProt ID):
- ../medha/whole_proteome_uniprot_id.xlsx                (Sequence, Entry Name, From->uniprot_ids)
- ../medha/49_properties_normalizedValues.csv            (physicochemical properties; normalized)
- ../medha/49_properties_numerical_Values.csv            (physicochemical properties; numeric)
- ../medha/463_unique_numerical_properties.csv           (additional numeric properties)
- ../medha/aaindex_square_diagonal_properties.csv        (AAindex mutation matrices; wild_mut index)
- ../medha/{uniprot_id}.pssm                             (PSSM file)
- ../medha/{uniprot_id}.csv                              (AACon conservation features)
- ../medha/{uniprot_id}_IUPred.csv                        (disorder)
- ../medha/{uniprot_id}_residue_depth.csv                 (residue depth)
- ../medha/{uniprot_id}_plDDT.out                         (plDDT / B-factor per residue)
- ../medha/{uniprot_id}.out                               (DSSP-like output with SS, ASA columns)
- ../medha/{uniprot_id}_network.csv                       (network centrality features)
- ../medha/structure_specific_interactions/{uniprot_id}_*.csv (interaction/contact files)

Output:
- ../complete_dataset_with_features.csv
"""

import random
import re
import numpy as np
import pandas as pd


# -----------------------------
# CONFIG / INPUTS
# -----------------------------
INPUT_MUTATIONS = ['P04637', 'R175D', 'P04637', 'R175S']  # [uniprot, mut, uniprot, mut, ...]
BASE_DIR = "../medha"
OUT_CSV = "../complete_dataset_with_features.csv"

AA3_TO_AA1 = {
    "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
    "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
    "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
    "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V"
}

MOTIFS = ['nM', 'Mc', 'n_M', 'M_c', 'n__M', 'M__c', 'tri']

AACON_HEADER = [
    'mut_pos', 'KABAT', 'JORES', 'SCHNEIDER', 'SHENKIN', 'GERSTEIN',
    'TAYLOR_GAPS', 'TAYLOR_NO_GAPS', 'VELIBIL', 'KARLIN', 'ARMON',
    'THOMPSON', 'NOT_LANCET', 'MIRNY', 'WILLIAMSON', 'LANDGRAF',
    'SANDER', 'VALDAR', 'SMERFS'
]

CONTACT_FILES = [
    "_aroaro.csv", "_arosul.csv", "_cationpi.csv", "_disulphide.csv",
    "_hbond_main_main.csv", "_hbond_main_side.csv", "_hbond_side_side.csv",
    "_hydrophobic.csv", "_ionic.csv"
]


# -----------------------------
# LOAD REFERENCE TABLES
# -----------------------------
uniprot_seq = pd.read_excel(f"{BASE_DIR}/whole_proteome_uniprot_id.xlsx", engine="openpyxl")
uniprot_seq.rename({"From": "uniprot_ids"}, axis=1, inplace=True)
uniprot_seq["gene"] = uniprot_seq["Entry Name"].apply(lambda x: x.split("_")[0])

physchem_norm = pd.read_csv(f"{BASE_DIR}/49_properties_normalizedValues.csv")
physchem_num = pd.read_csv(f"{BASE_DIR}/49_properties_numerical_Values.csv")
extra_463 = pd.read_csv(f"{BASE_DIR}/463_unique_numerical_properties.csv")

# NOTE: This creates a numeric-property table with the same columns as physchem_num
physchem_num_all = pd.concat([physchem_norm, extra_463[physchem_num.columns]], ignore_index=True)

mutation_matrices = pd.read_csv(
    f"{BASE_DIR}/aaindex_square_diagonal_properties.csv",
    keep_default_na=False
)
mutation_matrices.set_index("wild_mut", inplace=True)


# -----------------------------
# HELPER FUNCTIONS
# -----------------------------
def get_sequence_and_gene(uniprot_id: str):
    seq = next(iter(uniprot_seq.loc[uniprot_seq["uniprot_ids"] == uniprot_id, "Sequence"]))
    gene = next(iter(uniprot_seq.loc[uniprot_seq["uniprot_ids"] == uniprot_id, "gene"]))
    return seq, gene


def window_13mer(sequence: str, pos_1based: int) -> str:
    """13-aa window centered on position; handles ends."""
    pos0 = pos_1based - 1
    left = max(0, pos0 - 6)
    right = min(len(sequence), pos0 + 7)
    return sequence[left:right]


def safe_get(series, default=0):
    try:
        return next(iter(series))
    except StopIteration:
        return default


def compute_dipeptide_motifs(sequence: str, pos_1based: int, wild: str):
    """Motifs used for odds-ratio lookup."""
    pos0 = pos_1based - 1

    # N-terminal neighbors
    n1 = sequence[pos0 - 1] if pos0 - 1 >= 0 else '-'
    n2 = sequence[pos0 - 2] if pos0 - 2 >= 0 else '-'
    n4 = sequence[pos0 - 4] if pos0 - 4 >= 0 else '-'

    # C-terminal neighbors
    c1 = sequence[pos0 + 1] if pos0 + 1 < len(sequence) else '-'
    c2 = sequence[pos0 + 2] if pos0 + 2 < len(sequence) else '-'
    c0 = sequence[pos0 + 0] if pos0 < len(sequence) else '-'

    motifs = {}
    motifs["nM"] = n1 + wild
    motifs["Mc"] = wild + c0
    motifs["tri"] = n1 + wild + c0
    motifs["n_M"] = n2 + "*" + wild
    motifs["M_c"] = wild + "*" + c1
    motifs["n__M"] = (n4 + "**" + wild) if n4 != '-' else ('-' + wild)
    motifs["M__c"] = wild + "**" + (c2 if c2 != '-' else c1)

    return motifs


def add_physchem_features(dict_prop, wild, mut, win13):
    """
    Feature group: Physicochemical properties
    - site values (wild)
    - window mean (13-mer)
    - diff-like features
    """
    # window means across residues (ignoring non-letters)
    denom = sum(c.isalpha() for c in win13)
    a = sum(physchem_norm[win13[k]] for k in range(len(win13))) / denom
    b = sum(physchem_num_all[win13[k]] for k in range(len(win13))) / denom

    j = 0
    for prop_name in physchem_norm["index"]:
        # normalized
        wild_norm = physchem_norm.loc[physchem_norm["index"] == prop_name, wild].tolist()[0]
        mut_norm = physchem_norm.loc[physchem_norm["index"] == prop_name, mut].tolist()[0]
        dict_prop[f"{prop_name}_normalize_site_value"] = wild_norm
        dict_prop[f"{prop_name}_normalize"] = a[j]
        dict_prop[f"{prop_name}_normalize_diff"] = a[j] - mut_norm + wild_norm

        # numeric (FIX: original code accidentally used "pK'" row for everything)
        wild_num = physchem_num_all.loc[physchem_num_all["index"] == prop_name, wild].tolist()[0]
        mut_num = physchem_num_all.loc[physchem_num_all["index"] == prop_name, mut].tolist()[0]
        dict_prop[f"{prop_name}_numeric_site_value"] = wild_num
        dict_prop[f"{prop_name}_numeric_values"] = b[j]
        dict_prop[f"{prop_name}_numeric_diff"] = b[j] - mut_num + wild_num

        j += 1

    # Feature group: amino-acid class composition in 13-mer window
    dict_prop["neg_charge"] = len(re.findall("[DE]", win13))
    dict_prop["polar"] = len(re.findall("[NQSTP]", win13))
    dict_prop["aromatic"] = len(re.findall("[YFW]", win13))
    dict_prop["S_containing"] = len(re.findall("[CM]", win13))
    dict_prop["aliphatic"] = len(re.findall("[GALIV]", win13))


def add_mutation_matrix_features(dict_prop, sequence, pos, wild, mut):
    """
    Feature group: AAIndex mutation matrices
    - N-terminal context (mut_prop)
    - C-terminal context (lowercase mut_prop)
    """
    pos0 = pos - 1

    # N-terminal dipeptide: (wild + previous residue)
    prev_res = sequence[pos0 - 1] if pos0 - 1 >= 0 else "-"
    n_wild = wild + prev_res
    n_mut = mut + prev_res

    # C-terminal dipeptide: (wild + next residue)
    next_res = sequence[pos0 + 1] if pos0 + 1 < len(sequence) else "-"
    c_wild = wild + next_res
    c_mut = mut + next_res

    property_n = mutation_matrices.loc[n_mut] - mutation_matrices.loc[n_wild]
    try:
        property_c = mutation_matrices.loc[c_mut] - mutation_matrices.loc[c_wild]
    except KeyError:
        property_c = mutation_matrices.loc["AG"] - mutation_matrices.loc["AG"]

    for mut_prop in property_n.index:
        dict_prop[mut_prop] = property_n[mut_prop]
        dict_prop[mut_prop.lower()] = property_c[mut_prop]


def add_odds_ratio_features(dict_prop, motifs):
    """
    Feature group: Motif odds-ratio (from Excel sheets per motif)
    Creates:
    - {motif}_coded_odds
    - {motif}_odds_ratio
    """
    for motif in MOTIFS:
        x = pd.read_excel(f"{BASE_DIR}/BRCA_odds_ratio.xlsx", sheet_name=motif)
        entry = motifs[motif]
        row = x[x[motif] == entry]

        odd = safe_get(row["odd_ratio"], default=0)

        # Coded odds bins
        conditions = [
            (row["odd_ratio"] >= 1.2) | (row["odd_ratio"] == "inf"),
            (row["odd_ratio"] < 1.2) & (row["odd_ratio"] >= 0.9),
            (row["odd_ratio"] < 0.9),
        ]
        code_vals = [1, 3, 2]
        dict_prop[f"{motif}_coded_odds"] = safe_get(np.select(conditions, code_vals), default=0)
        dict_prop[f"{motif}_odds_ratio"] = odd


def add_network_features(dict_prop, df_network, wild, pos):
    """
    Feature group: Structural network centrality
    """
    xx = df_network[(df_network["Wild"] == wild) & (df_network["pos"] == pos)]
    if len(xx) > 0:
        dict_prop["Degree_centrality"] = safe_get(xx["Degree centrality"], 0)
        dict_prop["Closeness_centrality"] = safe_get(xx["Closeness centrality"], 0)
        dict_prop["betweenness_centrality"] = safe_get(xx["Betweenness centrality"], 0)
        dict_prop["eigenvector_centrality"] = safe_get(xx["Eigenvector centrality"], 0)
    else:
        dict_prop["Degree_centrality"] = 0
        dict_prop["Closeness_centrality"] = 0
        dict_prop["betweenness_centrality"] = 0
        dict_prop["eigenvector_centrality"] = 0


def add_pssm_features(dict_prop, f4_lines, pos, wild, mut):
    """
    Feature group: PSSM / conservation
    """
    # PSSM line indexing: original code uses f4[pos] (1-basedish). Keep as-is to match their files.
    cols = f4_lines[pos].strip().split()

    aa_scores = cols[2:22]  # 20 AAs
    aa_order = list("ARNDCQEGHILKMFPSTWYV")

    score_map = dict(zip(aa_order, aa_scores))
    score_map = {k: int(v) for k, v in score_map.items()}

    dict_prop["pssm_score1"] = score_map.get(wild, 0)
    dict_prop["pssm_score2"] = sum(score_map.values()) / 20.0
    dict_prop["pssm_score3"] = score_map.get(mut, 0) - score_map.get(wild, 0)

    # Conservation: original code slices at char 90; keep behavior
    cons = f4_lines[pos].strip("\n")[90:].split()[21:22]
    dict_prop["conservation"] = float(cons[0]) if len(cons) else 0.0


def add_aacon_features(dict_prop, f_aacon, pos):
    """
    Feature group: AACon conservation features (per residue)
    """
    mm = f_aacon.iloc[pos - 1]
    for item in AACON_HEADER:
        dict_prop[item] = mm[item]


def add_disorder_depth_plddt_dssp(dict_prop, f_disorder, f_depth, f_plddt, f_dssp, pos):
    """
    Feature group: Disorder + structure features
    - disorder (IUPred)
    - DSSP secondary structure + ASA
    - plDDT
    - residue depth
    """
    dict_prop["disorder"] = safe_get(
        f_disorder.loc[f_disorder["Pos"] == pos, "IUPRED SCORE"],
        default=0
    )

    # DSSP and other structure files: if residue missing, use averages/random SS
    if pos > len(f_dssp):
        dict_prop["sec_strc"] = random.choice(list("BEGHIST-"))
        dict_prop["ASA"] = float(np.average(f_dssp["ASA"])) if len(f_dssp) else 0
        dict_prop["plDDT"] = float(np.average(f_plddt["Avg. B-factor"])) if len(f_plddt) else 0
        dict_prop["f_residue_depth"] = float(np.average(f_depth["depth"])) if len(f_depth) else 0
    else:
        dict_prop["sec_strc"] = f_dssp.iloc[pos - 1]["SS"]
        dict_prop["ASA"] = f_dssp.iloc[pos - 1]["ASA"]
        dict_prop["plDDT"] = safe_get(
            f_plddt.loc[f_plddt["Residue"] == pos, "Avg. B-factor"],
            default=0
        )
        dict_prop["f_residue_depth"] = safe_get(
            f_depth.loc[f_depth["pos"] == pos, "depth"],
            default=0
        )


def add_structure_interaction_counts(dict_prop, uniprot_id, pos):
    """
    Feature group: Structure-specific interaction/contact counts
    Each file contributes one count feature keyed by interaction type.
    """
    for suffix in CONTACT_FILES:
        f_type = f"{uniprot_id}{suffix}"
        df_bonds = pd.read_csv(f"{BASE_DIR}/structure_specific_interactions/{f_type}")

        # Normalize residue labels and positions
        df_bonds["Res1"] = df_bonds["RES1 "].apply(lambda x: AA3_TO_AA1[x.strip()])
        df_bonds["Res2"] = df_bonds[" RES2 "].apply(lambda x: AA3_TO_AA1[x.strip()])
        df_bonds.rename({" idRES1 ": "pos1", " idRES2 ": "pos2"}, axis=1, inplace=True)

        interaction_name = f_type.split("_")[-1].split(".")[0]
        dict_prop[interaction_name] = int((df_bonds["pos1"] == pos).sum())


# -----------------------------
# MAIN: BUILD FEATURE TABLE
# -----------------------------
all_rows = []

for i in range(0, len(INPUT_MUTATIONS) - 1, 2):
    uniprot_id = INPUT_MUTATIONS[i]
    mutation = INPUT_MUTATIONS[i + 1]

    wild = mutation[0]
    mut = mutation[-1]
    pos = int(mutation[1:-1])  # 1-based residue index

    sequence, gene = get_sequence_and_gene(uniprot_id)

    # Basic validation: position exists and wild matches sequence at that position
    if not (1 <= pos <= len(sequence)) or sequence[pos - 1] != wild:
        # Skip invalid entries (instead of silently doing nothing)
        continue

    win13 = window_13mer(sequence, pos)
    motifs = compute_dipeptide_motifs(sequence, pos, wild)

    # Load per-protein auxiliary files
    f4_lines = open(f"{BASE_DIR}/{uniprot_id}.pssm", "r").readlines()[2:]
    f_aacon = pd.read_csv(f"{BASE_DIR}/{uniprot_id}.csv", skiprows=1, names=AACON_HEADER)
    f_disorder = pd.read_csv(f"{BASE_DIR}/{uniprot_id}_IUPred.csv")
    f_depth = pd.read_csv(f"{BASE_DIR}/{uniprot_id}_residue_depth.csv")
    f_plddt = pd.read_csv(f"{BASE_DIR}/{uniprot_id}_plDDT.out")
    f_dssp = pd.read_csv(f"{BASE_DIR}/{uniprot_id}.out")
    df_network = pd.read_csv(f"{BASE_DIR}/{uniprot_id}_network.csv")

    # -----------------------------
    # Feature dictionary (one row)
    # -----------------------------
    dict_prop = {
        # Identifiers / metadata
        "UniProt ID": uniprot_id,
        "Gene Name": gene,
        "Mutation": mutation,
        "Wild": wild,
        "Mut": mut,
        "Pos": pos,
    }

    # Feature group: physicochemical properties (site + window + diffs + AA-class counts)
    add_physchem_features(dict_prop, wild, mut, win13)

    # Feature group: AAIndex mutation matrix deltas (N-term + C-term contexts)
    add_mutation_matrix_features(dict_prop, sequence, pos, wild, mut)

    # Feature group: local sequence motifs (for odds ratio)
    for k, v in motifs.items():
        dict_prop[k] = v

    # Feature group: odds ratio features (coded + raw)
    add_odds_ratio_features(dict_prop, motifs)

    # Feature group: network centrality
    add_network_features(dict_prop, df_network, wild, pos)

    # Feature group: PSSM features
    add_pssm_features(dict_prop, f4_lines, pos, wild, mut)

    # Feature group: AACon conservation features
    add_aacon_features(dict_prop, f_aacon, pos)

    # Feature group: disorder + DSSP ASA/SS + plDDT + residue depth
    add_disorder_depth_plddt_dssp(dict_prop, f_disorder, f_depth, f_plddt, f_dssp, pos)

    # Feature group: structure-specific interaction counts
    add_structure_interaction_counts(dict_prop, uniprot_id, pos)

    # Encode secondary structure to numeric (as in original)
    sec_map = {'B': 1, 'E': 2, 'H': 3, 'S': 4, 'T': 5, 'I': 6, '-': 0}
    dict_prop["sec_strc"] = sec_map.get(dict_prop.get("sec_strc", "-"), 0)

    all_rows.append(dict_prop)


# -----------------------------
# WRITE OUTPUT
# -----------------------------
all_prop = pd.DataFrame(all_rows)
all_prop.to_csv(OUT_CSV, index=False)
print(f"Wrote: {OUT_CSV}")


  warn("Workbook contains no default style, apply openpyxl's default")


Wrote: ../complete_dataset_with_features.csv
