# **0.1 Preprocesamiento de los datos**

In [31]:
# Importamos librería 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os

In [32]:
# Cargar datos
data_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\raw"

ppi = pd.read_csv(os.path.join(data_dir, "1_PPI-network_Alzheimer_Disease_no-opentarget-filter.csv"))
targets = pd.read_excel(os.path.join(data_dir, "3_Targets-score_Alzheimer_Disease_no-opentarget-filter.xlsx"))
topo = pd.read_csv(os.path.join(data_dir, "topology_158_up_down_normalized_splitPCA.csv"))
enrichment = pd.read_excel(os.path.join(data_dir, "enrichment2.xlsx"), sheet_name=None)  # Todas las hojas

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


## Datos de PPI

In [56]:
# obtenemos la infromación general del dataset de ppi
ppi.info()
ppi.describe() # estadísticas descriptivas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426139 entries, 0 to 426138
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Prot_A             426139 non-null  object 
 1   Prot_B             426139 non-null  object 
 2   Interaction_score  426139 non-null  float64
 3   Disease            426139 non-null  object 
dtypes: float64(1), object(3)
memory usage: 13.0+ MB


Unnamed: 0,Interaction_score
count,426139.0
mean,0.621208
std,0.180884
min,0.4
25%,0.473
50%,0.574
75%,0.728
max,0.999


In [67]:
# Conteo
print("Nº total de interacciones:", len(ppi))
print("Nº proteínas únicas:", len(pd.unique(ppi[['Prot_A', 'Prot_B']].values.ravel())))
print("Nº duplicados:", ppi.duplicated().sum())
print("Nº self-loops:", (ppi['Prot_A'] == ppi['Prot_B']).sum())



Nº total de interacciones: 308736
Nº proteínas únicas: 5391
Nº duplicados: 0
Nº self-loops: 0


In [59]:
# valores faltante por columnas
ppi.isnull().sum()

Prot_A               0
Prot_B               0
Interaction_score    0
Disease              0
dtype: int64

In [66]:
print(len(ppi))

308736


No hay valores faltantes, pero si es necesario eliminar la columna Disease ya que sabemos que solo se trabajarán con datos de Alzheimer.

Interaction_score va de 0 a 1, y queremos ver o analizar las interacciones menos confiables, entonces se filtraran por <0.7 que corresponden a las interacciones de baja a mediana confianza, excluyendo las de alta o muy alta confianza (>= 0.7).

In [None]:
processed_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned"

# Quedarse con interacciones con score < 0.7 
ppi_filtered= ppi[ppi['Interaction_score'] < 0.7].copy()

# Eliminar columna 'Disease' del dataset de interacciones
ppi_clean = ppi_filtered.drop(columns=["Disease"])

# Guardar la versión limpia
ppi_clean.to_csv(os.path.join(processed_dir, "ppi_clean.csv"), index=False)

# Verificación
print("Archivo guardado en:", os.path.join(processed_dir, "ppi_clean.csv"))
print("Total de interacciones después de limpieza:", len(ppi_clean))

Archivo guardado en: C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned\ppi_clean.csv
Total de interacciones después de limpieza: 308736


Finalmente me quede con 308736 de 426139

## Datos de target

In [50]:
# obtenemos la infromación general del dataset de targets
targets.info()
targets.describe() # estadísticas descriptivas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5411 entries, 0 to 5410
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Target_name                    5411 non-null   object 
 1   Complex_participants           1494 non-null   object 
 2   Node_id                        5411 non-null   object 
 3   Uniprot_id                     5411 non-null   object 
 4   Target_type                    5411 non-null   object 
 5   Target_group                   5411 non-null   object 
 6   Source_db                      5411 non-null   object 
 7   Target_group_score             5411 non-null   float64
 8   Target_group_score_normalized  5411 non-null   float64
 9   Conectivity_Score              3842 non-null   float64
 10  Disease                        5411 non-null   object 
dtypes: float64(3), object(8)
memory usage: 465.1+ KB


Unnamed: 0,Target_group_score,Target_group_score_normalized,Conectivity_Score
count,5411.0,5411.0,3842.0
mean,1.630882,0.186112,0.730901
std,2.57031,0.14458,0.207526
min,0.1,0.1,0.34932
25%,0.1,0.1,0.605366
50%,0.1,0.1,0.715035
75%,5.0,0.375625,0.828582
max,16.1,1.0,2.627325


In [51]:
# valores faltante por columnas
targets.isnull().sum()

Target_name                         0
Complex_participants             3917
Node_id                             0
Uniprot_id                          0
Target_type                         0
Target_group                        0
Source_db                           0
Target_group_score                  0
Target_group_score_normalized       0
Conectivity_Score                1569
Disease                             0
dtype: int64

Complex_participants me da 3917 valores nulos, ya que no todos los blancos son complejos proteicos.
Conectivity_Score tiene 1569 valores nulos, se considerarán como una conectividad mínima o no disponible.
Se rellenarán con 0 ambos datos nulos.


In [52]:
# Conteo de targets en el DataFrame de targets
num_targets = targets['Node_id'].nunique()
print(f"Número de targets en el DataFrame de targets: {num_targets}")

Número de targets en el DataFrame de targets: 5411


In [53]:
# Obtenemos la cantidad de valores por clase de la variable "Target_group"

targets.Target_group.value_counts() 

Target_group
T4                3842
T2                1090
T2, T3             198
T2, T4             198
T2, T3, T4          29
T1, T2, T3          22
T3                  15
T1, T2               9
T1, T2, T3, T4       7
T3, T4               1
Name: count, dtype: int64

In [54]:
# Obtenemos la cantidad de valores por clase sin combinaciones de la variable "Target_group"
from collections import Counter

# Contar ocurrencias individuales en combinaciones
conteo = Counter()
targets["Target_group"].dropna().astype(str).str.split(",").apply(lambda grupos: conteo.update([g.strip() for g in grupos]))

for t in ["T1", "T2", "T3", "T4"]:
    print(f"{t}: {conteo[t]}")


T1: 38
T2: 1553
T3: 272
T4: 4077


El total corresponde a 5411 datos, donde el 61.7% equivale al T4, 23.5% a T3, 4.1% a T3 y 0.8% a T1. Lo que refleja que existe un desvalance de clases. Este sesgo puede afectar el rendimiento de los modelos supervisados.

In [55]:
processed_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned"

# Rellenar valores nulos en el DataFrame original
targets["Complex_participants"] = targets["Complex_participants"].fillna("None")
targets["Conectivity_Score"] = targets["Conectivity_Score"].fillna(0)

# Eliminar columnas innecesarias
target_clean = targets.drop(columns=[
    "Target_group_score",        # ya tienes la versión normalizada
    "Disease",                   # no se usará
    "Source_db",                 # metadato no funcional
    "Uniprot_id"                 # info redundante, no como feature
])

# Verificar columnas finales
print("Columnas finales:", target_clean.columns.tolist())

# Guardar la versión limpia
target_clean.to_csv(os.path.join(processed_dir, "target_clean.csv"), index=False)

# Verificación
print("Archivo guardado en:", os.path.join(processed_dir, "target_clean.csv"))


Columnas finales: ['Target_name', 'Complex_participants', 'Node_id', 'Target_type', 'Target_group', 'Target_group_score_normalized', 'Conectivity_Score']
Archivo guardado en: C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned\target_clean.csv


## Datos de topología

In [7]:
# obtenemos la infromación general del dataset de topologia
topo.info()
topo.describe() # estadísticas descriptivas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Node_id            158 non-null    object 
 1   prediction         158 non-null    object 
 2   DC                 158 non-null    float64
 3   BC                 158 non-null    float64
 4   CC                 158 non-null    float64
 5   EC                 158 non-null    float64
 6   CEN                158 non-null    float64
 7   ECC                158 non-null    float64
 8   RAD                158 non-null    float64
 9   Conectivity_Score  158 non-null    float64
dtypes: float64(8), object(2)
memory usage: 12.5+ KB


Unnamed: 0,DC,BC,CC,EC,CEN,ECC,RAD,Conectivity_Score
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,0.230279,0.100278,0.495669,0.225283,0.592059,0.556962,0.608665,0.426127
std,0.212538,0.169508,0.187579,0.236011,0.245694,0.432644,0.183501,0.195098
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.053674,0.016085,0.377904,0.020636,0.422149,0.0,0.51386,0.291396
50%,0.168254,0.043192,0.503034,0.193621,0.61064,0.4,0.628032,0.424866
75%,0.348016,0.105659,0.60609,0.337291,0.783107,1.0,0.722278,0.565477
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# valores faltante por columnas
topo.isnull().sum()

Node_id              0
prediction           0
DC                   0
BC                   0
CC                   0
EC                   0
CEN                  0
ECC                  0
RAD                  0
Conectivity_Score    0
dtype: int64

## Datos de enriquecimiento

In [10]:
# obtenemos la infromación general del dataset de enrichment
enrichment_info = {sheet: df.info() for sheet, df in enrichment.items()}
enrichment_describe = {sheet: df.describe() for sheet, df in enrichment.items()}

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5352 entries, 0 to 5351
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rank                  5352 non-null   int64  
 1   Term Name             5352 non-null   object 
 2   P-value               5352 non-null   float64
 3   Z-score               5352 non-null   float64
 4   Combined_Score        5352 non-null   float64
 5   Overlapping genes     5352 non-null   object 
 6   Adjusted p-value      5352 non-null   float64
 7   Old p-value           5352 non-null   int64  
 8   Old Adjusted p-value  5352 non-null   int64  
dtypes: float64(4), int64(3), object(2)
memory usage: 376.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109 entries, 0 to 1108
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rank                  1109 non-null   int64  
 1   Term 

In [23]:
# valores faltante por columnas
print("Valores nulos en Enrichment:", {sheet: df.isnull().sum() for sheet, df in enrichment.items()})



Valores nulos en Enrichment: {'Biological process': Rank                    0
Term Name               0
P-value                 0
Z-score                 0
Combined_Score          0
Overlapping genes       0
Adjusted p-value        0
Old p-value             0
Old Adjusted p-value    0
dtype: int64, 'Molecular function': Rank                    0
Term Name               0
P-value                 0
Z-score                 0
Combined_Score          0
Overlapping genes       0
Adjusted p-value        0
Old p-value             0
Old Adjusted p-value    0
dtype: int64, 'Cellular component': Rank                    0
Term Name               0
P-value                 0
Z-score                 0
Combined_Score          0
Overlapping genes       0
Adjusted p-value        0
Old p-value             0
Old Adjusted p-value    0
dtype: int64}
