# **0.1 Preprocesamiento de los datos**

In [1]:
# Importamos librería 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os

In [3]:
# Cargar datos
data_dir = os.path.join("..","data","raw")

ppi = pd.read_csv(os.path.join(data_dir, "1_PPI-network_Alzheimer_Disease_no-opentarget-filter.csv"))
targets = pd.read_excel(os.path.join(data_dir, "3_Targets-score_Alzheimer_Disease_no-opentarget-filter.xlsx"))
topo = pd.read_csv(os.path.join(data_dir, "topology_158_up_down_targets.csv"))
enrichment = pd.read_excel(os.path.join(data_dir, "enrichment2.xlsx"), sheet_name=None)  # Todas las hojas

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


## Datos de PPI

In [5]:
# obtenemos la infromación general del dataset de ppi
ppi.info()
ppi.describe() # estadísticas descriptivas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430206 entries, 0 to 430205
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Prot_A             430206 non-null  object 
 1   Prot_B             430206 non-null  object 
 2   Interaction_score  430206 non-null  float64
 3   Disease            430206 non-null  object 
dtypes: float64(1), object(3)
memory usage: 13.1+ MB


Unnamed: 0,Interaction_score
count,430206.0
mean,0.621486
std,0.180953
min,0.4
25%,0.473
50%,0.575
75%,0.728
max,0.999


In [6]:
# Conteo
print("Nº total de interacciones:", len(ppi))
print("Nº proteínas únicas:", len(pd.unique(ppi[['Prot_A', 'Prot_B']].values.ravel())))
print("Nº duplicados:", ppi.duplicated().sum())
print("Nº self-loops:", (ppi['Prot_A'] == ppi['Prot_B']).sum())



Nº total de interacciones: 430206
Nº proteínas únicas: 5390
Nº duplicados: 0
Nº self-loops: 0


In [7]:
# valores faltante por columnas
ppi.isnull().sum()

Prot_A               0
Prot_B               0
Interaction_score    0
Disease              0
dtype: int64

In [8]:
print(len(ppi))

430206


No hay valores faltantes, pero si es necesario eliminar la columna Disease ya que sabemos que solo se trabajarán con datos de Alzheimer.

Interaction_score va de 0 a 1, y queremos ver o analizar las interacciones menos confiables, por eso los datos de Interaction_Score van desde 0.4

In [10]:
processed_dir = os.path.join("..","data","processed","raw_cleaned")

# Eliminar columna 'Disease' del dataset de interacciones
ppi_clean = ppi.drop(columns=["Disease"])

# Guardar la versión limpia
ppi_clean.to_csv(os.path.join(processed_dir, "ppi_clean.csv"), index=False)

# Verificación
print("Archivo guardado en:", os.path.join(processed_dir, "ppi_clean.csv"))
print("Total de interacciones después de limpieza:", len(ppi_clean))

Archivo guardado en: ..\data\processed\raw_cleaned\ppi_clean.csv
Total de interacciones después de limpieza: 430206


In [11]:
# Ver primeras filas del dataset limpio
print(ppi_clean.head())

  Prot_A    Prot_B  Interaction_score
0   DRD4      HTR7              0.441
1   DRD4       LEP              0.457
2   DRD4  CPX-2175              0.509
3   DRD4   CPX-271              0.509
4   DRD4   CPX-272              0.509


## Datos de target

In [12]:
# obtenemos la infromación general del dataset de targets
targets.info()
targets.describe() # estadísticas descriptivas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5390 entries, 0 to 5389
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Target_name                    5390 non-null   object 
 1   Complex_participants           1487 non-null   object 
 2   Node_id                        5390 non-null   object 
 3   Uniprot_id                     5390 non-null   object 
 4   Target_type                    5390 non-null   object 
 5   Target_group                   5390 non-null   object 
 6   Source_db                      5390 non-null   object 
 7   Target_group_score             5390 non-null   float64
 8   Target_group_score_normalized  5390 non-null   float64
 9   Conectivity_Score              3821 non-null   float64
 10  Disease                        5390 non-null   object 
dtypes: float64(3), object(8)
memory usage: 463.3+ KB


Unnamed: 0,Target_group_score,Target_group_score_normalized,Conectivity_Score
count,5390.0,5390.0,3821.0
mean,1.635213,0.186356,0.732535
std,2.573158,0.14474,0.210551
min,0.1,0.1,0.348508
25%,0.1,0.1,0.606389
50%,0.1,0.1,0.715504
75%,5.0,0.375625,0.830332
max,16.1,1.0,2.628702


In [14]:
# valores faltante por columnas
targets.isnull().sum()

Target_name                         0
Complex_participants             3903
Node_id                             0
Uniprot_id                          0
Target_type                         0
Target_group                        0
Source_db                           0
Target_group_score                  0
Target_group_score_normalized       0
Conectivity_Score                1569
Disease                             0
dtype: int64

Complex_participants me da 3902 valores nulos, ya que no todos los blancos son complejos proteicos.
Conectivity_Score tiene 1569 valores nulos, se considerarán como una conectividad mínima o no disponible.
Se rellenarán con 0 ambos datos nulos.


In [15]:
# Conteo de targets en el DataFrame de targets
num_targets = targets['Node_id'].nunique()
print(f"Número de targets en el DataFrame de targets: {num_targets}")

Número de targets en el DataFrame de targets: 5390


In [16]:
# Obtenemos la cantidad de valores por clase de la variable "Target_group"

targets.Target_group.value_counts() 

Target_group
T4                3823
T2                1086
T2, T3             200
T2, T4             199
T2, T3, T4          28
T1, T2, T3          22
T3                  15
T1, T2               9
T1, T2, T3, T4       7
T3, T4               1
Name: count, dtype: int64

In [40]:
# Agrupamos por Target_group y mostramos los valores únicos del score
targets.groupby("Target_group")["Target_group_score_normalized"].unique()
# sort
targets.groupby("Target_group")["Target_group_score_normalized"].mean().sort_values(ascending=False)


Target_group
T1, T2, T3, T4    1.000000
T1, T2, T3        0.994375
T1, T2            0.938125
T2, T3, T4        0.437500
T2, T3            0.431875
T2, T4            0.381250
T2                0.375625
T3, T4            0.156250
T3                0.150625
T4                0.100000
Name: Target_group_score_normalized, dtype: float64

In [36]:
# Obtenemos la cantidad de valores por clase sin combinaciones de la variable "Target_group"
from collections import Counter

# Contar ocurrencias individuales en combinaciones
conteo = Counter()
targets["Target_group"].dropna().astype(str).str.split(",").apply(lambda grupos: conteo.update([g.strip() for g in grupos]))

for t in ["T1", "T2", "T3", "T4"]:
    print(f"{t}: {conteo[t]}")


T1: 38
T2: 1551
T3: 273
T4: 4058


In [37]:
# procentaje de ocurrencias individuales
total = sum(conteo.values())
for t in ["T1", "T2", "T3", "T4"]:
    porcentaje = (conteo[t] / total) * 100
    print(f"{t}: {conteo[t]} ({porcentaje:.2f}%)")
    

T1: 38 (0.64%)
T2: 1551 (26.20%)
T3: 273 (4.61%)
T4: 4058 (68.55%)


In [20]:
# porcentaje de targets por grupo
total_targets = len(targets)
porcentaje_targets = targets['Target_group'].value_counts(normalize=True) * 100
print("\nPorcentaje de targets por grupo:")
for group, percentage in porcentaje_targets.items():
    print(f"{group}: {percentage:.2f}%")
    


Porcentaje de targets por grupo:
T4: 70.93%
T2: 20.15%
T2, T3: 3.71%
T2, T4: 3.69%
T2, T3, T4: 0.52%
T1, T2, T3: 0.41%
T3: 0.28%
T1, T2: 0.17%
T1, T2, T3, T4: 0.13%
T3, T4: 0.02%


In [18]:
# Tipos de target
targets['Target_type'].value_counts()

Target_type
SINGLE PROTEIN     3916
PROTEIN COMPLEX    1474
Name: count, dtype: int64

In [22]:
processed_dir = os.path.join("..","data","processed","raw_cleaned")

# Rellenar valores nulos en el DataFrame original
targets["Complex_participants"] = targets["Complex_participants"].fillna("0")
targets["Conectivity_Score"] = targets["Conectivity_Score"].fillna(0)

# Eliminar columnas innecesarias
target_clean = targets.drop(columns=[
    "Target_group_score",        # ya tienes la versión normalizada
    "Disease",                   # no se usará
    "Source_db",                 # metadato no funcional
    "Uniprot_id"                 # info redundante, no como feature
])

# Verificar columnas finales
print("Columnas finales:", target_clean.columns.tolist())

# Guardar la versión limpia
target_clean.to_csv(os.path.join(processed_dir, "target_clean.csv"), index=False)

# Verificación
print("Archivo guardado en:", os.path.join(processed_dir, "target_clean.csv"))



Columnas finales: ['Target_name', 'Complex_participants', 'Node_id', 'Target_type', 'Target_group', 'Target_group_score_normalized', 'Conectivity_Score']
Archivo guardado en: ..\data\processed\raw_cleaned\target_clean.csv


In [23]:
target_clean.head()

Unnamed: 0,Target_name,Complex_participants,Node_id,Target_type,Target_group,Target_group_score_normalized,Conectivity_Score
0,glutamate nmda receptor; grin1/grin2a,"GRIN2D,GRIN1",CPX-289,PROTEIN COMPLEX,"T1, T2, T3, T4",1.0,0.0
1,dna replication factor cdt1 (double parked hom...,"CDT1,GMNN",CPX-659,PROTEIN COMPLEX,"T1, T2, T3, T4",1.0,0.0
2,geminin,"GMNN,MCIDAS",CPX-661,PROTEIN COMPLEX,"T1, T2, T3, T4",1.0,0.0
3,nuclear receptor coactivator 2 (ncoa-2) (class...,"NCOA2,PPARG",CPX-702,PROTEIN COMPLEX,"T1, T2, T3, T4",1.0,0.0
4,nuclear receptor coactivator 1 (ncoa-1) (ec 2....,"NCOA1,PPARG",CPX-711,PROTEIN COMPLEX,"T1, T2, T3, T4",1.0,0.0


### Gráficas para Targets

In [24]:
# Cargar los datos limpios
data_dir = os.path.join("..","data","processed","raw_cleaned")
targets_c = pd.read_csv(os.path.join(data_dir, "target_clean.csv"))

In [30]:
# Distribución de clases Target_group (frecuencia y procentaje)

fig_dir = os.path.join("..","results","figures")

plt.figure(figsize=(8, 5))
targets_c["Target_group"].value_counts().plot(kind="bar", color="skyblue")
plt.title("Distribución de clases en Target_group")
plt.ylabel("Frecuencia")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(fig_dir, "target_group_distribution.png"))
plt.close()

In [31]:
# Distribución del Conectivity_Score
fig_dir = os.path.join("..","results","figures")

plt.figure(figsize=(6, 4))
sns.histplot(targets_c["Conectivity_Score"], bins=30, kde=True)
plt.title("Distribución de Conectivity_Score")
plt.xlabel("Conectivity Score")
plt.tight_layout()
plt.savefig(os.path.join(fig_dir, "conectivity_score_distribution.png"))
plt.close()

## Datos de topología

In [27]:
# obtenemos la infromación general del dataset de topologia
topo.info()
topo.describe() # estadísticas descriptivas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Node_id                        158 non-null    object 
 1   DEG                            158 non-null    object 
 2   DC                             158 non-null    float64
 3   BC                             158 non-null    float64
 4   CC                             158 non-null    float64
 5   EC                             158 non-null    float64
 6   CEN                            158 non-null    float64
 7   ECC                            158 non-null    float64
 8   RAD                            158 non-null    float64
 9   Target_group_score_normalized  158 non-null    float64
 10  Conectivity_Score              158 non-null    float64
dtypes: float64(9), object(2)
memory usage: 13.7+ KB


Unnamed: 0,DC,BC,CC,EC,CEN,ECC,RAD,Target_group_score_normalized,Conectivity_Score
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,0.142319,0.035101,0.71608,0.211671,0.678761,0.667722,0.836776,0.1,0.754879
std,0.130303,0.062428,0.069712,0.222216,0.190991,0.075939,0.055577,2.784383e-17,0.148067
min,0.000935,0.0,0.50164,0.000163,0.162162,0.5,0.604327,0.1,0.395446
25%,0.03391,0.00504,0.670897,0.02005,0.552987,0.6,0.804626,0.1,0.653999
50%,0.101029,0.013558,0.715885,0.188236,0.678758,0.6,0.841935,0.1,0.762254
75%,0.207671,0.038935,0.75549,0.32154,0.818279,0.75,0.8711,0.1,0.873683
max,0.651076,0.4075,0.924802,0.970025,1.0,0.75,0.967615,0.1,1.237862


In [28]:
# valores faltante por columnas
topo.isnull().sum()

Node_id                          0
DEG                              0
DC                               0
BC                               0
CC                               0
EC                               0
CEN                              0
ECC                              0
RAD                              0
Target_group_score_normalized    0
Conectivity_Score                0
dtype: int64

In [29]:
# Unir topo_clean con target_clean según node_id
topo_con_target = pd.merge(topo, target_clean[['Node_id', 'Target_group']], on='Node_id', how='left')

# Contar cuántos node_id de topo_clean pertenecen a cada target_group
conteo = topo_con_target['Target_group'].value_counts(dropna=False)

# Mostrar resultados
print(conteo)

Target_group
T4    158
Name: count, dtype: int64


In [32]:
processed_dir = os.path.join("..","data","processed","raw_cleaned")

# Filtrar para mantener solo las proteínas presentes en target_clean
topo_clean = topo[topo['Node_id'].isin(target_clean['Node_id'])].copy()

#Cambiar nombre de Conectivity_Score a Conectivity_score_topo
topo_clean.rename(columns={"Conectivity_Score": "Conectivity_score_topo"}, inplace=True)

# Guardar la versión limpia (mismo nombre que la variable)
topo_clean.to_csv(os.path.join(processed_dir, "topo_clean.csv"), index=False)

# Verificación
print("Archivo guardado en:", os.path.join(processed_dir, "topo_clean.csv"))
print("Total de datos topológicos:", len(topo_clean))


Archivo guardado en: ..\data\processed\raw_cleaned\topo_clean.csv
Total de datos topológicos: 158


In [33]:
topo_clean.head()

Unnamed: 0,Node_id,DEG,DC,BC,CC,EC,CEN,ECC,RAD,Target_group_score_normalized,Conectivity_score_topo
0,AGT,Up,0.488307,0.359142,0.891952,0.548342,0.241584,0.75,0.951754,0.1,0.92577
1,CPX-8674,Down,0.238541,0.012837,0.742585,0.357214,0.565671,0.75,0.861938,0.1,0.918033
2,CPX-8675,Down,0.22638,0.011948,0.736731,0.341825,0.578947,0.75,0.857677,0.1,0.941751
3,APOE,Up,0.532273,0.38461,0.902188,0.58046,0.216216,0.75,0.95682,0.1,0.908044
4,ALDH7A1,Up,0.179607,0.080833,0.767238,0.256238,0.497392,0.75,0.879172,0.1,0.832724


In [34]:
# Obtener Node_id que no fueron considerados
no_consideradas = topo[~topo['Node_id'].isin(target_clean['Node_id'])]

# Mostrar resultados
print(no_consideradas.shape[0])  # cuántas son
print(no_consideradas['Node_id'].tolist())  # listado de proteínas

0
[]


### Gráficas para Topo

In [41]:
# Cargar los datos limpios
import os
import pandas as pd
data_dir = os.path.join("..","data","processed","raw_cleaned")
topo_c = pd.read_csv(os.path.join(data_dir, "topo_clean.csv"))
target_c = pd.read_csv(os.path.join(data_dir, "target_clean.csv"))

In [43]:
# Distribución Prediction
topo_c['DEG'].value_counts()

DEG
Down    91
Up      67
Name: count, dtype: int64

In [44]:
# Distribución de target_type
merged = pd.merge(topo_c, target_c[['Node_id', 'Target_type']], on='Node_id', how='inner')
dist = merged['Target_type'].value_counts().sort_values(ascending=False)
print("Distribución de Target_type:")
print(dist)


Distribución de Target_type:
Target_type
SINGLE PROTEIN     80
PROTEIN COMPLEX    78
Name: count, dtype: int64


In [45]:
# Histograma de cada métrica topológica
fig_dir = os.path.join("..","results","figures")

metrics = ["DC", "BC", "CC", "EC", "CEN", "ECC", "RAD", "Conectivity_score_topo"]
for metric in metrics:
    plt.figure(figsize=(6, 4))
    sns.histplot(topo_c[metric], bins=30, kde=True)
    plt.title(f"Distribución de {metric}")
    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, f"topo_{metric}_distribution.png"))
    plt.close()

In [46]:
# Heatmap de correlación de las métricas topológicas
fig_dir = os.path.join("..","results","figures")

plt.figure(figsize=(8, 6))
sns.heatmap(topo_c[metrics].corr(), annot=True, fmt=".2f", cmap="viridis")
plt.title("Correlación entre métricas topológicas")
plt.tight_layout()
plt.savefig(os.path.join(fig_dir, "topo_correlation_heatmap.png"))
plt.close()


## Datos de enriquecimiento

In [47]:
# obtenemos la infromación general del dataset de enrichment
enrichment_info = {sheet: df.info() for sheet, df in enrichment.items()}

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5352 entries, 0 to 5351
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rank                  5352 non-null   int64  
 1   Term Name             5352 non-null   object 
 2   P-value               5352 non-null   float64
 3   Z-score               5352 non-null   float64
 4   Combined_Score        5352 non-null   float64
 5   Overlapping genes     5352 non-null   object 
 6   Adjusted p-value      5352 non-null   float64
 7   Old p-value           5352 non-null   int64  
 8   Old Adjusted p-value  5352 non-null   int64  
dtypes: float64(4), int64(3), object(2)
memory usage: 376.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109 entries, 0 to 1108
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rank                  1109 non-null   int64  
 1   Term 

BP tiene solo 5352 entradas, MF 1109 y CC 467. Lo que da un total de 6.928 Ontologías.

In [48]:
for sheet_name, df in enrichment.items():
    print(f"\nDescripción estadística para: {sheet_name}")
    print(df.describe())



Descripción estadística para: Biological process
              Rank        P-value        Z-score  Combined_Score  \
count  5352.000000   5.352000e+03    5352.000000    5.352000e+03   
mean   2676.500000   1.874399e-01    3170.913153    4.271678e+04   
std    1545.133651   2.734338e-01   22435.668584    6.919208e+05   
min       1.000000  2.196107e-120       0.014563    7.158492e-08   
25%    1338.750000   1.163665e-03       1.545482    2.272210e+00   
50%    2676.500000   4.158506e-02       2.625268    9.755728e+00   
75%    4014.250000   2.727592e-01       4.182365    3.341058e+01   
max    5352.000000   9.999954e-01  713592.000000    4.240141e+07   

       Adjusted p-value  Old p-value  Old Adjusted p-value  
count      5.352000e+03       5352.0                5352.0  
mean       2.255008e-01          0.0                   0.0  
std        2.906253e-01          0.0                   0.0  
min       1.175357e-116          0.0                   0.0  
25%        4.609869e-03         

El P-value y Adjusted p-value indican la significancia estadistica de cada témino GO. Mientras más bajo, más relevante es el GO.
El Combined_Score combina Z-score y p-value para el ranking.

In [49]:
# valores faltante por columnas
for sheet_name, df in enrichment.items():
    print(f"\n Valores nulos en: {sheet_name}")
    print(df.isnull().sum())


 Valores nulos en: Biological process
Rank                    0
Term Name               0
P-value                 0
Z-score                 0
Combined_Score          0
Overlapping genes       0
Adjusted p-value        0
Old p-value             0
Old Adjusted p-value    0
dtype: int64

 Valores nulos en: Molecular function
Rank                    0
Term Name               0
P-value                 0
Z-score                 0
Combined_Score          0
Overlapping genes       0
Adjusted p-value        0
Old p-value             0
Old Adjusted p-value    0
dtype: int64

 Valores nulos en: Cellular component
Rank                    0
Term Name               0
P-value                 0
Z-score                 0
Combined_Score          0
Overlapping genes       0
Adjusted p-value        0
Old p-value             0
Old Adjusted p-value    0
dtype: int64


Las columnas old p-value y old adjusted p-value están completamente llenas de ceros, por lo que no aportan información útil. Se eliminarán de los 3 DataFrame. 

In [1]:
# Hacer un unico dataframe con los términos GO de todas las hojas del archivo de enriquecimiento
import pandas as pd
import os

# Cargar datos
data_dir = os.path.join("..","data","raw")
enrichment = pd.read_excel(os.path.join(data_dir, "enrichment2.xlsx"), sheet_name=None)  # Todas las hojas


# Lista para almacenar dataframes procesados
go_clean_list = []

# Procesar cada hoja
for sheet, df in enrichment.items():
    df = df.copy()

    # Eliminar columnas antiguas si existen
    df.drop(columns=["Old p-value", "Old Adjusted p-value"], inplace=True, errors="ignore")

    # Separar "Term Name" en nombre y GO ID
    df[["Term_Name_Clean", "GO_ID"]] = df["Term Name"].str.extract(r"^(.*?)\s*\((GO:\d+)\)")

    # Usar GO_ID sin sufijo
    df["GO"] = df["GO_ID"]

    # Agregar columna con tipo de ontología (opcional)
    df["Ontology"] = sheet  # Si igual quieres saber si era BP/MF/CC

    # Seleccionar columnas relevantes
    df_final = df[["Term_Name_Clean", "GO", "P-value", "Z-score", "Combined_Score", "Overlapping genes", "Adjusted p-value", "Ontology"]]
    go_clean_list.append(df_final)

# Unir todos los dataframes en uno solo
go_merged = pd.concat(go_clean_list, ignore_index=True)




  warn("Workbook contains no default style, apply openpyxl's default")


In [2]:
go_merged.head()

Unnamed: 0,Term_Name_Clean,GO,P-value,Z-score,Combined_Score,Overlapping genes,Adjusted p-value,Ontology
0,Protein Phosphorylation,GO:0006468,2.1961070000000002e-120,10.371111,2857.484956,"ATF2, MAML1, TESK2, LIPE, TBK1, AKT2, PRKACG, ...",1.175357e-116,Biological process
1,Phosphorylation,GO:0016310,2.02375e-88,8.128998,1641.427871,"EPHB6, PANK2, TP53RK, MAML1, TESK2, RPS6KA4, L...",5.415554999999999e-85,Biological process
2,Positive Regulation Of DNA-templated Transcrip...,GO:0045893,1.149844e-78,3.04153,545.839131,"ATF1, ATF2, SPI1, MAML1, TRRAP, HNRNPU, GPATCH...",2.051321e-75,Biological process
3,Positive Regulation Of Nucleic Acid-Templated ...,GO:1903508,8.127226e-68,4.633815,715.834306,"TRRAP, GPATCH3, SOX2, SOX17, KAT5, SOX18, ZMIZ...",1.0874229999999999e-64,Biological process
4,Protein Modification Process,GO:0036211,2.9270069999999997e-63,3.670553,528.518884,"MAML1, ARAF, TESK2, DCAF1, LIPE, TBK1, AKT2, A...",3.133068e-60,Biological process


In [3]:
# Guardar como CSV
processed_dir = os.path.join("..","data","processed","raw_cleaned")
go_clean_path = os.path.join(processed_dir, "go_enrichment_clean.csv")
go_merged.to_csv(go_clean_path, index=False)

# Confirmar guardado
print("Archivo guardado en:", go_clean_path)
print("Total de terminos GO:", len(go_merged))


Archivo guardado en: ..\data\processed\raw_cleaned\go_enrichment_clean.csv
Total de terminos GO: 6928


### Gráficas para Enriquecimiento

In [4]:
# Cargar los datos limpios
data_dir = os.path.join("..","data","processed","raw_cleaned")
go_c = pd.read_csv(os.path.join(data_dir, "go_enrichment_clean.csv"))
fig_dir = os.path.join("..","results","figures")

In [5]:
# Número de términos por ontología (BP, MF, CC)
plt.figure(figsize=(6, 4))
go_c["Ontology"].value_counts().plot(kind="bar", color="salmon")
plt.title("Cantidad de términos GO por Ontología")
plt.tight_layout()
plt.savefig(os.path.join(fig_dir, "go_terms_by_ontology.png"))
plt.close()

#BP tiene solo 5352 entradas, MF 1109 y CC 467.

NameError: name 'plt' is not defined

In [62]:
# Distribución de Adjusted p-value 
plt.figure(figsize=(6, 4))
sns.histplot(go_c["Adjusted p-value"], bins=50, log_scale=(False, True))
plt.title("Distribución de Adjusted p-value")
plt.xlabel("Adjusted p-value (log)")
plt.tight_layout()
plt.savefig(os.path.join(fig_dir, "adjusted_pvalue_distribution.png"))
plt.close()


In [63]:
# Wordcloud con los Termn_Name_Clean
text = " ".join(go_c["Term_Name_Clean"].dropna())
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud de Términos GO")
plt.tight_layout()
plt.savefig(os.path.join(fig_dir, "go_wordcloud.png"))
plt.close()

## Revisar coincidencias

In [64]:


data_dir = os.path.join("..","data","processed","raw_cleaned")
ppi_clean = pd.read_csv(os.path.join(data_dir, "ppi_clean.csv"))
target_clean = pd.read_csv(os.path.join(data_dir, "target_clean.csv"))

In [65]:
# Proteínas únicas de ambas columnas
proteinas_ppi = pd.unique(ppi_clean[['Prot_A', 'Prot_B']].values.ravel())
print(f"Número de proteínas únicas en PPI: {len(proteinas_ppi)}")

Número de proteínas únicas en PPI: 5390


In [66]:
# Proteínas que están también en target_clean
proteinas_en_target = set(proteinas_ppi).intersection(set(target_clean['Node_id']))
print(f"Número de proteínas en PPI que están también en target_clean: {len(proteinas_en_target)}")

Número de proteínas en PPI que están también en target_clean: 5390


In [67]:
print(f"Total de proteínas únicas en PPI: {len(proteinas_ppi)}")
print(f"Total de proteínas en PPI que están en Target_clean: {len(proteinas_en_target)}")
print("Ejemplo de coincidencias:", list(proteinas_en_target)[:10])


Total de proteínas únicas en PPI: 5390
Total de proteínas en PPI que están en Target_clean: 5390
Ejemplo de coincidencias: ['KLF11', 'CPX-2319', 'CPX-2204', 'LYZ', 'MYO5A', 'BRSK2', 'CPX-648', 'GPX4', 'TERB1', 'CD59']
