# **0.1 Preprocesamiento de los datos**

In [1]:
# Import libraries (standard library first, then third-party)
import io
import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

In [2]:
# Load the four raw input files from the raw-data folder
data_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\raw"

ppi_file = os.path.join(data_dir, "1_PPI-network_Alzheimer_Disease_no-opentarget-filter.csv")
targets_file = os.path.join(data_dir, "3_Targets-score_Alzheimer_Disease_no-opentarget-filter.xlsx")
topo_file = os.path.join(data_dir, "topology_158_up_down_normalized_splitPCA.csv")
enrichment_file = os.path.join(data_dir, "enrichment2.xlsx")

ppi = pd.read_csv(ppi_file)
targets = pd.read_excel(targets_file)
topo = pd.read_csv(topo_file)
# sheet_name=None loads every sheet: the result is a dict {sheet name: DataFrame}
enrichment = pd.read_excel(enrichment_file, sheet_name=None)

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


## Datos de PPI

In [3]:
# General overview of the PPI dataset: column dtypes and non-null counts,
# followed by summary statistics of the numeric columns
ppi.info()
ppi.describe() # descriptive statistics

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426139 entries, 0 to 426138
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Prot_A             426139 non-null  object 
 1   Prot_B             426139 non-null  object 
 2   Interaction_score  426139 non-null  float64
 3   Disease            426139 non-null  object 
dtypes: float64(1), object(3)
memory usage: 13.0+ MB


Unnamed: 0,Interaction_score
count,426139.0
mean,0.621208
std,0.180884
min,0.4
25%,0.473
50%,0.574
75%,0.728
max,0.999


In [4]:
# Basic size and sanity counts for the interaction table
proteinas = pd.concat([ppi["Prot_A"], ppi["Prot_B"]]).unique()  # proteins seen on either side
filas_duplicadas = ppi.duplicated()                             # exact repeated rows only
auto_interacciones = ppi["Prot_A"].eq(ppi["Prot_B"])            # a protein paired with itself

print("Nº total de interacciones:", ppi.shape[0])
print("Nº proteínas únicas:", proteinas.size)
print("Nº duplicados:", filas_duplicadas.sum())
print("Nº self-loops:", auto_interacciones.sum())



Nº total de interacciones: 426139
Nº proteínas únicas: 5411
Nº duplicados: 0
Nº self-loops: 0


In [5]:
# Missing values per column of the PPI table
ppi.isna().sum()

Prot_A               0
Prot_B               0
Interaction_score    0
Disease              0
dtype: int64

In [8]:
# Number of rows (interactions) in the PPI table
print(ppi.shape[0])

426139


No hay valores faltantes, pero sí es necesario eliminar la columna Disease, ya que solo se trabajará con datos de Alzheimer.

Interaction_score se mide en una escala de 0 a 1 (en este dataset los valores van de 0.4 a 0.999). Como queremos analizar las interacciones menos confiables, se filtrarían por < 0.7, que corresponde a las interacciones de baja a mediana confianza, excluyendo las de alta o muy alta confianza (>= 0.7). *Esta decisión se evaluará al final; por ahora no se aplica el filtro.*

In [10]:
processed_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned"
ppi_clean_path = os.path.join(processed_dir, "ppi_clean.csv")

# Optional filter for score < 0.7, deliberately left disabled:
# ppi_filtered = ppi[ppi['Interaction_score'] < 0.7].copy()

# The 'Disease' column is removed from the interaction table
ppi_clean = ppi.drop("Disease", axis=1)

# Write the cleaned table to disk
ppi_clean.to_csv(ppi_clean_path, index=False)

# Confirmation
print("Archivo guardado en:", ppi_clean_path)
print("Total de interacciones después de limpieza:", ppi_clean.shape[0])

Archivo guardado en: C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned\ppi_clean.csv
Total de interacciones después de limpieza: 426139


Si se filtra con <0.7 me quedaría con 308736 de 426139 interacciones, y cambiarían los valores topológicos.

## Datos de target

In [11]:
# General overview of the targets dataset: column dtypes and non-null counts,
# followed by summary statistics of the numeric columns
targets.info()
targets.describe() # descriptive statistics

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5411 entries, 0 to 5410
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Target_name                    5411 non-null   object 
 1   Complex_participants           1494 non-null   object 
 2   Node_id                        5411 non-null   object 
 3   Uniprot_id                     5411 non-null   object 
 4   Target_type                    5411 non-null   object 
 5   Target_group                   5411 non-null   object 
 6   Source_db                      5411 non-null   object 
 7   Target_group_score             5411 non-null   float64
 8   Target_group_score_normalized  5411 non-null   float64
 9   Conectivity_Score              3842 non-null   float64
 10  Disease                        5411 non-null   object 
dtypes: float64(3), object(8)
memory usage: 465.1+ KB


Unnamed: 0,Target_group_score,Target_group_score_normalized,Conectivity_Score
count,5411.0,5411.0,3842.0
mean,1.630882,0.186112,0.730901
std,2.57031,0.14458,0.207526
min,0.1,0.1,0.34932
25%,0.1,0.1,0.605366
50%,0.1,0.1,0.715035
75%,5.0,0.375625,0.828582
max,16.1,1.0,2.627325


In [12]:
# Missing values per column of the targets table
targets.isna().sum()

Target_name                         0
Complex_participants             3917
Node_id                             0
Uniprot_id                          0
Target_type                         0
Target_group                        0
Source_db                           0
Target_group_score                  0
Target_group_score_normalized       0
Conectivity_Score                1569
Disease                             0
dtype: int64

Complex_participants me da 3917 valores nulos, ya que no todos los blancos son complejos proteicos.
Conectivity_Score tiene 1569 valores nulos, se considerarán como una conectividad mínima o no disponible.
Se rellenarán con 0 ambos datos nulos.


In [13]:
# Number of distinct (non-null) Node_id values in the targets table
num_targets = targets["Node_id"].dropna().unique().size
print(f"Número de targets en el DataFrame de targets: {num_targets}")

Número de targets en el DataFrame de targets: 5411


In [14]:
# Rows per "Target_group" value; combined labels such as "T2, T3" count as their own class
targets["Target_group"].value_counts()

Target_group
T4                3842
T2                1090
T2, T3             198
T2, T4             198
T2, T3, T4          29
T1, T2, T3          22
T3                  15
T1, T2               9
T1, T2, T3, T4       7
T3, T4               1
Name: count, dtype: int64

In [15]:
# Count how often each individual group (T1..T4) occurs, splitting combined
# labels such as "T2, T3" into their parts.
# The Counter is built directly from a generator instead of calling
# Series.apply only for its side effect (which also created a throwaway
# Series of None values).
conteo = Counter(
    grupo.strip()
    for combinacion in targets["Target_group"].dropna().astype(str)
    for grupo in combinacion.split(",")
)

# A Counter returns 0 for a group that never appears, so this loop cannot fail
for t in ["T1", "T2", "T3", "T4"]:
    print(f"{t}: {conteo[t]}")


T1: 38
T2: 1553
T3: 272
T4: 4077


El total corresponde a 5411 datos, donde el 61.7% equivale al T4, 23.5% a T2, 4.1% a T3 y 0.8% a T1. Esto refleja que existe un desbalance de clases; este sesgo puede afectar el rendimiento de los modelos supervisados.

In [16]:
# Number of targets of each type
targets.loc[:, "Target_type"].value_counts()

Target_type
SINGLE PROTEIN     3930
PROTEIN COMPLEX    1481
Name: count, dtype: int64

In [19]:
processed_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned"
target_clean_path = os.path.join(processed_dir, "target_clean.csv")

# Build the cleaned table as a NEW DataFrame and leave `targets` untouched.
# Overwriting targets["Target_type"] in place made this cell non-idempotent:
# on a second run no value equals "SINGLE PROTEIN" any more, so every row
# would be encoded as 0.
target_clean = (
    targets
    # Drop the columns that are not needed
    .drop(columns=[
        "Target_group_score",        # the normalized version is kept instead
        "Disease",                   # will not be used
        "Source_db",                 # non-functional metadata
        "Uniprot_id"                 # redundant information, not used as a feature
    ])
    .assign(
        # Missing complex members are filled with the string "0"
        Complex_participants=lambda df: df["Complex_participants"].fillna("0"),
        # Missing connectivity scores are filled with 0
        Conectivity_Score=lambda df: df["Conectivity_Score"].fillna(0),
        # Binary encoding: 1 = "SINGLE PROTEIN", 0 = any other value
        Target_type=lambda df: df["Target_type"].eq("SINGLE PROTEIN").astype("int64"),
    )
)

# Check the final columns
print("Columnas finales:", target_clean.columns.tolist())

# Save the cleaned version
target_clean.to_csv(target_clean_path, index=False)

# Confirmation
print("Archivo guardado en:", target_clean_path)



Columnas finales: ['Target_name', 'Complex_participants', 'Node_id', 'Target_type', 'Target_group', 'Target_group_score_normalized', 'Conectivity_Score']
Archivo guardado en: C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned\target_clean.csv


In [52]:
# Preview the first rows of the cleaned targets table
target_clean.head()

Unnamed: 0,Target_name,Complex_participants,Node_id,Target_type,Target_group,Target_group_score_normalized,Conectivity_Score
0,glutamate nmda receptor; grin1/grin2a,"GRIN2D,GRIN1",CPX-289,0,"T1, T2, T3, T4",1.0,0.0
1,dna replication factor cdt1 (double parked hom...,"CDT1,GMNN",CPX-659,0,"T1, T2, T3, T4",1.0,0.0
2,geminin,"MCIDAS,GMNN",CPX-661,0,"T1, T2, T3, T4",1.0,0.0
3,nuclear receptor coactivator 2 (ncoa-2) (class...,"NCOA2,PPARG",CPX-702,0,"T1, T2, T3, T4",1.0,0.0
4,nuclear receptor coactivator 1 (ncoa-1) (ec 2....,"NCOA1,PPARG",CPX-711,0,"T1, T2, T3, T4",1.0,0.0


### Gráficas para Targets

In [59]:
# Reload the cleaned targets table from disk
# (note: data_dir now points to the processed folder, not to the raw one)
data_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned"
target_clean_file = os.path.join(data_dir, "target_clean.csv")
targets_c = pd.read_csv(target_clean_file)

In [60]:
# Bar chart with the frequency of every Target_group class, saved as PNG
fig_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\results\figures"

fig, ax = plt.subplots(figsize=(8, 5))
frecuencias = targets_c["Target_group"].value_counts()
frecuencias.plot(kind="bar", color="skyblue", ax=ax)
ax.set_title("Distribución de clases en Target_group")
ax.set_ylabel("Frecuencia")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig(os.path.join(fig_dir, "target_group_distribution.png"))
plt.close(fig)  # the figure is only written to disk, not displayed

In [62]:
# Histogram (with KDE curve) of Conectivity_Score, saved as PNG
fig_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\results\figures"

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(targets_c["Conectivity_Score"], bins=30, kde=True, ax=ax)
ax.set_title("Distribución de Conectivity_Score")
ax.set_xlabel("Conectivity Score")
fig.tight_layout()
fig.savefig(os.path.join(fig_dir, "conectivity_score_distribution.png"))
plt.close(fig)  # the figure is only written to disk, not displayed

## Datos de topología

In [20]:
# General overview of the topology dataset: column dtypes and non-null counts,
# followed by summary statistics of the numeric columns
topo.info()
topo.describe() # descriptive statistics

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Node_id            158 non-null    object 
 1   prediction         158 non-null    object 
 2   DC                 158 non-null    float64
 3   BC                 158 non-null    float64
 4   CC                 158 non-null    float64
 5   EC                 158 non-null    float64
 6   CEN                158 non-null    float64
 7   ECC                158 non-null    float64
 8   RAD                158 non-null    float64
 9   Conectivity_Score  158 non-null    float64
dtypes: float64(8), object(2)
memory usage: 12.5+ KB


Unnamed: 0,DC,BC,CC,EC,CEN,ECC,RAD,Conectivity_Score
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,0.230279,0.100278,0.495669,0.225283,0.592059,0.556962,0.608665,0.426127
std,0.212538,0.169508,0.187579,0.236011,0.245694,0.432644,0.183501,0.195098
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.053674,0.016085,0.377904,0.020636,0.422149,0.0,0.51386,0.291396
50%,0.168254,0.043192,0.503034,0.193621,0.61064,0.4,0.628032,0.424866
75%,0.348016,0.105659,0.60609,0.337291,0.783107,1.0,0.722278,0.565477
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# Missing values per column of the topology table
topo.isna().sum()

Node_id              0
prediction           0
DC                   0
BC                   0
CC                   0
EC                   0
CEN                  0
ECC                  0
RAD                  0
Conectivity_Score    0
dtype: int64

In [50]:
processed_dir = r"C:\Users\Macarena Madrid\Desktop\alzheimer-target-prediction\data\processed\raw_cleaned"
topo_clean_path = os.path.join(processed_dir, "topo_clean.csv")

# Keep only the nodes that are also present in target_clean, and rename
# Conectivity_Score to Conectivity_score_topo
nodos_en_targets = topo["Node_id"].isin(target_clean["Node_id"])
topo_clean = topo.loc[nodos_en_targets].rename(
    columns={"Conectivity_Score": "Conectivity_score_topo"}
)

# Write the cleaned table to disk (file named after the variable)
topo_clean.to_csv(topo_clean_path, index=False)

# Confirmation
print("Archivo guardado en:", topo_clean_path)
print("Total de datos topológicos:", topo_clean.shape[0])


Archivo guardado en: C:\Users\Macarena Madrid\Desktop\alzheimer-target-prediction\data\processed\raw_cleaned\topo_clean.csv
Total de datos topológicos: 157


In [51]:
# Preview the first rows of the cleaned topology table
topo_clean.head()

Unnamed: 0,Node_id,prediction,DC,BC,CC,EC,CEN,ECC,RAD,Conectivity_score_topo
0,AGT,Up,0.74964,0.881329,0.92237,0.619335,0.094793,1.0,0.95634,0.703706
1,APOE,Up,0.817266,0.943827,0.946559,0.655622,0.064516,1.0,0.970285,0.680186
2,ALDH7A1,Up,0.27482,0.198363,0.627651,0.289315,0.400113,1.0,0.756549,0.580241
3,GLUD1,Up,0.205755,0.14113,0.590557,0.068793,0.45897,1.0,0.726704,0.408576
4,EPHX1,Up,0.109353,0.054785,0.352921,0.026284,0.887097,0.4,0.501368,0.700345


### Gráficas para Topo

In [63]:
# Reload the cleaned topology table from disk
data_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned"
topo_clean_file = os.path.join(data_dir, "topo_clean.csv")
topo_c = pd.read_csv(topo_clean_file)

In [64]:
# One histogram (with KDE curve) per topological metric, each saved as its own PNG
fig_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\results\figures"

# `metrics` is also read by the correlation heatmap cell
metrics = ["DC", "BC", "CC", "EC", "CEN", "ECC", "RAD", "Conectivity_score_topo"]

for metric in metrics:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(topo_c[metric], bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribución de {metric}")
    fig.tight_layout()
    fig.savefig(os.path.join(fig_dir, f"topo_{metric}_distribution.png"))
    plt.close(fig)  # close each figure so they do not pile up in memory

In [65]:
# Annotated heatmap of the pairwise correlations between the topological metrics
fig_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\results\figures"

correlaciones = topo_c[metrics].corr()  # `metrics` comes from the histogram cell

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlaciones, annot=True, fmt=".2f", cmap="viridis", ax=ax)
ax.set_title("Correlación entre métricas topológicas")
fig.tight_layout()
fig.savefig(os.path.join(fig_dir, "topo_correlation_heatmap.png"))
plt.close(fig)  # the figure is only written to disk, not displayed


## Datos de enriquecimiento

In [24]:
# General information for every sheet of the enrichment workbook.
# DataFrame.info() prints its report and returns None, so a dict comprehension
# over df.info() only stores None values. The report is written to a text
# buffer instead, so enrichment_info maps sheet name -> report text, and each
# report is printed under the name of its sheet.
enrichment_info = {}
for sheet, df in enrichment.items():
    buffer = io.StringIO()
    df.info(buf=buffer)
    enrichment_info[sheet] = buffer.getvalue()
    print(f"\nInformación general de: {sheet}")
    print(enrichment_info[sheet])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5352 entries, 0 to 5351
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rank                  5352 non-null   int64  
 1   Term Name             5352 non-null   object 
 2   P-value               5352 non-null   float64
 3   Z-score               5352 non-null   float64
 4   Combined_Score        5352 non-null   float64
 5   Overlapping genes     5352 non-null   object 
 6   Adjusted p-value      5352 non-null   float64
 7   Old p-value           5352 non-null   int64  
 8   Old Adjusted p-value  5352 non-null   int64  
dtypes: float64(4), int64(3), object(2)
memory usage: 376.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109 entries, 0 to 1108
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rank                  1109 non-null   int64  
 1   Term 

BP tiene 5352 entradas, MF 1109 y CC 467, lo que da un total de 6928 términos GO.

In [30]:
# Summary statistics of the numeric columns, one report per sheet
for sheet_name in enrichment:
    resumen = enrichment[sheet_name].describe()
    print(f"\nDescripción estadística para: {sheet_name}")
    print(resumen)



Descripción estadística para: Biological process
              Rank        P-value        Z-score  Combined_Score  \
count  5352.000000   5.352000e+03    5352.000000    5.352000e+03   
mean   2676.500000   1.874399e-01    3170.913153    4.271678e+04   
std    1545.133651   2.734338e-01   22435.668584    6.919208e+05   
min       1.000000  2.196107e-120       0.014563    7.158492e-08   
25%    1338.750000   1.163665e-03       1.545482    2.272210e+00   
50%    2676.500000   4.158506e-02       2.625268    9.755728e+00   
75%    4014.250000   2.727592e-01       4.182365    3.341058e+01   
max    5352.000000   9.999954e-01  713592.000000    4.240141e+07   

       Adjusted p-value  Old p-value  Old Adjusted p-value  
count      5.352000e+03       5352.0                5352.0  
mean       2.255008e-01          0.0                   0.0  
std        2.906253e-01          0.0                   0.0  
min       1.175357e-116          0.0                   0.0  
25%        4.609869e-03         

El P-value y el Adjusted p-value indican la significancia estadística de cada término GO: mientras más bajo, más relevante es el término.
El Combined_Score combina Z-score y p-value para el ranking.

In [32]:
# Missing values per column, reported separately for every sheet
for sheet_name in enrichment:
    nulos = enrichment[sheet_name].isna().sum()
    print(f"\n Valores nulos en: {sheet_name}")
    print(nulos)







 Valores nulos en: Biological process
Rank                    0
Term Name               0
P-value                 0
Z-score                 0
Combined_Score          0
Overlapping genes       0
Adjusted p-value        0
Old p-value             0
Old Adjusted p-value    0
dtype: int64

 Valores nulos en: Molecular function
Rank                    0
Term Name               0
P-value                 0
Z-score                 0
Combined_Score          0
Overlapping genes       0
Adjusted p-value        0
Old p-value             0
Old Adjusted p-value    0
dtype: int64

 Valores nulos en: Cellular component
Rank                    0
Term Name               0
P-value                 0
Z-score                 0
Combined_Score          0
Overlapping genes       0
Adjusted p-value        0
Old p-value             0
Old Adjusted p-value    0
dtype: int64


Las columnas old p-value y old adjusted p-value están completamente llenas de ceros, por lo que no aportan información útil. Se eliminarán de los 3 DataFrame. 

In [45]:
# Build a single DataFrame with the GO terms from every sheet of the enrichment workbook.
# pandas and os are imported once in the first cell, so they are not re-imported here.

# Path to the original workbook
enrichment_file = "C:/Users/Macarena Madrid/Desktop/alzheimer-target-prediction/data/raw/enrichment2.xlsx"

# Read every sheet (sheet_name=None returns a dict {sheet name: DataFrame})
enrichment = pd.read_excel(enrichment_file, sheet_name=None)

# Suffix assigned to each ontology (sheet name -> short code)
suffix_map = {
    "Biological process": "BP",
    "Molecular function": "MF",
    "Cellular component": "CC"
}

# Columns kept in the merged table, in output order
go_columns = ["Rank", "Term_Name_Clean", "GO", "P-value", "Z-score", "Combined_Score",
              "Overlapping genes", "Adjusted p-value", "Ontology"]

# Processed DataFrames, one per sheet
go_clean_list = []

for sheet, df in enrichment.items():
    suffix = suffix_map[sheet]  # raises KeyError for a sheet that is not in suffix_map

    # Split "Term Name" into the clean name (column 0) and the GO id (column 1)
    parsed = df["Term Name"].str.extract(r"^(.*?)\s*\((GO:\d+)\)")

    # Rows that do not match the pattern get NaN in both columns; report them
    # instead of letting them pass silently into the merged table
    n_unparsed = int(parsed[1].isna().sum())
    if n_unparsed:
        print(f"Aviso: {n_unparsed} términos sin GO ID reconocible en la hoja '{sheet}'")

    df_final = (
        df
        # Drop the old p-value columns if they exist (returns a new frame, no inplace)
        .drop(columns=["Old p-value", "Old Adjusted p-value"], errors="ignore")
        .assign(
            Term_Name_Clean=parsed[0],
            GO_ID=parsed[1],
            GO=parsed[1] + "_" + suffix,   # GO id tagged with its ontology, e.g. GO:0006468_BP
            Ontology=suffix,
        )
        [go_columns]
    )
    go_clean_list.append(df_final)

# Concatenate all sheets into one DataFrame
go_merged = pd.concat(go_clean_list, ignore_index=True)




  warn("Workbook contains no default style, apply openpyxl's default")


In [46]:
# Preview the first rows of the merged GO enrichment table
go_merged.head()

Unnamed: 0,Rank,Term_Name_Clean,GO,P-value,Z-score,Combined_Score,Overlapping genes,Adjusted p-value,Ontology
0,1,Protein Phosphorylation,GO:0006468_BP,2.1961070000000002e-120,10.371111,2857.484956,"ATF2, MAML1, TESK2, LIPE, TBK1, AKT2, PRKACG, ...",1.175357e-116,BP
1,2,Phosphorylation,GO:0016310_BP,2.02375e-88,8.128998,1641.427871,"EPHB6, PANK2, TP53RK, MAML1, TESK2, RPS6KA4, L...",5.415554999999999e-85,BP
2,3,Positive Regulation Of DNA-templated Transcrip...,GO:0045893_BP,1.149844e-78,3.04153,545.839131,"ATF1, ATF2, SPI1, MAML1, TRRAP, HNRNPU, GPATCH...",2.051321e-75,BP
3,4,Positive Regulation Of Nucleic Acid-Templated ...,GO:1903508_BP,8.127226e-68,4.633815,715.834306,"TRRAP, GPATCH3, SOX2, SOX17, KAT5, SOX18, ZMIZ...",1.0874229999999999e-64,BP
4,5,Protein Modification Process,GO:0036211_BP,2.9270069999999997e-63,3.670553,528.518884,"MAML1, ARAF, TESK2, DCAF1, LIPE, TBK1, AKT2, A...",3.133068e-60,BP


In [47]:
# Write the merged GO table to the processed-data folder as CSV
processed_dir = "C:/Users/Macarena Madrid/Desktop/alzheimer-target-prediction/data/processed/raw_cleaned"
go_clean_path = os.path.join(processed_dir, "go_enrichment_clean.csv")
go_merged.to_csv(path_or_buf=go_clean_path, index=False)

# Confirmation: destination path and number of rows written
total_terminos = go_merged.shape[0]
print("Archivo guardado en:", go_clean_path)
print("Total de terminos GO:", total_terminos)


Archivo guardado en: C:/Users/Macarena Madrid/Desktop/alzheimer-target-prediction/data/processed/raw_cleaned\go_enrichment_clean.csv
Total de terminos GO: 6928


### Gráficas para Enriquecimiento

In [67]:
# Reload the cleaned GO enrichment table from disk
data_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\data\processed\raw_cleaned"
go_clean_file = os.path.join(data_dir, "go_enrichment_clean.csv")
go_c = pd.read_csv(go_clean_file)

In [None]:
# Number of GO terms per ontology (BP, MF, CC), saved as PNG.
# fig_dir is defined in this cell, as in the earlier figure cells, so the cell
# does not depend on a value left over from the topology section.
fig_dir = r"C:\Users\Macarena Madrid\Desktop\\alzheimer-target-prediction\results\figures"

fig, ax = plt.subplots(figsize=(6, 4))
go_c["Ontology"].value_counts().plot(kind="bar", color="salmon", ax=ax)
ax.set_title("Cantidad de términos GO por Ontología")
ax.set_ylabel("Frecuencia")  # same y label as the Target_group bar chart
fig.tight_layout()
fig.savefig(os.path.join(fig_dir, "go_terms_by_ontology.png"))
plt.close(fig)  # the figure is only written to disk, not displayed

# BP has 5352 entries, MF 1109 and CC 467.

In [69]:
# Distribution of the adjusted p-values, saved as PNG.
# log_scale=(False, True) keeps the x axis (the p-value) linear and makes only
# the y axis (the bin counts) logarithmic. The x label used to say "(log)",
# which described the wrong axis; the labels now match the plot.
# NOTE: fig_dir is expected to be defined by an earlier figure cell.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(go_c["Adjusted p-value"], bins=50, log_scale=(False, True), ax=ax)
ax.set_title("Distribución de Adjusted p-value")
ax.set_xlabel("Adjusted p-value")
ax.set_ylabel("Frecuencia (escala log)")
fig.tight_layout()
fig.savefig(os.path.join(fig_dir, "adjusted_pvalue_distribution.png"))
plt.close(fig)  # the figure is only written to disk, not displayed


In [70]:
# Word cloud built from the Term_Name_Clean column, saved as PNG.
# NOTE: fig_dir is expected to be defined by an earlier figure cell.
text = " ".join(go_c["Term_Name_Clean"].dropna())

# A fixed random_state makes the word layout reproducible; without it the
# image changes on every run.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="white",
    random_state=42,
).generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("WordCloud de Términos GO")
fig.tight_layout()
fig.savefig(os.path.join(fig_dir, "go_wordcloud.png"))
plt.close(fig)  # the figure is only written to disk, not displayed