In [47]:
import numpy as np
import pandas as pd
import networkx as nx
import itertools
import matplotlib.pyplot as plt
import os

# Import libraries for network analysis
import random

# Community Detection Algorithms
from networkx.algorithms import community
from matplotlib.cm import ScalarMappable

# Create the folder for generated plots if it does not exist yet.
# exist_ok=True avoids the check-then-create race of os.path.exists + os.makedirs.
output_folder_plots = 'plots'
os.makedirs(output_folder_plots, exist_ok=True)

# Create the folder for exported data if it does not exist yet.
output_data_folder = 'output_data'
os.makedirs(output_data_folder, exist_ok=True)

# Load ingredient and compound information.
# There are two lookup tables (compounds and ingredients) connected by a relation table (ingredient_compound).
# The two lookup tables are converted to dicts keyed by the first column of each file.
compounds_file = pd.read_csv(
    'data/compounds.tsv',
    sep='\t',
    index_col=0,
).to_dict('index')
ingredients_file = pd.read_csv(
    'data/ingredients.tsv',
    sep='\t',
    index_col=0,
).to_dict('index')
ingredient_compound_file = pd.read_csv('data/ingredient-compound.tsv', sep='\t')

# Load the recipes:
## Each row corresponds to one recipe.
## The first field is the regional cuisine.
## The remaining fields are the ingredients of the recipe.
## Although this file is a CSV, it is read as tab-separated because the number of comma-separated
## fields is not consistent across rows (presumably each line is then kept as one string - confirm
## the file contains no tab characters).
## NOTE(review): with pandas' default header handling, the first line after the 3 skipped rows is
## consumed as the column header - confirm that line is not a recipe.
recipes_file = pd.read_csv(
    'data/recipes.csv',
    sep='\t',
    skiprows=3,
)

In [2]:
# Helper for quick visual inspection of a graph
def visualize_graph(G, node_labels=None, node_size=50, font_size=8):
    """Draw G with a spring layout and show the figure.

    Parameters:
        G: graph to draw.
        node_labels: optional mapping passed as `labels` to
            nx.draw_networkx_labels; nothing is labelled when it is falsy.
        node_size: marker size forwarded to nx.draw.
        font_size: label font size (only used when node_labels is given).
    """
    # A fixed seed keeps the layout identical between runs.
    layout = nx.spring_layout(G, seed=42)
    draw_options = {
        "with_labels": False,
        "node_size": node_size,
        "edge_color": "gray",
        "linewidths": 0.3,
    }
    nx.draw(G, layout, **draw_options)
    if node_labels:
        nx.draw_networkx_labels(G, layout, labels=node_labels, font_size=font_size)
    plt.show()

In [3]:
## Process functions
### The ingredient_compound file is a table whose values are the ids of ingredients and compounds; we just process it to use the names
def getBipartiteIngredientCompoundNetworkDF(ingredient_compound_file, ingredients_lookup=None, compounds_lookup=None):
  """Translate the ingredient/compound id relation into a table of names.

  Parameters:
    ingredient_compound_file: DataFrame with the columns '# ingredient id'
      and 'compound id'.
    ingredients_lookup: dict mapping an ingredient id to a record containing
      'ingredient name'. Defaults to the notebook-level `ingredients_file`.
    compounds_lookup: dict mapping a compound id to a record containing
      'Compound name'. Defaults to the notebook-level `compounds_file`.

  Returns:
    DataFrame with the columns 'Ingredient' and 'Compound', indexed like the
    input relation.

  Raises:
    KeyError: if an id of the relation is missing from its lookup table.
  """
  # Fall back to the notebook globals so existing one-argument calls keep working.
  if ingredients_lookup is None:
    ingredients_lookup = ingredients_file
  if compounds_lookup is None:
    compounds_lookup = compounds_file

  ingredient_names = ingredient_compound_file['# ingredient id'].apply(
    lambda ingredient_id: ingredients_lookup[ingredient_id]['ingredient name'])
  compound_names = ingredient_compound_file['compound id'].apply(
    lambda compound_id: compounds_lookup[compound_id]['Compound name'])
  return pd.DataFrame({'Ingredient': ingredient_names, 'Compound': compound_names})

### The recipes_file is processed to obtain a DataFrame whose columns are the cuisine (European, Latin, ...) and the list of ingredients of the recipe
def processRecipesFile(recipes_file):
  """Split each raw recipe line into its cuisine and its ingredient list.

  Parameters:
    recipes_file: single-column DataFrame whose values are comma-separated
      strings of the form 'cuisine,ingredient1,ingredient2,...'.

  Returns:
    A new DataFrame (the input is not modified) with the columns 'cousine'
    (first comma-separated field; the column name keeps its original spelling)
    and 'ingredients' (list of the remaining fields).
  """
  table = recipes_file.copy()
  table.columns = ['cousine']
  # Split every line once and reuse the tokens for both output columns.
  tokens = table['cousine'].apply(lambda line: line.split(','))
  table['ingredients'] = tokens.apply(lambda parts: parts[1:])
  table['cousine'] = tokens.apply(lambda parts: parts[0])
  return table

### Function to get the set of ingredients of a recipes DataFrame. It works for a single recipe or any subset of recipes and returns the ingredients they contain.
def getIngredientsSet(recipes):
  """Return the set of all ingredients appearing in recipes.ingredients.

  Parameters:
    recipes: DataFrame with an 'ingredients' column holding one iterable of
      ingredient names per recipe.

  Returns:
    A set with every distinct ingredient found.
  """
  found = set()
  for ingredient_list in recipes.ingredients:
    for ingredient in ingredient_list:
      found.add(ingredient)
  return found

### Function to get the pairs of ingredients needed to compute the number of compounds shared by two ingredients
def getPairsOfIngredients(df):
  """Build every unordered pair of values from the 'Ingredient' column.

  Parameters:
    df: DataFrame with an 'Ingredient' column.

  Returns:
    DataFrame with the columns 'Ingredient1ID' and 'Ingredient2ID', one row per
    2-combination, in the order produced by itertools.combinations.
  """
  ingredient_values = list(df['Ingredient'])
  rows = []
  for first, second in itertools.combinations(ingredient_values, 2):
    rows.append([first, second])
  return pd.DataFrame(rows, columns=['Ingredient1ID', 'Ingredient2ID'])

def getPrevalenceOfIngredients(recipes):
  """Compute, for each ingredient, the fraction of recipes that contain it.

  Parameters:
    recipes: DataFrame with an 'ingredients' column holding one iterable of
      ingredient names per recipe.

  Returns:
    dict mapping ingredient name -> (number of recipes containing it) /
    (total number of recipes). Empty when there are no ingredients.
  """
  # Count the number of recipes that contain each ingredient.
  ingredient_counts = {}
  for ingredients in recipes['ingredients']:
    # An ingredient listed twice in the same recipe must count once, otherwise
    # the prevalence could exceed 1. dict.fromkeys de-duplicates while keeping
    # the first-seen order.
    for ingredient in dict.fromkeys(ingredients):
      ingredient_counts[ingredient] = ingredient_counts.get(ingredient, 0) + 1

  # Calculate the prevalence of each ingredient.
  total_recipes = len(recipes)
  return {ingredient: count / total_recipes for ingredient, count in ingredient_counts.items()}


In [4]:
### Function to build the dataset whose columns are Ingredient1ID, Ingredient2ID and the number of compounds shared by the two ingredients
def getFlavorNetworkDF(ingredient_compound_file, ingredients_file):
  """Build the flavor network edge list: ingredient pairs sharing compounds.

  Parameters:
    ingredient_compound_file: DataFrame with the columns '# ingredient id'
      and 'compound id'.
    ingredients_file: dict mapping an ingredient id to a record containing
      'ingredient name'.

  Returns:
    DataFrame with the columns 'Ingredient1ID', 'Ingredient2ID',
    'NumSharedComponents', 'Ingredient1' and 'Ingredient2'. Only pairs sharing
    at least one compound are kept. Ingredient ids are paired in ascending
    order, and each row's index is the position of its pair among all
    2-combinations of the sorted ids.

  Raises:
    KeyError: if an ingredient id is missing from ingredients_file.
  """
  # Set of compound ids for every ingredient id.
  compound_groups = dict(ingredient_compound_file.groupby('# ingredient id')['compound id'].apply(set))
  ingredient_ids = sorted(compound_groups)

  # Walk the pairs once and keep only those that share compounds. This avoids a
  # row-wise DataFrame.apply over every pair and also handles inputs with fewer
  # than two ingredients.
  positions = []
  rows = []
  for position, (id1, id2) in enumerate(itertools.combinations(ingredient_ids, 2)):
    num_shared = len(compound_groups[id1] & compound_groups[id2])
    if num_shared > 0:
      positions.append(position)
      rows.append((
        id1,
        id2,
        num_shared,
        ingredients_file[id1]['ingredient name'],
        ingredients_file[id2]['ingredient name'],
      ))

  return pd.DataFrame(
    rows,
    index=positions,
    columns=['Ingredient1ID', 'Ingredient2ID', 'NumSharedComponents', 'Ingredient1', 'Ingredient2'],
  )

def filterIngredientsFromRecipes(df, ingredients_set):
  """Keep only the rows whose two ingredients both belong to ingredients_set.

  Parameters:
    df: DataFrame with the columns 'Ingredient1' and 'Ingredient2'.
    ingredients_set: collection of allowed ingredient names.

  Returns:
    A new DataFrame with the matching rows; the input is left untouched.
  """
  first_allowed = df['Ingredient1'].isin(ingredients_set)
  second_allowed = df['Ingredient2'].isin(ingredients_set)
  return df[first_allowed & second_allowed]

In [5]:
def extractBackbone(G, alpha, weight_attribute='weight'):
  """Extract the backbone of a weighted graph by keeping only significant edges.

  For every node with more than one neighbor, each incident edge gets the score
  1 - (1 - w_ij / s_i) ** (k_i - 1), where w_ij is the edge weight, s_i the sum
  of the node's incident weights and k_i its number of neighbors. The edge is
  kept when the score exceeds 1 - alpha for at least one of its endpoints.
  NOTE(review): this looks like the disparity filter significance test - confirm.

  Parameters:
    G: graph whose edges carry `weight_attribute`.
    alpha: significance level; smaller values keep fewer edges.
    weight_attribute: name of the edge attribute holding the weight.

  Returns:
    A new nx.Graph containing the retained edges with all their attributes.
    Nodes without any retained edge are not present.
  """
  backbone = nx.Graph()
  for node in G.nodes:
    adjacent = list(G.neighbors(node))
    degree = len(adjacent)
    # Nodes with a single neighbor (or none) are never tested from their side.
    if degree <= 1:
      continue

    # Total weight incident to this node, used to normalize each edge weight.
    strength = sum(G[node][other][weight_attribute] for other in adjacent)
    threshold = 1 - alpha
    for other in adjacent:
      edge_data = G[node][other]
      share = edge_data[weight_attribute] / strength
      score = 1 - (1 - share)**(degree - 1)
      if score > threshold:
        backbone.add_edge(node, other, **edge_data)
  return backbone

In [6]:
def getNodesFromBackbone(G_df, ingredients_file, recipes):
  """Build the node table for the ingredients that appear in the backbone edges.

  Parameters:
    G_df: edge DataFrame with the columns 'Ingredient1ID' and 'Ingredient2ID'.
    ingredients_file: dict mapping an ingredient id to a record containing
      'ingredient name' and 'category'.
    recipes: recipes DataFrame, forwarded to getPrevalenceOfIngredients.

  Returns:
    DataFrame with the columns 'ID', 'Ingredient', 'Category' and 'Prevalence',
    one row per distinct ingredient id, sorted by id. The columns are present
    even when there are no edges.

  Raises:
    KeyError: if an id is missing from ingredients_file, or an ingredient name
      has no prevalence (i.e. it does not occur in `recipes`).
  """
  # Distinct ingredient ids on either end of an edge, sorted so that the row
  # order does not depend on set iteration order.
  ingredient_ids = sorted(set(G_df['Ingredient1ID']) | set(G_df['Ingredient2ID']))

  ingredient_prevalence_dict = getPrevalenceOfIngredients(recipes)

  # Collect plain records and build the DataFrame once; concatenating one-row
  # frames inside the loop is quadratic.
  records = []
  for ingredient_id in ingredient_ids:
    info = ingredients_file[ingredient_id]
    ingredient_name = info['ingredient name']
    records.append({
      'ID': ingredient_id,
      'Ingredient': ingredient_name,
      'Category': info['category'],
      'Prevalence': ingredient_prevalence_dict[ingredient_name],
    })
  return pd.DataFrame(records, columns=['ID', 'Ingredient', 'Category', 'Prevalence'])

In [7]:
#bipartite_ingredient_compound_network_df = getBipartiteIngredientCompoundNetworkDF(ingredient_compound_file)
# The full flavor network is built in its own cell because it can take a long time.
full_flavor_network_df = getFlavorNetworkDF(
    ingredient_compound_file=ingredient_compound_file,
    ingredients_file=ingredients_file,
)

In [8]:
# Parse the recipes and restrict the flavor network to ingredients that occur in them.
recipes = processRecipesFile(recipes_file)
ingredients_from_recipes = getIngredientsSet(recipes)
recipes_flavor_network_df = filterIngredientsFromRecipes(
    df=full_flavor_network_df,
    ingredients_set=ingredients_from_recipes,
)

In [9]:
# Build the weighted ingredient graph; every remaining column becomes an edge attribute.
G = nx.from_pandas_edgelist(
    recipes_flavor_network_df,
    source='Ingredient1',
    target='Ingredient2',
    edge_attr=True,
)
G_backbone = extractBackbone(G, alpha=0.04, weight_attribute='NumSharedComponents')

G_backbone_df = nx.to_pandas_edgelist(G_backbone)
nodes = getNodesFromBackbone(G_backbone_df, ingredients_file, recipes)

## Export to Gephi: load edges.csv and nodes.csv separately
edges = G_backbone_df[['Ingredient1ID', 'Ingredient2ID', 'NumSharedComponents']].rename(
    columns={
        'Ingredient1ID': 'Source',
        'Ingredient2ID': 'Target',
        'NumSharedComponents': 'Weight',
    }
)
edges.to_csv('edges.csv', index=False)
nodes.to_csv('nodes.csv', index=False)

In [None]:
# Read the node and edge tables written by the export cell (nodes.csv / edges.csv)
nodes_df = pd.read_csv("nodes.csv")
edges_df = pd.read_csv("edges.csv")

# Create an empty undirected graph (this rebinds the name G used for the full network above)
G = nx.Graph()

# Add one node per row, keyed by the ingredient ID; Prevalence is stored as a string
for index, row in nodes_df.iterrows():
    G.add_node(row['ID'], Ingredient=row['Ingredient'], Category=row['Category'], Prevalence=str(row['Prevalence']))

# Add one edge per row of the edge table
# NOTE(review): the weight is stored as a string, so it cannot be used numerically
# by weighted algorithms on this graph - confirm this is intended for the GML export
for index, row in edges_df.iterrows():
    G.add_edge(row['Source'], row['Target'], weight=str(row['Weight']))

# Save the graph as a GML file for visualization in Gephi or similar programs
nx.write_gml(G, "graph.gml")

In [None]:
# Function to get random pairs of nodes that are connected by a path - Shortest path
def get_random_connected_pairs(G, num_pairs=3, seed=None):
    """Randomly pick ordered pairs of distinct nodes that are joined by a path.

    Pairs are drawn by rejection sampling, so the same pair may appear more
    than once in the result.

    Parameters:
        G: graph to sample from.
        num_pairs: number of pairs to return; values <= 0 give an empty list.
        seed: optional seed for a private random.Random generator, making the
            selection reproducible. When None, the module-level `random` state
            is used.

    Returns:
        List of (source, target) tuples of length num_pairs.

    Raises:
        ValueError: if the graph has no edge between two distinct nodes, in
            which case no connected pair exists and sampling would never end.
    """
    if num_pairs <= 0:
        return []

    nodes = list(G.nodes)

    # Any edge between two distinct nodes guarantees at least one connected
    # pair; without one the rejection loop below could never terminate.
    if not any(u != v for u, v in G.edges()):
        raise ValueError("Graph has no pair of distinct connected nodes to sample from")

    rng = random if seed is None else random.Random(seed)

    connected_pairs = []
    while len(connected_pairs) < num_pairs:
        source, target = rng.sample(nodes, 2)
        if nx.has_path(G, source, target):
            connected_pairs.append((source, target))

    return connected_pairs

# Pick three random pairs of connected ingredients
connected_pairs = get_random_connected_pairs(G, num_pairs=3)
print("Randomly selected connected pairs:", connected_pairs)

# Pathfinding: print the shortest path for every selected pair
for source, target in connected_pairs:
    shortest_path = nx.shortest_path(G, source=source, target=target)
    print(f"Shortest path between {source} and {target}: {shortest_path}")

# Collect the node attributes of both endpoints of every pair into one table
data = []
for source, target in connected_pairs:
    source_data, target_data = G.nodes[source], G.nodes[target]
    data.append(dict(
        source_id=source,
        source_ingredient=source_data['Ingredient'],
        source_category=source_data['Category'],
        source_prevalence=source_data['Prevalence'],
        target_id=target,
        target_ingredient=target_data['Ingredient'],
        target_category=target_data['Category'],
        target_prevalence=target_data['Prevalence'],
    ))

df = pd.DataFrame(data)
print(df)


In [None]:
# Library used to find the image files to remove
import glob

# Function to remove the .png files of a folder
def remove_png_files(folder):
    """Delete every file matching '*.png' directly inside `folder`.

    Nothing happens when the folder does not exist or holds no .png files.
    """
    pattern = os.path.join(folder, '*.png')
    for path in glob.glob(pattern):
        os.remove(path)
        
# Function to plot the shortest path between two nodes and save it as a PNG - Shortest path
def plot_and_save_shortest_path(G, source, target, output_folder, layout_seed=42):
    """Draw the shortest path between source and target and save it as a PNG.

    The figure shows the subgraph induced by the nodes of the shortest path.
    Path nodes are light blue; the two endpoints are redrawn in red.

    Parameters:
        G: graph containing both nodes.
        source: start node of the path.
        target: end node of the path.
        output_folder: folder receiving 'shortest_path_<source>_<target>.png';
            it is created when missing.
        layout_seed: seed for the spring layout so the picture is reproducible;
            pass None for a different layout on every call.
    """
    path_nodes = nx.shortest_path(G, source=source, target=target)
    subgraph = G.subgraph(path_nodes)

    # Explicit figure/axes so that nothing depends on pyplot's global state.
    fig, ax = plt.subplots(figsize=(8, 6))
    pos = nx.spring_layout(subgraph, seed=layout_seed)
    nx.draw(subgraph, pos, ax=ax, node_color="lightblue", node_size=300, with_labels=True, font_size=10)

    # Draw source and target nodes in red
    nx.draw_networkx_nodes(subgraph, pos, ax=ax, nodelist=[source, target], node_color='r', node_size=300)

    ax.set_axis_off()
    ax.set_title(f"Shortest Path between {source} and {target}")

    # Proxy handles whose colours match what is drawn above. They are passed via
    # handles= only: mixing positional handles with a labels= keyword makes
    # matplotlib warn and discard the positional handles.
    ax.legend(handles=[
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightblue', markersize=10, label='Path Nodes'),
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=10, label='Source/Target Nodes'),
        plt.Line2D([0], [0], color='black', lw=2, label='Links'),
    ])

    fig.suptitle("Shortest Path from Node {} to Node {}".format(source, target))

    # exist_ok=True avoids the check-then-create race.
    os.makedirs(output_folder, exist_ok=True)
    fig.savefig(os.path.join(output_folder, f"shortest_path_{source}_{target}.png"), format="png")
    # Close the figure so that repeated calls do not accumulate open figures.
    plt.close(fig)



# Remove the .png files left in the output folder by previous runs
output_folder = "output_folder_paths"
remove_png_files(folder=output_folder)

# Save one shortest-path figure per selected pair
for source, target in connected_pairs:
    plot_and_save_shortest_path(G, source=source, target=target, output_folder=output_folder)