In [1]:
# !pip install graphdatascience

In [2]:
# Need to create user first

import os

# Connection details. Every value can be overridden through an environment
# variable so that real credentials never have to be saved in the notebook;
# the fallbacks are the original placeholder values.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")  # Neo4j Bolt URI
username = os.environ.get("NEO4J_USERNAME", "username")     # Neo4j username
password = os.environ.get("NEO4J_PASSWORD", "password")     # Neo4j password
db_name = os.environ.get("NEO4J_DATABASE", "nutrition")     # Database to work on

In [3]:
from graphdatascience import GraphDataScience
import pandas as pd

# Connect to the Neo4j database through the GDS client library.
# The user must already exist (add it with CREATE USER in Neo4j Browser).
gds = GraphDataScience(uri, auth=(username, password), database=db_name)

# Should do this one in the conf file:
# dbms.security.procedures.unrestricted=jwt.security.*,gds.*
# dbms.security.procedures.allowlist=apoc.coll.*,apoc.load.*,gds.*

# Check the installed GDS version on the server.
# Ask once and reuse the answer for both the printout and the sanity check.
gds_version = gds.version()
print(gds_version)
assert gds_version, "GDS did not report a version - check that the GDS plugin is installed"



2.7.0


### Formatting Pandas display

In [4]:
# Lift pandas' display limits (columns, rows, cell width) so nothing is truncated
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

### Defining the Pipeline

Here we are creating a pipeline with the name “pipe”. Each component that we add must be in the exact order that we want it to be inside the pipeline.

In [5]:
# Running a Cypher query through the GDS library gives the output back as a
# DataFrame, which can be kept in a variable and inspected further.

# Name under which the pipeline is registered
pipeline_name = 'pipe'

# Fetch the names of the pipelines that already exist
list_pipelines_query = "CALL gds.pipeline.list() YIELD pipelineName"
existing_pipelines = gds.run_cypher(list_pipelines_query)

existing_pipelines

Unnamed: 0,pipelineName


In [6]:
# Drop a pipeline already registered under `pipeline_name` so this cell can be re-run.
# The name is passed as a query parameter, so the existence check, the drop and
# the create all refer to the same pipeline even if `pipeline_name` is changed.
if pipeline_name in existing_pipelines['pipelineName'].values:
    try:
        gds.run_cypher("CALL gds.pipeline.drop($name)", params={'name': pipeline_name})
        print("Existing pipeline dropped.")
    except Exception as e:
        print(f"Error dropping existing pipeline: {e}")

# Create the (still empty) link prediction pipeline
result = gds.run_cypher(
    "CALL gds.beta.pipeline.linkPrediction.create($name)",
    params={'name': pipeline_name},
)
print("Pipeline created successfully.")
result

Pipeline created successfully.


Unnamed: 0,name,nodePropertySteps,featureSteps,splitConfig,autoTuningConfig,parameterSpace
0,pipe,[],[],"{'testFraction': 0.1, 'validationFolds': 3, 'trainFraction': 0.1, 'negativeSamplingRatio': 1.0}",{'maxTrials': 10},"{'MultilayerPerceptron': [], 'RandomForest': [], 'LogisticRegression': []}"


### Adding node properties

Here we add the step that computes node embeddings with the Fast Random Projection (FastRP) algorithm. Each embedding vector has length 256 and is stored in the projected graph under the property name “embedding”.

In [7]:
# Add a FastRP node-property step to 'pipe': 256-dimensional embeddings,
# written to the 'embedding' property, with randomSeed 42.
cypher_query = """
CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
  mutateProperty: 'embedding',
  embeddingDimension: 256,
  randomSeed: 42
})
"""

# Reset first: if the call fails, the cell must not display the `result`
# left over from a previously executed cell.
result = None
try:
    result = gds.run_cypher(cypher_query)
except Exception as e:
    print(f"An error occurred while adding node property: {e}")
    print()

result

Unnamed: 0,name,nodePropertySteps,featureSteps,splitConfig,autoTuningConfig,parameterSpace
0,pipe,"[{'name': 'gds.fastRP.mutate', 'config': {'randomSeed': 42, 'mutateProperty': 'embedding', 'contextRelationshipTypes': [], 'embeddingDimension': 256, 'contextNodeLabels': []}}]",[],"{'testFraction': 0.1, 'validationFolds': 3, 'trainFraction': 0.1, 'negativeSamplingRatio': 1.0}",{'maxTrials': 10},"{'MultilayerPerceptron': [], 'RandomForest': [], 'LogisticRegression': []}"


### Concatenating the node properties
If we have any numerical property (like age, income, etc.) that we want to use alongside the node embeddings, we must list it here. We use the HADAMARD link feature, which combines the listed properties of the two nodes of a pair by element-wise multiplication (it does not concatenate them). Keep in mind that, as of now, GDS only supports numerical properties. If we want to use categorical properties, we should store them as a one-hot encoded vector and then add the name of that property here.


In [8]:
# Add a HADAMARD link-feature step to 'pipe' that is computed from the
# 'embedding' node property.
cypher_query = """
CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
  nodeProperties: ['embedding']
}) YIELD featureSteps
"""

# Reset first: if the call fails, the cell must not display the `result`
# left over from a previously executed cell.
result = None
try:
    result = gds.run_cypher(cypher_query)
except Exception as e:
    print(f"An error occurred while adding feature: {e}")
    print()

result

Unnamed: 0,featureSteps
0,"[{'name': 'HADAMARD', 'config': {'nodeProperties': ['embedding']}}]"


### Configuring the pipeline split
Here we split the data into train and test sets (with 3 validation folds) so that we can evaluate the model's performance.


In [9]:
# Configure how 'pipe' splits the relationships: test fraction 0.25,
# train fraction 0.6, and 3 validation folds.
cypher_query = """
CALL gds.beta.pipeline.linkPrediction.configureSplit('pipe', {
  testFraction: 0.25,
  trainFraction: 0.6,
  validationFolds: 3
})
YIELD splitConfig
"""

# Reset first: if the call fails, the cell must not display the `result`
# left over from a previously executed cell.
result = None
try:
    result = gds.run_cypher(cypher_query)
except Exception as e:
    print(f"An error occurred while configuring split: {e}")
    print()

result

Unnamed: 0,splitConfig
0,"{'testFraction': 0.25, 'validationFolds': 3, 'trainFraction': 0.6, 'negativeSamplingRatio': 1.0}"


### Adding the Machine Learning model

At the time of writing, Logistic Regression is the only model in GDS that can potentially be used in production, so that is the model I use here. Random Forest and MLP models are also available.

In [10]:
# Add a Logistic Regression candidate with default hyperparameters to 'pipe'
cypher_query = """
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe')
YIELD parameterSpace
"""

# Reset first: if the call fails, the cell must not display the `result`
# left over from a previously executed cell.
result = None
try:
    result = gds.run_cypher(cypher_query)
except Exception as e:
    print(f"An error occurred while adding logistic regression model: {e}")
    print()

result

Unnamed: 0,parameterSpace
0,"{'MultilayerPerceptron': [], 'RandomForest': [], 'LogisticRegression': [{'minEpochs': 1, 'maxEpochs': 100, 'focusWeight': 0.0, 'patience': 1, 'tolerance': 0.001, 'learningRate': 0.001, 'batchSize': 100, 'penalty': 0.0, 'methodName': 'LogisticRegression', 'classWeights': []}]}"


In [11]:
# Add a second Logistic Regression candidate: maxEpochs 800 and a penalty
# given as a range (1e-4 to 1e2) instead of a single fixed value.
cypher_query = """
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {maxEpochs: 800, penalty: {range: [1e-4, 1e2]}})
YIELD parameterSpace
RETURN parameterSpace.LogisticRegression AS logisticRegressionSpace
"""

# Reset first: if the call fails, the cell must not display the `result`
# left over from a previously executed cell.
result = None
try:
    result = gds.run_cypher(cypher_query)
except Exception as e:
    print(f"An error occurred while adding logistic regression model with hyperparameters: {e}")
    print()

result

Unnamed: 0,logisticRegressionSpace
0,"[{'minEpochs': 1, 'maxEpochs': 100, 'focusWeight': 0.0, 'patience': 1, 'tolerance': 0.001, 'learningRate': 0.001, 'batchSize': 100, 'penalty': 0.0, 'methodName': 'LogisticRegression', 'classWeights': []}, {'minEpochs': 1, 'maxEpochs': 800, 'focusWeight': 0.0, 'patience': 1, 'tolerance': 0.001, 'learningRate': 0.001, 'batchSize': 100, 'penalty': {'range': [0.0001, 100.0]}, 'methodName': 'LogisticRegression', 'classWeights': []}]"


### Configuring Auto-Tuning

In order to find good models, the pipeline supports automatically tuning the parameters of the training algorithm. Optionally, the procedure described below can be used to configure the auto-tuning behavior. Otherwise, default auto-tuning configuration is used.

In [12]:
# Limit auto-tuning of 'pipe' to 3 trials
cypher_query = """
CALL gds.alpha.pipeline.linkPrediction.configureAutoTuning('pipe', {
  maxTrials: 3
}) YIELD autoTuningConfig
"""

# Reset first: if the call fails, the cell must not display the `result`
# left over from a previously executed cell.
result = None
try:
    result = gds.run_cypher(cypher_query)
except Exception as e:
    print(f"An error occurred while configuring auto-tuning: {e}")
    print()

result

Unnamed: 0,autoTuningConfig
0,{'maxTrials': 3}


### Projecting the Graph from Neo4j Database

Now this is a very important step. To use the data in our database, we don’t need to work on the database directly. We can project the graph into memory and make changes to the projected graph data (called “mutate” in the GDS library). These changes won’t be reflected in the database unless we write them back, leaving us room to experiment.

It is important to define the orientation of the graph as “UNDIRECTED”, irrespective of whatever direction it may have in the original graph.

In [13]:
# The name of our projected graph is recommendationGraph.
# It contains the User and Ingredient nodes and the LIKES / DISLIKES relationships.
# It is important to define the orientation of the graph as “UNDIRECTED”, irrespective of whatever direction it may have in the original graph
cypher_query = """
CALL gds.graph.project(
  'recommendationGraph',
  {
    User: {
      label: 'User'
    },
    Ingredient: {
      label: 'Ingredient'
    }
  },
  {
    LIKES: {
      type: 'LIKES',
      orientation: 'UNDIRECTED'
    },
    DISLIKES: {
      type: 'DISLIKES',
      orientation: 'UNDIRECTED'
    }
  }
)
"""

# Reset first: if the call fails, the cell must not display the `result`
# left over from a previously executed cell.
result = None
try:
    # A projection left over from an earlier run would make gds.graph.project
    # fail, so drop it first; `false` (failIfMissing) makes the drop a no-op
    # when no such graph exists.
    gds.run_cypher("CALL gds.graph.drop('recommendationGraph', false)")
    result = gds.run_cypher(cypher_query)
except Exception as e:
    print(f"An error occurred while projecting the graph: {e}")
    print()

result

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
0,"{'User': {'label': 'User', 'properties': {}}, 'Ingredient': {'label': 'Ingredient', 'properties': {}}}","{'DISLIKES': {'aggregation': 'DEFAULT', 'orientation': 'UNDIRECTED', 'indexInverse': False, 'properties': {}, 'type': 'DISLIKES'}, 'LIKES': {'aggregation': 'DEFAULT', 'orientation': 'UNDIRECTED', 'indexInverse': False, 'properties': {}, 'type': 'LIKES'}}",recommendationGraph,3096,252,785


### Training the model for LIKES

Now we are ready to actually train a LinkPrediction model. We must make sure to specify the targetRelationshipType to instruct the model to train only using that type. This matters here, because recommendationGraph contains both LIKES and DISLIKES relationships.

In [14]:
# Here, we first pass the name of our projected graph, then a DICTIONARY object
# with the training parameters. The RETURN clause extracts the winning model's
# parameters and its AUCPR scores from the training result.
cypher_query = """
CALL gds.beta.pipeline.linkPrediction.train('recommendationGraph', {
  pipeline: 'pipe',
  modelName: 'lp-pipeline-model',
  metrics: ['AUCPR', 'OUT_OF_BAG_ERROR'],
  targetRelationshipType: 'LIKES',
  randomSeed: 42
}) YIELD modelInfo, modelSelectionStats
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.train.avg AS avgTrainScore,
  modelInfo.metrics.AUCPR.outerTrain AS outerTrainScore,
  modelInfo.metrics.AUCPR.test AS testScore,
  [cand IN modelSelectionStats.modelCandidates | cand.metrics.AUCPR.validation.avg] AS validationScores
"""

# Reset first: if the call fails, the cell must not display the `result`
# left over from a previously executed cell.
result = None
try:
    # Training fails when a model with this name already exists, so drop a
    # model left over from an earlier run; `false` (failIfMissing) makes the
    # drop a no-op when there is none.
    gds.run_cypher("CALL gds.model.drop('lp-pipeline-model', false)")
    result = gds.run_cypher(cypher_query)
except Exception as e:
    print(f"An error occurred while training link prediction model: {e}")
    print()

result

Unnamed: 0,winningModel,avgTrainScore,outerTrainScore,testScore,validationScores
0,"{'minEpochs': 1, 'maxEpochs': 100, 'focusWeight': 0.0, 'patience': 1, 'tolerance': 0.001, 'learningRate': 0.001, 'batchSize': 100, 'penalty': 0.0, 'methodName': 'LogisticRegression', 'classWeights': []}",0.864892,0.864865,0.797619,"[0.8177643843676452, 0.8177643843676452, 0.8177643843676452, 0.6235694740042567]"


### Using the model for inferencing

For inference, we store (mutate) the predicted links in the projected graph and then use a simple query to retrieve only the newly predicted links. This way, we can retrain the model periodically and keep the results in the projected graph without writing anything back to the database.

In [15]:
# Notes on the prediction parameters used below:
# - threshold filters the predictions by probability; only predictions with a
#   probability > threshold are returned.
# - topN would give us 24,000 predictions (as we defined our relationship as UNDIRECTED).

# Names of the trained model and of the projected graph it is applied to
model, graph = 'lp-pipeline-model', 'recommendationGraph'

def mutate(graph_name, **kwargs):
    """Run link prediction in `mutate` mode on a projected graph.

    Args:
        graph_name: Name of the projected graph to which the predicted
            relationships are added.
        **kwargs: Entries of the configuration map passed to the procedure
            (for example modelName, mutateRelationshipType, topN, threshold).

    Returns:
        Whatever `gds.run_cypher` returns for the procedure call, so the
        caller can inspect the outcome; None if the call raised (the error is
        printed instead of propagated).
    """
    try:
        # kwargs is already a fresh dict, so it is used directly as the config map
        return gds.run_cypher(
            """
            CALL gds.beta.pipeline.linkPrediction.predict.mutate($graph_name, $config)
            """,
            params={"graph_name": graph_name, "config": kwargs},
        )
    except Exception as e:
        print(f"An error occurred while mutating the graph: {e}")
        print()
        return None

# Predict LIKES links with the trained model and add them to the projected
# graph under the relationship type 'LIKES_PREDICTED'
prediction_config = {
    'modelName': model,
    'relationshipTypes': ['LIKES'],
    'mutateRelationshipType': 'LIKES_PREDICTED',
    'topN': 12000,
    'threshold': 0.4,
}
mutate(graph_name=graph, **prediction_config)

In [16]:
# Stream the LIKES_PREDICTED relationships of the projected graph together
# with the id / name properties of both end nodes
predicted_links_query = """
CALL gds.graph.relationships.stream($graphName)
YIELD sourceNodeId, targetNodeId, relationshipType
WHERE relationshipType='LIKES_PREDICTED'
RETURN gds.util.asNode(sourceNodeId).id AS source_id, gds.util.asNode(sourceNodeId).name AS source_name,
       gds.util.asNode(targetNodeId).id AS target_id, gds.util.asNode(targetNodeId).name AS target_name,
       relationshipType
ORDER BY source_id ASC, target_id ASC
"""

# Keep only the rows whose source has an `id` and whose target has none
df = (
    gds.run_cypher(predicted_links_query, params={'graphName': graph})
    .dropna(subset=['source_id'])
    .loc[lambda links: links['target_id'].isna()]
)

# Saving as csv here so that you can look at the predicted results
df.to_csv('predicted.csv', index=False)
df

Unnamed: 0,source_id,source_name,target_id,target_name,relationshipType
12,1.0,Ch,,fish,LIKES_PREDICTED
13,1.0,Ch,,flatbreads,LIKES_PREDICTED
14,1.0,Ch,,cereal,LIKES_PREDICTED
15,1.0,Ch,,lettuce,LIKES_PREDICTED
16,1.0,Ch,,Tia Maria,LIKES_PREDICTED
17,1.0,Ch,,shrimp,LIKES_PREDICTED
18,1.0,Ch,,mini chocolate egg,LIKES_PREDICTED
19,1.0,Ch,,guava,LIKES_PREDICTED
20,1.0,Ch,,pomelo,LIKES_PREDICTED
21,1.0,Ch,,octopus,LIKES_PREDICTED


### Suggesting nutrient foods

In [17]:
# Map every source_id to the list of target names predicted for it
predicted_names_by_user = df.groupby('source_id')['target_name']
predicted_likes_dict = predicted_names_by_user.apply(list).to_dict()
predicted_likes_dict

{1.0: ['fish',
  'flatbreads',
  'cereal',
  'lettuce',
  'Tia Maria',
  'shrimp',
  'mini chocolate egg',
  'guava',
  'pomelo',
  'octopus',
  'whey protein',
  'durian',
  'tomato',
  'chicken breast',
  'pineapple',
  'spinach',
  'strawberry',
  'salmon',
  'chocolate',
  'pork',
  'carrot',
  'cucumber',
  'peach',
  'chicken',
  'blueberry',
  'orange',
  'cherry',
  'kale',
  'mango',
  'salad',
  'banana',
  'apple',
  'steak',
  'passion fruit',
  'dried fruit',
  'crab',
  'tuna',
  'mustard',
  'grapefruit',
  'grape',
  'corn',
  'oats',
  'pea'],
 2.0: ['tomato',
  'egg',
  'pineapple',
  'spinach',
  'strawberry',
  'salmon',
  'chocolate',
  'carrot',
  'cucumber',
  'peach',
  'chicken',
  'blueberry',
  'orange',
  'cherry',
  'kale',
  'mango',
  'salad',
  'ice cream',
  'banana',
  'apple',
  'steak',
  'passion fruit',
  'crab',
  'tuna',
  'mustard',
  'grapefruit',
  'cake',
  'grape',
  'corn',
  'pea',
  'fish',
  'flatbreads',
  'lettuce',
  'Tia Maria',
  's

In [18]:
def get_user_recipes(user_id, predicted_liked_ingredients):
    """Return up to 20 recipes for one user, ranked by how many liked ingredients they contain.

    The query collects the user's liked, disliked and allergic ingredient
    names and diet types, keeps the recipes whose diet type is one of the
    user's diet types and that contain no disliked or allergic ingredient,
    and orders them by the number of (actually or predicted) liked
    ingredients, descending, then by total ingredient count, ascending.

    Args:
        user_id: Value of the `id` property of the User node.
        predicted_liked_ingredients: Ingredient names predicted as liked for
            this user; None is treated as an empty list.

    Returns:
        A list of dicts, one per returned recipe, keyed by the column names
        of the RETURN clause.
    """
    cypher_query = """
    MATCH (u:User {id: $userId})
    OPTIONAL MATCH (u)-[:LIKES]->(liked:Ingredient)
    OPTIONAL MATCH (u)-[:DISLIKES]->(disliked:Ingredient)
    OPTIONAL MATCH (u)-[:ALLERGIC_TO]->(allergic:Ingredient)
    OPTIONAL MATCH (u)-[:HAS_DIET]->(d:DietType)
    WITH 
        u,  // Include u here so it is in scope
        // The chained OPTIONAL MATCHes multiply rows, so collect DISTINCT names
        COLLECT(DISTINCT liked.name) AS likedIngredients,
        COLLECT(DISTINCT disliked.name) AS dislikedIngredients,
        COLLECT(DISTINCT allergic.name) AS allergicIngredients,
        COLLECT(DISTINCT d.name) AS dietTypes,
        $predictedLikedIngredients AS predictedLikedIngredients

    // Combine actual and predicted liked ingredients
    WITH 
        u,
        likedIngredients + predictedLikedIngredients AS allLikedIngredients,
        dislikedIngredients,
        allergicIngredients,
        dietTypes

    MATCH (r:Recipe)-[:DIET_TYPE]->(rd:DietType)
    WHERE rd.name IN dietTypes  // Ensure the diet type is in the user's diet types
        AND none(i IN dislikedIngredients WHERE exists((r)-[:CONTAINS_INGREDIENT]->(:Ingredient {name: i})))
        AND none(i IN allergicIngredients WHERE exists((r)-[:CONTAINS_INGREDIENT]->(:Ingredient {name: i})))
    WITH 
        u,
        r, 
        rd,
        [(r)-[:CONTAINS_INGREDIENT]->(i:Ingredient) | i.name] AS ingredients,
        allLikedIngredients
    WITH
        u,
        r, 
        rd,
        ingredients,
        [i IN ingredients WHERE i IN allLikedIngredients] AS matchedLikedIngredients,
        size(ingredients) AS numTotalIngredients
    RETURN u.name AS userName,  // Return user name
           r.name AS recipe,
           r.description AS description,  // Ensure the recipe node has a description property
           r.nAddedSugar AS addedSugar,  // Ensure these properties exist on the recipe node
           r.nCarbohydrate AS carbohydrate,
           r.nFat AS fat,
           r.nKcal AS kcal,
           r.nProtein AS protein,
           r.nSalt AS salt,
           r.nSaturatedFat AS saturatedFat,
           rd.name AS recipeDietType,
           ingredients,
           size(matchedLikedIngredients) AS numLikedIngredients,
           numTotalIngredients
    ORDER BY numLikedIngredients DESC, numTotalIngredients ASC
    LIMIT 20
    """

    # A null parameter would turn `likedIngredients + predictedLikedIngredients`
    # into null and wipe out the actual likes as well, so always send a list.
    if predicted_liked_ingredients is None:
        predicted_liked_ingredients = []
    else:
        predicted_liked_ingredients = list(predicted_liked_ingredients)

    # Execute the query
    result = gds.run_cypher(
        cypher_query,
        params={'userId': user_id, 'predictedLikedIngredients': predicted_liked_ingredients},
    )
    return result.to_dict(orient='records')

def get_all_users():
    """Return the `id` property of every User node as a plain Python list."""
    # Cypher query that retrieves all user IDs
    all_users_query = """
    MATCH (u:User)
    RETURN u.id AS userId
    """
    return gds.run_cypher(all_users_query)['userId'].tolist()

# Entry point: recipes for every user
def get_recipes_for_all_users():
    """Build a dict mapping each user id to that user's recipe list.

    For every id from get_all_users(), the predicted liked ingredients are
    looked up in predicted_likes_dict (empty list when the user has none)
    and passed to get_user_recipes().
    """
    return {
        user_id: get_user_recipes(user_id, predicted_likes_dict.get(user_id, []))
        for user_id in get_all_users()
    }

# Columns of the flattened recipe table, in output order
_RECIPE_COLUMNS = [
    'user_id', 'user_name', 'recipe', 'description', 'added_sugar',
    'carbohydrate', 'fat', 'kcal', 'protein', 'salt', 'saturated_fat',
    'recipe_diet_type', 'ingredients', 'num_liked_ingredients',
    'num_total_ingredients',
]


def _recipe_row(user_id, recipe):
    """Build one output row from a recipe record (a dict keyed by query column name)."""
    ingredients = recipe.get('ingredients')
    # A missing ingredient list arrives as None (from a dict) or as a float NaN
    # (from a DataFrame cell); neither can be joined, so treat both as "none".
    if ingredients is None or isinstance(ingredients, float):
        ingredients = []
    return {
        'user_id': user_id,
        'user_name': recipe.get('userName', ''),
        'recipe': recipe.get('recipe', ''),
        'description': recipe.get('description', ''),
        'added_sugar': recipe.get('addedSugar', ''),
        'carbohydrate': recipe.get('carbohydrate', ''),
        'fat': recipe.get('fat', ''),
        'kcal': recipe.get('kcal', ''),
        'protein': recipe.get('protein', ''),
        'salt': recipe.get('salt', ''),
        'saturated_fat': recipe.get('saturatedFat', ''),
        'recipe_diet_type': recipe.get('recipeDietType', ''),
        # Convert list to comma-separated string, skipping null names
        'ingredients': ', '.join(str(name) for name in ingredients if name is not None),
        'num_liked_ingredients': recipe.get('numLikedIngredients', 0),
        'num_total_ingredients': recipe.get('numTotalIngredients', 0),
    }


# Assuming all_user_recipes is a dictionary where key is user_id and value is a list of recipes
def convert_to_dataframe(all_user_recipes):
    """Flatten per-user recipe results into one DataFrame, one row per (user, recipe).

    Args:
        all_user_recipes: Mapping of user id to that user's recipes, given
            either as a list of dicts or as a DataFrame whose keys/columns
            are the query column names (userName, recipe, description, ...).
            Values of any other type are skipped.

    Returns:
        A DataFrame with the columns listed in _RECIPE_COLUMNS. The
        ingredient list is joined into one comma-separated string. Keys
        missing from a record fall back to '' (0 for the two counters).
        Empty input yields an empty frame that still has all columns.
    """
    rows = []
    for user_id, recipes in all_user_recipes.items():
        if isinstance(recipes, pd.DataFrame):
            # Same shape as the list case: one dict per recipe
            records = recipes.to_dict(orient='records')
        elif isinstance(recipes, list):
            records = recipes
        else:
            continue
        rows.extend(_recipe_row(user_id, recipe) for recipe in records)

    return pd.DataFrame(rows, columns=_RECIPE_COLUMNS)

# Run the whole flow: fetch the recipes for every user, then flatten the
# per-user results into a single table
all_user_recipes = get_recipes_for_all_users()
df_result = convert_to_dataframe(all_user_recipes)

# Save the table as a UTF-8 CSV without the index column, then show it as the cell output
df_result.to_csv('predicted_knowledge_graph_based.csv', encoding='utf-8', index=False)
df_result

Unnamed: 0,user_id,user_name,recipe,description,added_sugar,carbohydrate,fat,kcal,protein,salt,saturated_fat,recipe_diet_type,ingredients,num_liked_ingredients,num_total_ingredients
0,1,Ch,Frozen fruit sticks with passion fruit & lime drizzle,"On a hot day you'll be glad of this fruity, frozen snack that is low-fat and a great source of vitamin C – perfect for kids",7g,7g,,31 calories,1g,0.01g,,Healthy,"pineapple, grape, passion fruit, mango, kiwi fruit, lime, strawberry, icing sugar, melon",5,9
1,1,Ch,Kale and avocado smoothie,"This healthy smoothie gets its vibrant green colour from avocado, cucumber, spinach and kale. Blitz with pineapple and coconut water.",14g,21g,16g,262 calories,8g,1.1g,4g,Healthy,"spinach, cucumber, coconut, avocado, pineapple, kale",4,6
2,1,Ch,Pick & mix noodle plate,Kids will love choosing what to eat first - and they might end up eating more than if the same meal was served up whole,6g,50g,12g,388 calories,24g,0.63g,2g,Healthy,"chicken, egg noodle, breadcrumb, plum, spring onion, cucumber, egg, carrot, olive oil",4,9
3,1,Ch,Pearled spelt salad with peas & gooseberries,"A bright, seasonal salad with nutritious grains, greens, tomatoes, fennel, carrot and a touch of sweet berry flavour",12g,27g,5g,185 calories,7g,0.3g,1g,Healthy,"pea, tomato, fennel bulb, gooseberry, salad, rapeseed oil, carrot, onion, spelt, celery",4,10
4,1,Ch,Sweet & sour chicken & veg,A trusty takeaway favourite just got superhealthy. Make ahead and freeze for Friday!,24g,30g,4g,230 calories,20g,0.26g,1g,Healthy,"baby corn, carrot, pineapple, tomato, red chilli, rice, chicken breast, green pepper, vegetable oil, tomato ketchup, onion",4,11
5,1,Ch,Chicken schnitzel with coleslaw,"Chicken breasts are tenderised, covered in breadcrumbs and fried until golden and crispy. A dinner that kids will love cooking and eating.",13g,26g,19g,430 calories,36g,1.1g,5g,Healthy,"lemon, apple, breadcrumb, natural yogurt, egg, flour, parmesan, spring onion, white cabbage, carrot, chicken breast, English mustard, vegetable oil",4,13
6,1,Ch,Baked peanut chicken with carrot & cucumber salad,"Reminiscent of a chicken satay, this easy spiced peanut butter-coated chicken with a light carrot, shallot and coriander salad is ready in just 30 mins",6g,8g,14g,321 calories,39g,0.5g,3g,Healthy,"egg, salad leaf, coriander, cucumber, chicken breast, banana shallot, carrot, peanut butter, smoked paprika, cider vinegar, ground cumin, rapeseed oil, garlic clove",4,13
7,1,Ch,Get up and go breakfast muffins,"With apple, blueberry, banana and seeds, this breakfast-on-the-go tastes great, is low in calories and uses honey instead of sugar",10.2g,22.7g,7.1g,179 calories,5.2g,0.6g,0.9g,Healthy,"apple, rapeseed oil, vanilla extract, banana, large egg, blueberry, bicarbonate of soda, wholemeal flour, cinnamon, baking powder, honey, yogurt, seed, oats",4,14
8,1,Ch,Baked eggs with spinach & tomato,"A rustic dish with a delicious combination of flavours and just four ingredients, try whipping it up for brunch",2g,3g,7g,114 calories,9g,0.43g,2g,Healthy,"egg, spinach, tomato, chilli flakes",3,4
9,1,Ch,Vitamin booster smoothie,"Up your vitamin quota with help from this bright and fresh smoothie. Orange, carrot, celery and mango pack a nutritious punch",24g,25g,,114 calories,3g,0.2g,,Healthy,"mango, orange, water, celery, carrot",3,5


In [33]:
import pandas as pd

# Define the replacements dictionary
replacements = {
    'ÃƒÂ©': 'é',
    'â€“': '–',
    'Â': '',
    'Ã¢â‚¬â„¢': '’',
    'Ã¢â€žÂ¢': '™',
    'Ã§': 'ç',
    'Ã¼': 'ü',
    'Ã±': 'ñ',
    'Ã¶': 'ö',
    'â€œ': '“',
    'â€': '”',
    'â€˜': '‘',
    'â€™': '’',
    'â€': '†',
}

# Read the CSV file (assuming utf-8 encoding)
df = pd.read_csv('predicted_knowledge_graph_based.csv', encoding='utf-8')

# Specify the columns you want to fix
columns_to_fix = ['recipe', 'description']

# Apply the longest patterns first. In plain dictionary order a short pattern
# (for example the single character that is mapped to '') is applied before
# longer patterns that contain it, so those longer patterns can never match.
# sorted() is stable, so patterns of equal length keep their original order.
ordered_replacements = sorted(
    replacements.items(), key=lambda item: len(item[0]), reverse=True
)

# Apply the replacements to the specified columns
for column in columns_to_fix:
    for old, new in ordered_replacements:
        df[column] = df[column].str.replace(old, new, regex=False)

# Save the corrected CSV file
df.to_csv('corrected_predicted_knowledge_graph_based.csv', index=False)

print("Encoding issues fixed and saved to corrected_predicted_knowledge_graph_based.csv")


Encoding issues fixed and saved to corrected_predicted_knowledge_graph_based.csv
