In [6]:
import re
from nltk.tokenize import sent_tokenize
import nltk

# Keywords (single words and two-word phrases) that mark a sentence as being about
# food, ingredients, potions, spells, or rituals.
# Each entry appears exactly once: repeated entries only cause repeated regex searches
# and cannot change which sentences match.
ingredient_keywords = [
    # generic terms
    'ingredient', 'ingredients', 'potion', 'potion ingredient', 'spell', 'spell ingredient',
    'ritual', 'ritual ingredient', 'charm',
    # plant / raw material terms
    'herb', 'herbs', 'root', 'roots', 'flower', 'flowers', 'berry', 'berries', 'powder', 'dust',
    'brew', 'extract', 'elixir', 'drink', 'meat', 'fruit', 'vegetable', 'plant', 'fungus', 'fungi',
    'mushroom', 'spice',
    # preparations
    'concoction', 'draught', 'infusion', 'tincture', 'decoction', 'philter', 'mixture',
    # liquids and remedies
    'essence', 'juice', 'blood', 'venom', 'syrup', 'tonic', 'salve',
]

# Function to check if a sentence contains any ingredient-related keywords
def contains_ingredient_keywords(sentence, keywords):
    """Tell whether ``sentence`` mentions at least one of ``keywords``.

    Every keyword is taken literally (regex metacharacters are escaped),
    must sit between word boundaries, and is compared without regard to case.

    Args:
        sentence: Text to scan.
        keywords: Iterable of keyword strings.

    Returns:
        True if any keyword is found, otherwise False (including when
        ``keywords`` is empty).
    """
    for term in keywords:
        bounded = r'\b' + re.escape(term) + r'\b'
        if re.search(bounded, sentence, flags=re.IGNORECASE) is not None:
            return True
    return False

# Load the preprocessed text data.
# The encoding is given explicitly so the read does not depend on the platform's
# default encoding (the CSV produced later in this notebook is written as UTF-8).
with open('second_half.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text into sentences
sentences = sent_tokenize(text)

# Keep only the sentences that mention at least one ingredient-related keyword
ingredient_related_sentences = [
    sentence
    for sentence in sentences
    if contains_ingredient_keywords(sentence, ingredient_keywords)
]

# Save the filtered sentences, each followed by a newline.
# Explicit UTF-8 so writing non-ASCII characters does not depend on the platform's
# default encoding.
with open('ingredient_related_sentences.txt', 'w', encoding='utf-8') as file:
    file.writelines(sentence + '\n' for sentence in ingredient_related_sentences)

print(f'Found {len(ingredient_related_sentences)} ingredient-related sentences.')


Found 130 ingredient-related sentences.


In [1]:
import nltk
import csv
import re
from nltk import pos_tag, word_tokenize, ne_chunk
from nltk.chunk import tree2conlltags
from nltk.tokenize import sent_tokenize

# Fetch the NLTK resources this notebook relies on.
# 'punkt' backs sent_tokenize / word_tokenize; without it a fresh environment fails
# at the first tokenize call.
# NOTE(review): resource names match the NLTK release this notebook was run with;
# newer releases may ask for differently named packages - confirm against the
# installed version.
nltk.download('punkt')
# Resources for ne_chunk (chunker model + word list) and pos_tag (tagger model)
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/wassu/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /Users/wassu/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/wassu/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:
# Load the text file.
# The encoding is given explicitly so the read does not depend on the platform's
# default encoding and matches the UTF-8 CSV written at the end of this cell.
with open('second_half.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text into sentences
sentences = sent_tokenize(text)

# Keywords (single words and two-word phrases) used to pick out ingredient-related sentences.
# Each entry appears exactly once: repeated entries only cause repeated regex searches
# and cannot change which sentences match.
ingredient_keywords = [
    # generic terms
    'ingredient', 'ingredients', 'potion', 'potion ingredient', 'spell', 'spell ingredient',
    'ritual', 'ritual ingredient', 'charm',
    # plant / raw material terms
    'herb', 'herbs', 'root', 'roots', 'flower', 'flowers', 'berry', 'berries', 'powder', 'dust',
    'brew', 'extract', 'elixir', 'drink', 'meat', 'fruit', 'vegetable', 'plant', 'fungus', 'fungi',
    'mushroom', 'spice',
    # preparations
    'concoction', 'draught', 'infusion', 'tincture', 'decoction', 'philter', 'mixture',
    # liquids and remedies
    'essence', 'juice', 'blood', 'venom', 'syrup', 'tonic', 'salve',
]

# Function to extract potential ingredients from a sentence using POS tagging and NER
def extract_ingredients(sentence):
    """Return candidate ingredient words found in ``sentence``.

    Two heuristics are combined:

    1. Every token POS-tagged as a noun (NN, NNS, NNP, NNPS) that the NE chunker
       leaves outside any named entity (IOB tag ``'O'``).
    2. The words following "of", "with", "containing", "ingredient" or
       "ingredients", up to the next ``.``, ``,``, ``;``, standalone
       "and"/"or", or the end of the sentence.

    Args:
        sentence: A single sentence of text.

    Returns:
        A list of words in first-seen order (heuristic 1 results first) with
        exact duplicates removed. Empty when nothing matched.
    """
    # Tokenize -> POS-tag -> NE-chunk -> flatten to (word, pos, iob) triples
    iob_tagged = tree2conlltags(ne_chunk(pos_tag(word_tokenize(sentence))))

    # Nouns that are not part of a named entity
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    candidates = [word for word, pos, ner in iob_tagged if pos in noun_tags and ner == 'O']

    # Phrase following a cue word. "and"/"or" are wrapped in \b so the lazy capture
    # is not cut short by those letters occurring inside a word
    # (e.g. "mandrake" or "unicorn").
    cue_pattern = (
        r'\b(?:of|with|containing|ingredient|ingredients)\s'
        r'([\w\s-]+?)'
        r'(?=[.,;]|\b(?:and|or)\b|$)'
    )
    for phrase in re.findall(cue_pattern, sentence, re.IGNORECASE):
        # A phrase may hold several words; record each one separately
        candidates.extend(phrase.split())

    # Both heuristics frequently hit the same word; keep only its first occurrence
    return list(dict.fromkeys(candidates))

# Rows destined for the CSV; starts empty and is filled by the extraction loop in this cell
ingredient_entries = list()

# Function to check if a sentence contains any ingredient-related keywords
def contains_ingredient_keywords(sentence, keywords):
    """Tell whether ``sentence`` mentions at least one of ``keywords``.

    Every keyword is taken literally (regex metacharacters are escaped),
    must sit between word boundaries, and is compared without regard to case.

    Args:
        sentence: Text to scan.
        keywords: Iterable of keyword strings.

    Returns:
        True if any keyword is found, otherwise False (including when
        ``keywords`` is empty).
    """
    for term in keywords:
        bounded = r'\b' + re.escape(term) + r'\b'
        if re.search(bounded, sentence, flags=re.IGNORECASE) is not None:
            return True
    return False

# Build one row per keyword-matching sentence that yields at least one candidate:
# [1-based position of the sentence in `sentences`, comma-joined ingredients, sentence text]
for i, sentence in enumerate(sentences, start=1):
    if not contains_ingredient_keywords(sentence, ingredient_keywords):
        continue
    ingredients = extract_ingredients(sentence)
    if ingredients:  # Skip sentences where nothing was extracted
        ingredient_entries.append([i, ', '.join(ingredients), sentence])

# Write the ingredient entries to a CSV file (header row first)
with open('ingredients_database.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['No.', 'Ingredient', 'Source'])
    writer.writerows(ingredient_entries)

# Each entry is one sentence (possibly holding several ingredients), so report the
# count as sentences rather than as individual ingredients.
print(f"Extracted ingredients from {len(ingredient_entries)} sentences and saved to 'ingredients_database.csv'.")


Extracted 130 ingredients and saved to 'ingredients_database.csv'.
