## Separating the Soups from the Reviews

In [1]:
import pandas as pd
import spacy

In [3]:
# Menu item names to look for in the review text. This list is passed to
# extract_soup_names_with_regex, which matches each name case-insensitively
# and also derives "<first word> Soup" fallbacks from the first word of each entry.
soup_names_list = [
    "Lobster Bisque",
    "Jambalaya",
    "Shrimp Corn Chowder",
    "New England Clam Chowder",
    "Shrimp and Sausage Gumbo",
    "Chicken Noodle",
    "Chicken Dumpling",
    "Chicken Corn Chowder",
    "Buffalo Chicken",
    "Chicken Tortilla",
    "Chicken Enchilada",
    "Chicken and Wild Rice",
    "Chicken Barley",
    "Chicken Orzo",
    "Chicken Quinoa",
    "Split Pea with Ham",
    "Beef Chili",
    "Potato Bacon",
    "Chicken Chili",
    "Chicken Lentil",
    "Italian Wedding",
    "Chicken Feta Cheese and Spinach",
    "Sausage and Kale",
    "Chicken Gumbo",
    "Butternut Squash",
    "Garden Vegetable",
    "Tomato & Wild Rice",
    "Lentil",
    "Cuban Black Bean",
    "Tomato Basil",
    "Tomato Corn Chowder",
    "Tomato Bisque",
    "Tomato with Zucchini",
    "Tomato Orzo",
    "Broccoli and Cheese",
    "Italian Ravioli",
    "Minestrone",
    "Potato Leek",
    "Vegetarian Chili",
    "Chicken Bone Broth",
    "Lobster Roll",
    "Mushroom Barley"
]

In [4]:
# Load the reviews into a DataFrame. The path is relative to the notebook's
# working directory; the 'review' column is the text searched for soup names below.
reviews = pd.read_csv('soupman_reviews.csv')

**Now it is time to see how effective the new model is at extracting the soup names from the reviews. I will do this by creating a new column that holds the extracted soup names.**

In [5]:
import re

def create_soup_regex_patterns(soup_names_list):
    """Compile one case-insensitive regex that matches any of the given soup names.

    Parameters
    ----------
    soup_names_list : iterable of str
        Literal soup names. Regex metacharacters in a name (e.g. the "&" in
        "Tomato & Wild Rice") are escaped, so every name is matched verbatim.

    Returns
    -------
    re.Pattern
        A compiled pattern whose alternatives are the soup names, longest first.
        If no non-empty names are given, the pattern never matches.
    """
    # Empty names would add an alternative that matches the empty string everywhere.
    names = [name for name in soup_names_list if name]
    if not names:
        # '|'.join of nothing is '', which matches at every position of any text.
        # '(?!)' is an empty negative lookahead and can never succeed.
        return re.compile(r'(?!)')

    # Alternation picks the first alternative that matches at a position, so a
    # short name listed before a longer name it prefixes ("Tomato" before
    # "Tomato Bisque") would shadow the longer one. Longest-first avoids that.
    longest_first = sorted(names, key=len, reverse=True)
    pattern = '|'.join(re.escape(name) for name in longest_first)
    return re.compile(pattern, re.IGNORECASE)

In [6]:
def extract_soup_names_with_regex(text, soup_names_list):
    """Return the soup names mentioned in ``text``.

    Two passes are made over the text, both case-insensitive and anchored on
    word boundaries:

    1. Every name in ``soup_names_list`` is searched for. Longer names are
       matched first and the matched span is masked out, so a name contained
       in a longer name ("Lentil" inside "Chicken Lentil") is only reported
       when it also occurs on its own.
    2. For each distinct first word of the listed names (e.g. "Lobster") for
       which no full name starting with that word was found, the generic form
       "<first word> Soup" is searched for in the text left over from pass 1.

    Parameters
    ----------
    text : str
        The review text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell, or None) yields an empty list.
    soup_names_list : list of str
        Soup names to look for.

    Returns
    -------
    list of str
        Matched names from ``soup_names_list`` in list order, followed by any
        generic "<first word> Soup" matches in order of first appearance of
        that first word in ``soup_names_list``.
    """
    # Missing reviews are not strings; re would raise TypeError on them.
    if not isinstance(text, str):
        return []

    # Pass 1: longest names first. dict.fromkeys de-duplicates while keeping
    # list order, and sorted() is stable, so the matching order is deterministic.
    searchable = text
    matched_names = set()
    for soup in sorted(dict.fromkeys(soup_names_list), key=len, reverse=True):
        soup_regex = r"\b" + re.escape(soup) + r"\b"
        # Replace each hit with NUL characters of equal length: this keeps
        # offsets stable, cannot match any name, and still counts as a
        # non-word character for the \b anchors of later patterns.
        searchable, hit_count = re.subn(
            soup_regex,
            lambda match: "\0" * len(match.group(0)),
            searchable,
            flags=re.IGNORECASE,
        )
        if hit_count:
            matched_names.add(soup)

    # Report matches in the caller's list order, not in matching order.
    soup_names_found = [soup for soup in soup_names_list if soup in matched_names]

    # Pass 2: generic "<first word> Soup" for first words with no full-name match.
    first_words_found = {soup.split(" ")[0] for soup in soup_names_found}
    all_first_words = dict.fromkeys(soup.split(" ")[0] for soup in soup_names_list)
    for first_word in all_first_words:
        if first_word in first_words_found:
            continue
        soup_with_soup = first_word + " Soup"
        soup_with_soup_regex = r"\b" + re.escape(soup_with_soup) + r"\b"
        # Search the masked text so that words already claimed by a full name
        # ("Buffalo Chicken soup") do not also produce "Chicken Soup".
        if re.search(soup_with_soup_regex, searchable, re.IGNORECASE):
            soup_names_found.append(soup_with_soup)

    return soup_names_found

In [7]:
# Run the extractor over every review; the soup list is forwarded as the
# second positional argument of extract_soup_names_with_regex.
review_texts = reviews['review']
reviews['soup_names_found'] = review_texts.apply(
    extract_soup_names_with_regex,
    args=(soup_names_list,),
)
print(reviews['soup_names_found'])

0                                   [Chicken Bone Broth]
1                       [Chicken Dumpling, Lobster Roll]
2                                      [Italian Wedding]
3                     [Lobster Bisque, Chicken Tortilla]
4      [Lobster Bisque, Chicken Dumpling, Chicken Tor...
                             ...                        
599                        [Chicken Gumbo, Lobster Roll]
600                                         [Minestrone]
601                                     [Lobster Bisque]
602                                       [Lobster Soup]
603                                     [Lobster Bisque]
Name: soup_names_found, Length: 604, dtype: object


In [8]:
# Count the reviews in which no soup name was found (empty list is falsy).
empty_lists_count = sum(
    not found_names for found_names in reviews['soup_names_found']
)
print("Number of empty lists:", empty_lists_count)

Number of empty lists: 143


In [9]:
# Count the reviews in which at least one soup name was found.
non_empty_lists_count = sum(
    bool(found_names) for found_names in reviews['soup_names_found']
)
print("Number of non-empty lists:", non_empty_lists_count)

Number of non-empty lists: 461


In [10]:
# Save the reviews together with the new soup_names_found column; index = False
# keeps the DataFrame's row index out of the file.
# NOTE(review): soup_names_found holds Python lists, which are presumably written
# as their string form — confirm how the downstream reader parses that column.
reviews.to_csv("reviews_and_soups.csv", index = False)