In [2]:
# Loading in the libraries
import pandas as pd
import nltk
import swifter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle


In [3]:
# Load the raw review interactions and the recipe metadata from their CSV files
INTERACTIONS_CSV = 'RAW_interactions.csv'
RECIPES_CSV = 'RAW_recipes.csv'
Reviews = pd.read_csv(INTERACTIONS_CSV)
recipes = pd.read_csv(RECIPES_CSV)

In [4]:
# Preview the first five rows of the reviews table
Reviews.head(n=5)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,17/02/2003,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,21/12/2011,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,01/12/2002,4,This worked very well and is EASY. I used not...
3,126440,85009,27/02/2010,5,I made the Mexican topping and took it to bunk...
4,57222,85009,01/10/2011,5,"Made the cheddar bacon topping, adding a sprin..."


In [5]:
# Preview the first five rows of the recipes table
recipes.head(n=5)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,16/09/2005,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,17/06/2002,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,25/02/2005,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,14/04/2003,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,25/10/2002,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [6]:
# List the column labels of the reviews table
Reviews.columns

Index(['user_id', 'recipe_id', 'date', 'rating', 'review'], dtype='object')

In [7]:
# List the column labels of the recipes table
recipes.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

In [8]:
# Attach recipe details to each review: a left join keeps every review row and
# matches Reviews.recipe_id against recipes.id
combined_data = pd.merge(
    Reviews,
    recipes,
    how='left',
    left_on='recipe_id',
    right_on='id',
)

In [9]:
# Preview the first five rows of the merged table
combined_data.head(n=5)

Unnamed: 0,user_id,recipe_id,date,rating,review,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,38094,40893,17/02/2003,4,Great with a salad. Cooked on top of stove for...,white bean green chile pepper soup,40893,495,1533,21/09/2002,"['weeknight', 'time-to-make', 'course', 'main-...","[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]",4,"['combine beans , onion , chilies , 1 / 2 teas...",easy soup for the crockpot.,"['great northern beans', 'yellow onion', 'dice...",9
1,1293707,40893,21/12/2011,5,"So simple, so delicious! Great for chilly fall...",white bean green chile pepper soup,40893,495,1533,21/09/2002,"['weeknight', 'time-to-make', 'course', 'main-...","[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]",4,"['combine beans , onion , chilies , 1 / 2 teas...",easy soup for the crockpot.,"['great northern beans', 'yellow onion', 'dice...",9
2,8937,44394,01/12/2002,4,This worked very well and is EASY. I used not...,devilicious cookie cake delights,44394,20,56824,27/10/2002,"['30-minutes-or-less', 'time-to-make', 'course...","[132.3, 11.0, 39.0, 5.0, 4.0, 11.0, 5.0]",5,"['blend together cake mix , oil and eggs', 'ad...",,"[""devil's food cake mix"", 'vegetable oil', 'eg...",4
3,126440,85009,27/02/2010,5,I made the Mexican topping and took it to bunk...,baked potato toppings,85009,10,64342,25/02/2004,"['15-minutes-or-less', 'time-to-make', 'course...","[2786.2, 342.0, 134.0, 290.0, 161.0, 301.0, 42.0]",3,['pick whichever topping you want to use and c...,these toppings sure makes a nice change from p...,"['mayonnaise', 'salsa', 'cheddar cheese', 'ref...",13
4,57222,85009,01/10/2011,5,"Made the cheddar bacon topping, adding a sprin...",baked potato toppings,85009,10,64342,25/02/2004,"['15-minutes-or-less', 'time-to-make', 'course...","[2786.2, 342.0, 134.0, 290.0, 161.0, 301.0, 42.0]",3,['pick whichever topping you want to use and c...,these toppings sure makes a nice change from p...,"['mayonnaise', 'salsa', 'cheddar cheese', 'ref...",13


# Data Cleaning

In [10]:
# Keep a reproducible 40% random sample of the merged rows.
# Note: the result replaces combined_data, so re-running this cell samples the sample again.
SAMPLE_FRACTION = 0.4
SAMPLE_SEED = 42
combined_data = combined_data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

In [11]:
# Count the missing values in each column
combined_data.isna().sum(axis=0)

user_id              0
recipe_id            0
date                 0
rating               0
review              62
name                 0
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       8988
ingredients          0
n_ingredients        0
dtype: int64

In [12]:
# Drop every row that has a missing value in any column
combined_data = combined_data.dropna()

In [13]:
# Confirm that no missing values remain after dropping the incomplete rows
combined_data.isna().sum(axis=0)

user_id           0
recipe_id         0
date              0
rating            0
review            0
name              0
id                0
minutes           0
contributor_id    0
submitted         0
tags              0
nutrition         0
n_steps           0
steps             0
description       0
ingredients       0
n_ingredients     0
dtype: int64

In [14]:
# Count fully duplicated rows (each repeat after the first occurrence counts once)
combined_data.duplicated(keep='first').sum()

0

In [15]:
# Keep only the most important columns by dropping the rest
COLUMNS_TO_DROP = [
    'minutes',
    'contributor_id',
    'tags',
    'nutrition',
    'n_steps',
    'steps',
    'n_ingredients',
    'id',
]
combined_data = combined_data.drop(columns=COLUMNS_TO_DROP)

In [16]:
# Preview the first five rows after removing the unused columns
combined_data.head(n=5)

Unnamed: 0,user_id,recipe_id,date,rating,review,name,submitted,description,ingredients
781974,288985,161466,30/09/2011,4,"Did not have a problem w/ these going ""stale"" ...",peanut butter crispy rice treats,25/03/2006,i found this recipe in the paper the other day...,"['butter', 'sugar', 'light corn syrup', 'peanu..."
937737,20623,24582,15/09/2003,4,"Very good roast. However, the strong flavors -...",garlic horseradish sirloin roast,08/04/2002,"did this for dinner today, very delicious with...","['top sirloin roast', 'garlic cloves', 'light ..."
907828,227586,281356,31/08/2008,5,"This was AWESOME! So summery, so quick just o...",fresh tomato and basil pasta with toasted pine...,24/01/2008,this is the essence of summer - fresh tomatoes...,"['penne pasta', 'olive oil', 'onion', 'garlic ..."
784628,539686,161549,07/04/2008,4,"I used thawed, frozen peaches and even though ...",whiskey peach smash,27/03/2006,"from the ""king of cocktails"" dale degroff, thi...","['whiskey', 'peach', 'mint leaves', 'lemon', '..."
662460,1540883,265601,03/02/2010,0,"Sugar does not belong in cornbread, at least n...",good eats creamed corn cornbread alton brown,13/11/2007,this recipe produces a very light and airy cor...,"['yellow cornmeal', 'kosher salt', 'sugar', 'b..."


# Sentiment Analysis

In [None]:

# Compiled once: matches every character that is not a lowercase ASCII letter or whitespace.
_NON_LETTER_PATTERN = re.compile(r"[^a-z\s]")

# Filled on first use so the stop-word list is loaded once instead of for every token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess_text(text):
    """Normalise a review into a space-separated string of non-stop-word tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not ``a-z`` or whitespace (characters are
         removed, not replaced by a space, so ``"don't"`` becomes ``"dont"``);
      3. tokenize with ``word_tokenize``;
      4. drop tokens found in NLTK's English stop-word list.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. ``None`` or a NaN float)
        is treated as an empty review.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``""`` when nothing
        remains or when ``text`` is not a string.
    """
    # Missing reviews arrive as None/NaN; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ""

    letters_only = _NON_LETTER_PATTERN.sub("", text.lower())

    # Set membership is O(1) per token; the set itself is built only once per process.
    stop_words = _english_stopwords()
    kept_tokens = [
        token for token in word_tokenize(letters_only) if token not in stop_words
    ]

    return " ".join(kept_tokens)
# Clean every review with preprocess_text (through swifter's apply) and store the
# result in a new 'Cleaned_review' column
raw_reviews = combined_data['review']
combined_data['Cleaned_review'] = raw_reviews.swifter.apply(preprocess_text)

Pandas Apply:   0%|          | 0/410380 [00:00<?, ?it/s]

In [18]:
#combined_data.to_csv('cleaned_combined_data.csv', index=False)

In [21]:
# Persist preprocess_text to preprocessor.pkl.
# NOTE(review): pickle stores a plain function by its module and name, not its code,
# so loading this file only works where preprocess_text is defined or importable under
# the same module name - confirm the consumer of preprocessor.pkl does that.
with open("preprocessor.pkl", "wb") as pickle_file:
    pickle.dump(preprocess_text, pickle_file)