# ✅Step 5. Linking Reddit and BBC Good Food

# 🎯0. Import libraries

In [1]:
import requests               
import pandas as pd
from scrapy import Selector  
from tqdm import tqdm
from datetime import datetime
import sys
from plotnine import *
import altair as alt


from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Import our own modules
sys.path.append("../scripts/")
import chadtools

# 🎯1. Prepare GET requests for BBC GoodFood

In [2]:
bbclink = 'https://www.bbcgoodfood.com/search?q='
response = requests.get(bbclink)
sel = Selector(text=response.text)
links = []

recipecards = sel.css('main div.search-results div.card__section.card__content a ::attr(href)').getall()
links.extend(recipecards)  
prefix = "https://www.bbcgoodfood.com/recipes/"
final_links = [prefix + item for item in links] 

In [3]:
base_url = 'https://www.bbcgoodfood.com/search?q=&limit=1000&page='

links = []
for i in tqdm(range(1, 11)):
    response = requests.get(base_url + str(i))
    sel = Selector(text = response.text)
    recipecards = sel.css('main div.search-results div.card__section.card__content a ::attr(href)').getall()
    links.extend(recipecards)  

prefix = "https://www.bbcgoodfood.com/recipes/"
final_links = [prefix + item for item in links] 

len(final_links)

100%|██████████| 10/10 [02:02<00:00, 12.24s/it]


10011

## 🎯2. Scrape BBC GoodFood Recipes

We will scrape all of BBC GoodFood's recipes and create a DataFrame containing the nutritional data of each of them, i.e. how much salt, fat, protein, etc. each recipe contains.

<div class="alert alert-warning">
    <strong>WARNING:</strong> This code block takes over 3 hours to run. For testing, use the pre-saved file <code>bbc_data.csv</code> in the next code block.
</div>

In [None]:
session = requests.Session()

def get_nutrition(url):
    response2 = session.get(url)
    sel2 = Selector(text=response2.text)
    table = sel2.css('table.key-value-blocks.hidden-print.mt-xxs')
    bbc = {}
    bbc['bbcgf_title'] = sel2.css('h1.heading-1 ::text').get()
    bbc['calories'] = table.css('td.key-value-blocks__value ::text').get()
    bbc['salt'] = table.xpath('.//*[contains(text(), "salt")]/..').css('td.key-value-blocks__value ::text').get()
    bbc['fat'] = table.xpath('.//*[contains(text(), "fat")]/..').css('td.key-value-blocks__value ::text').get()
    bbc['sugars'] = table.xpath('.//*[contains(text(), "sugars")]/..').css('td.key-value-blocks__value ::text').get()
    bbc['saturates'] = table.xpath('.//*[contains(text(), "saturates")]/..').css('td.key-value-blocks__value ::text').get()
    bbc['carbs'] = table.xpath('.//*[contains(text(), "carbs")]/..').css('td.key-value-blocks__value ::text').get()
    bbc['protein'] = table.xpath('.//*[contains(text(), "protein")]/..').css('td.key-value-blocks__value ::text').get()
    bbc['fibre'] = table.xpath('.//*[contains(text(), "fibre")]/..').css('td.key-value-blocks__value ::text').get()
    bbc['bbcgf_ratings_raw'] = sel2.css("div.rating__values span.sr-only ::text").get()
    return bbc

bbc = [get_nutrition(url) for url in tqdm(final_links)]

df_bbc = pd.DataFrame(bbc)
df_bbc['bbcgf_ratings'] = df_bbc['bbcgf_ratings_raw'].str.extract(r'(\d+\.\d+)').astype(float) / 5.0 
df_bbc.head(10)

### Save the data as a CSV file 

In [None]:
df_bbc.to_csv('../data/bbc_data.csv', index=False)

Save the data as HTML for website

In [None]:
df_bbc.head().to_html('../docs/bbc_data.html')

### Read cleaned_posts_with_cuisine_and_ingredient_list.json and bbc_data.csv

In [6]:
df_filtered = pd.read_json('../data/cleaned_posts_with_cuisine_and_ingredient_list.json', orient='records')
df_filtered.tail(3)

Unnamed: 0,id,title,gpt_ingredients,cuisine,ingredient_comment,score,upvote_ratio,link_flair_text,author,created_utc,url,permalink
1088,iz12pg,Ottolenghi's Baked Orzo w/Mozzarella,"[olive oil, eggplant, carrots, celery, onion, ...",italian,Ingredients:\n\n* 7 Tablespoons olive oil\n* 1...,23,0.839844,Fruit\Vegetarian,BrinaElka,1600970345000,https://i.redd.it/l7osuhkcm4p51.jpg,https://reddit.com/r/recipes/comments/iz12pg/o...
1089,iw3wli,Mushroom Barley Stew with Crispy Oyster Mushrooms,"[mushroom barley stew, neutral oil for frying,...",stew.,**Recipe here originally:** [**Easy Mushroom B...,2695,0.97998,Fruit\Vegetarian,BushyEyes,1600565227000,https://i.redd.it/511qxuct57o51.jpg,https://reddit.com/r/recipes/comments/iw3wli/m...
1090,isunwt,Easy Tomato Risotto with Parmesan,"[tomato risotto, ripe tomatoes, garlic, extra ...",italian,**Recipe here originally:** [**Easy Tomato Ris...,1801,0.990234,Fruit\Vegetarian,BushyEyes,1600122747000,https://i.redd.it/0qb76yy3m6n51.jpg,https://reddit.com/r/recipes/comments/isunwt/e...


In [7]:
df_bbc = pd.read_csv('../data/bbc_data.csv')
df_bbc.head()

Unnamed: 0,bbcgf_title,calories,salt,fat,sugars,saturates,carbs,protein,fibre,bbcgf_ratings_raw,bbcgf_ratings
0,Chicken & chorizo jambalaya,445.0,1.2,10.0,7.0,3.0,64.0,30.0,2.0,A star rating of 4.8 out of 5.,0.96
1,Lemon drizzle cake,399.0,0.3,21.0,33.0,13.0,50.0,5.0,1.0,A star rating of 4.7 out of 5.,0.94
2,Chilli con carne recipe,387.0,2.32,17.0,1.0,6.0,25.0,36.0,6.0,A star rating of 4.8 out of 5.,0.96
3,Best ever chocolate brownies recipe,150.0,0.1,9.0,12.0,5.0,15.0,2.0,1.0,A star rating of 4.8 out of 5.,0.96
4,Creamy courgette lasagne,405.0,1.36,21.0,13.0,8.0,38.0,18.0,4.0,A star rating of 4.6 out of 5.,0.92


In [8]:
df_bbc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10011 entries, 0 to 10010
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bbcgf_title        10009 non-null  object 
 1   calories           9951 non-null   float64
 2   salt               9942 non-null   float64
 3   fat                9949 non-null   float64
 4   sugars             9950 non-null   float64
 5   saturates          9950 non-null   float64
 6   carbs              9951 non-null   float64
 7   protein            9950 non-null   float64
 8   fibre              9946 non-null   float64
 9   bbcgf_ratings_raw  10008 non-null  object 
 10  bbcgf_ratings      7740 non-null   float64
dtypes: float64(9), object(2)
memory usage: 860.4+ KB


Save a sample of the data as HTML for display on webpage

In [9]:
df_bbc.head().to_html('../docs/bbc_data.html', render_links=True, index=False)

## 🎯3. Merging the Reddit and BBC GoodFood dataframes 

How do we link the recipes from the two sites together? We use the following steps:
1. Calculate cosine similarity between the recipes from the two sites
2. If the cosine similarity is above our threshold (0.75), we consider the recipes to be the same.
    1. If there are multiple recipes that are above our threshold, we select the one with the highest similarity.
3. We then merge the two dataframes on the recipe title 

In [10]:
df_reddit = df_filtered.copy()

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
similarity_threshold = 0.75
merge_columns = ['bbcgf_title', 'calories', 'salt', 'fat', 'sugars', 'saturates', 'carbs', 'protein', 'fibre', 'bbcgf_ratings']

# Encode titles using Sentence Transformer
embeddings_reddit = model.encode(df_reddit['title'].tolist())
embeddings_bbc = model.encode(df_bbc['bbcgf_title'].tolist())

# Calculate cosine similarity matrix
cosine_sim_matrix = cosine_similarity(embeddings_reddit, embeddings_bbc)

# Find the indices and values of the maximum similarity in each row
max_similarity_indices = cosine_sim_matrix.argmax(axis=1)
max_similarity_values = cosine_sim_matrix.max(axis=1)

# Mask indices where the similarity is below the threshold
mask = max_similarity_values > similarity_threshold

# Merge columns based on the max similarity
df_reddit.loc[mask, merge_columns] = df_bbc.iloc[max_similarity_indices[mask]][merge_columns].values

df_reddit.tail()



Unnamed: 0,id,title,gpt_ingredients,cuisine,ingredient_comment,score,upvote_ratio,link_flair_text,author,created_utc,...,bbcgf_title,calories,salt,fat,sugars,saturates,carbs,protein,fibre,bbcgf_ratings
1086,jcgb7j,Bitter gourd yogurt curry....with no bitternes...,"[oil, bitter gourd, onion, green chillies, cur...",indian,Recipe.....\n\n[Short Video](https://youtu.be/...,8,0.660156,Fruit\Vegetarian,PassionateHobbies,1602879492000,...,,,,,,,,,,
1087,jb5peu,Punjabi Aloo Samosa,"[plain flour, carom seeds, salt, ghee, water, ...",indian,For video instruction follow this link: [http...,39,0.959961,Fruit\Vegetarian,Pakladies,1602701494000,...,Samosa chaat,366.0,2.0,19.0,5.0,5.0,33.0,11.0,7.0,
1088,iz12pg,Ottolenghi's Baked Orzo w/Mozzarella,"[olive oil, eggplant, carrots, celery, onion, ...",italian,Ingredients:\n\n* 7 Tablespoons olive oil\n* 1...,23,0.839844,Fruit\Vegetarian,BrinaElka,1600970345000,...,Baked tomato & mozzarella orzo,546.0,0.99,18.0,9.0,9.0,67.0,26.0,5.0,0.84
1089,iw3wli,Mushroom Barley Stew with Crispy Oyster Mushrooms,"[mushroom barley stew, neutral oil for frying,...",stew.,**Recipe here originally:** [**Easy Mushroom B...,2695,0.97998,Fruit\Vegetarian,BushyEyes,1600565227000,...,,,,,,,,,,
1090,isunwt,Easy Tomato Risotto with Parmesan,"[tomato risotto, ripe tomatoes, garlic, extra ...",italian,**Recipe here originally:** [**Easy Tomato Ris...,1801,0.990234,Fruit\Vegetarian,BushyEyes,1600122747000,...,Easy baked tomato risotto,411.0,1.3,,3.0,6.0,65.0,14.0,3.0,0.9


We drop rows with no nutritional data to get a clean dataframe for further analysis.

In [11]:
df_for_analysis = df_reddit.dropna(subset=merge_columns, how='any').copy()
df_for_analysis.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 1 to 1088
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  500 non-null    object 
 1   title               500 non-null    object 
 2   gpt_ingredients     500 non-null    object 
 3   cuisine             500 non-null    object 
 4   ingredient_comment  500 non-null    object 
 5   score               500 non-null    int64  
 6   upvote_ratio        500 non-null    float64
 7   link_flair_text     500 non-null    object 
 8   author              500 non-null    object 
 9   created_utc         500 non-null    int64  
 10  url                 500 non-null    object 
 11  permalink           500 non-null    object 
 12  bbcgf_title         500 non-null    object 
 13  calories            500 non-null    float64
 14  salt                500 non-null    float64
 15  fat                 500 non-null    float64
 16  sugars      

In [12]:
df_for_analysis.iloc[:, 13:] = df_for_analysis.iloc[:, 13:].astype("float16")
df_for_analysis["score"] = df_for_analysis["score"].astype("int16")
df_for_analysis["upvote_ratio"] = df_for_analysis["upvote_ratio"].astype("float16")

In [13]:
df_for_analysis.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 1 to 1088
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  500 non-null    object 
 1   title               500 non-null    object 
 2   gpt_ingredients     500 non-null    object 
 3   cuisine             500 non-null    object 
 4   ingredient_comment  500 non-null    object 
 5   score               500 non-null    int16  
 6   upvote_ratio        500 non-null    float16
 7   link_flair_text     500 non-null    object 
 8   author              500 non-null    object 
 9   created_utc         500 non-null    int64  
 10  url                 500 non-null    object 
 11  permalink           500 non-null    object 
 12  bbcgf_title         500 non-null    object 
 13  calories            500 non-null    float64
 14  salt                500 non-null    float64
 15  fat                 500 non-null    float64
 16  sugars      

In [14]:
df_for_analysis.head()

Unnamed: 0,id,title,gpt_ingredients,cuisine,ingredient_comment,score,upvote_ratio,link_flair_text,author,created_utc,...,bbcgf_title,calories,salt,fat,sugars,saturates,carbs,protein,fibre,bbcgf_ratings
1,1ah8m5s,Thai Green Curry Chicken Satay,"[chicken satay, homemade green curry paste, ve...",thai,You could use any curry paste you like. Do you...,49,0.879883,Recipe,butchec,1706893611000,...,Thai green chicken curry,257.0,0.600098,15.0,3.0,10.0,9.0,19.0,2.0,0.899902
4,1afoma3,Sweet and Sour Tofu,"[tofu, ground black pepper, salt, cornstarch, ...",chinese,I love figuring out ways to make tofu deliciou...,59,0.890137,Recipe,parisrosaries,1706725321000,...,Sweet & sour tofu,530.0,1.200195,17.0,18.0,2.0,75.0,15.0,8.0,0.819824
9,1acagoz,Chocolate Fudgy Brownie,"[butter, dark chocolate, cocoa powder, white s...",american,Full Chocolate Fudgy Brownie recipe: https://w...,114,0.910156,Recipe,butchec,1706360858000,...,Fudgy brownies,1043.0,0.620117,62.0,76.0,35.0,105.0,14.0,6.0,0.859863
12,19d0wfc,Buffalo Chicken Tenders,"[chicken tenderloins, flour, garlic powder, eg...",american,**Recipe here originally:** [**Buffalo Chicken...,275,0.970215,Recipe,BushyEyes,1705944195000,...,Buffalo chicken,520.0,7.398438,35.0,11.0,15.0,11.0,39.0,3.0,0.879883
13,1998zka,Prawn Katsu Baos,"[kewpie, plain yoghurt, dill pickles, capers, ...",japanese,This one is high impact and a showstopper for ...,291,0.950195,Recipe,TheLuckiestDragon,1705528588000,...,Prawn katsu burgers,1070.0,2.800781,74.0,13.0,12.0,68.0,30.0,6.0,0.899902


In [15]:
df_for_analysis.to_json('../data/merged_data_for_analysis.json', orient='records', indent=4)

Saving as html for website

In [16]:
df_for_analysis.head().to_html('../docs/merged_data_for_analysis.html', render_links=True, index=False)

## Analysis of merged Reddit and BBC Good Food data

### Plotting histograms of BBC Good Food ratings against each flair 
Having obtained the user ratings from BBC Good Food, we can compare if there are any differences between the category of foods that are popular amongst Reddit vs BBC Good Food users. 

In [17]:
df_for_analysis = pd.read_json('../data/merged_data_for_analysis.json', orient='records')

In [18]:
df_desserts = df_for_analysis[df_for_analysis['link_flair_text']=='Dessert']
df_recipe = df_for_analysis[df_for_analysis['link_flair_text']=='Recipe']
df_pasta = df_for_analysis[df_for_analysis['link_flair_text']=='Pasta']
df_poultry = df_for_analysis[df_for_analysis['link_flair_text']=='Poultry']
df_drink = df_for_analysis[df_for_analysis['link_flair_text']=='Drink']
df_beef = df_for_analysis[df_for_analysis['link_flair_text']=='Beef']
df_pork = df_for_analysis[df_for_analysis['link_flair_text']=='Pork']
df_seafood = df_for_analysis[df_for_analysis['link_flair_text']=='Seafood']
df_fruitveg = df_for_analysis[df_for_analysis['link_flair_text']=='Fruit\Vegetarian']

In [20]:
# plot a histogram of the bbc ratings for each flair category
flair_names = ['Dessert', 'Recipe', 'Pasta', 'Poultry', 'Drink', 'Beef', 'Pork', 'Seafood', 'Fruit\Vegetarian']

for flair in flair_names:
    plot = (
        ggplot(df_for_analysis[df_for_analysis["link_flair_text"] == flair], aes(x='bbcgf_ratings')) + 
        geom_histogram(binwidth=0.01, fill='#5c3da4') +
        theme_matplotlib() + 
        ggtitle(f"Frequency of {flair} posts by BBC Good Food ratings ratio") +
        theme(plot_title = element_text(weight='bold', color="black")) +
        theme(aspect_ratio=9/16)
    )

    plot.save(f"../plots/{flair}_hist.png".replace("\\", ""), dpi=300) 



### Plot histogram of top 10 percent of BBC GoodFood recipes by website ratings

In [21]:
# Sort by 'upvote ratio' in descending order
df_sorted = df_for_analysis.sort_values(by='bbcgf_ratings', ascending=False)

# Calculate the number of rows for the top 10%
top_10_percent = int(0.1 * len(df_sorted))

# Take the top 10% of the DataFrame
top_10_df = df_sorted.head(top_10_percent)

ordered_flair_list = top_10_df['link_flair_text'].value_counts().index.tolist()
# Plot a bar graph showing the number of posts from different flairs
plot = (
       ggplot(top_10_df, aes(x="link_flair_text")) + 
       geom_bar(fill='#5c3da4') +
       coord_flip() +
       ggtitle("Top 10% of posts by BBC Good Food ratings") +
       scale_x_discrete(limits=top_10_df["link_flair_text"].value_counts().index.tolist()[::-1]) +
       theme(plot_title = element_text(weight='bold', color="black")) +
       theme(aspect_ratio=9/16)
       )

plot.save(f"../plots/plot_top_10_percent_bbcgf.jpg", format="jpg", dpi=600)



### Plotting an interactive scatter plot of calories against upvote ratio

For easier visualisation, we combine the 'Poultry', 'Beef', 'Pork', 'Seafood' flairs into one flair titled 'Meats'.

In [22]:
df_for_altair = df_for_analysis.copy()

df_for_altair['link_flair_text'].replace(['Poultry', 'Beef', 'Pork', 'Seafood'], 'Meats', inplace=True)

In [23]:
x_scale = alt.Scale(domain=(0.5, 1))
colour_scale = alt.Scale(domain=['Dessert', 'Recipe', 'Pasta', 'Meats', 'Drink', 'Fruit\\Vegetarian'],
                         range=['#41afaa', '#466eb4', '#00a0e1', '#e6a532', '#d7642c', '#af4b91'])

selection = alt.selection_point(fields=['link_flair_text'], bind='legend')

scatter = alt.Chart(df_for_altair, width=600, height=600).mark_circle().encode(
    x=alt.X('upvote_ratio:Q', scale=x_scale, title='Upvote Ratio'),
    y=alt.Y('calories:Q', title='Calories'), 
    color=alt.Color('link_flair_text:N', scale=colour_scale, legend=alt.Legend(title='Flair')),
    tooltip=['title:N', 'upvote_ratio:Q', 'link_flair_text:N', 'calories:Q', 'salt:Q', 'fat:Q', 'sugars:Q', 'saturates:Q', 'carbs:Q', 'protein:Q', 'fibre:Q', 'permalink:N'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.15)),
    href='permalink:N'
).add_params(
    selection
).properties(title='No clear correlation between the healthiness of a recipe and how well it is received on Reddit',
).interactive()

scatter.save('../docs/interactive_plot.html', scale_factor=2.0)



We then plot an interactive scatter plot of reddit upvote ratios against BBC GoodFood ratings. Unsurprisingly, higher rated recipes on BBC GoodFood tend to have higher upvote ratios on Reddit.

In [24]:
x_scale = alt.Scale(domain=(0.5, 1))
y_scale = alt.Scale(domain=(0.5, 1))

scatter = alt.Chart(df_for_altair, width=600, height=600).mark_circle().encode(
    x=alt.X('upvote_ratio:Q', scale=x_scale, title='Upvote Ratio'),
    y=alt.Y('bbcgf_ratings:Q', scale=y_scale, title='BBC Good Food Rating'),
    tooltip=['title:N', 'upvote_ratio:Q', 'bbcgf_ratings:Q', 'permalink:N'],
    href='permalink:N'
).properties(title='Higher Reddit upvote ratios are generally associated with higher BBC Good Food ratings'
).interactive()

scatter.save('../docs/upvote_ratio_vs_bbcgf_rating.html', scale_factor=2.0) 



Further analysis of specific ingredients and cuisines will be done in the next notebook.