In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from utils import get_api_key

# TODO: clean up!

In [2]:
# Guardian Content API: search endpoint plus the key used to authenticate
BASE_URL = "https://content.guardianapis.com/search"
API_KEY = get_api_key("guardian_api_key")

In [3]:
# Query window: the 365 days ending now, rendered as YYYY-MM-DD strings
today = datetime.now()
one_year_ago = today - timedelta(days=365)
from_date, to_date = (
    moment.date().isoformat() for moment in (one_year_ago, today)
)

def fetch_articles(page=1, page_size=10, timeout=30):
    """Fetch one page of "recipe" search results from the food section.

    Sends a GET request to ``BASE_URL`` restricted to the module-level
    ``from_date``/``to_date`` window and returns the decoded JSON payload.

    Args:
        page: 1-based page number to request.
        page_size: Number of results requested per page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook on a stalled connection.

    Returns:
        The parsed JSON response as a dict.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        "api-key": API_KEY,
        "section": "food",
        "q": "recipe",
        "from-date": from_date,
        "to-date": to_date,
        "page": page,
        "page-size": page_size,
        "show-fields": "byline,headline,trailText,thumbnail,bodyText",
        "show-tags": "keyword,contributor,tone",
    }
    response = requests.get(BASE_URL, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

def _article_to_record(article):
    """Flatten one search result into a flat dict of recipe columns."""
    # "fields" and "tags" may be missing from a result; treat them as empty
    # instead of raising KeyError and aborting the whole download.
    fields = article.get("fields") or {}
    tags = article.get("tags") or []
    tag_titles = [tag["webTitle"] for tag in tags]
    contributors = [
        tag["webTitle"] for tag in tags if tag.get("type") == "contributor"
    ]
    return {
        "title": fields.get("headline", article["webTitle"]),
        # Fall back to contributor tags when the article has no byline.
        "author": fields.get("byline", "; ".join(contributors)),
        "url": article["webUrl"],
        "publication_date": article["webPublicationDate"],
        "summary": fields.get("trailText", ""),
        "tags": ", ".join(tag_titles),
        "thumbnail": fields.get("thumbnail", ""),
        "body": fields.get("bodyText", ""),
    }


def get_recipes():
    """Download every page of results and return them as a list of dicts.

    Pages are requested through ``fetch_articles`` starting at page 1 until a
    page comes back empty or the page count reported by the response has been
    reached.

    Returns:
        A list with one dict per article, holding the keys ``title``,
        ``author``, ``url``, ``publication_date``, ``summary``, ``tags``
        (comma-joined tag titles), ``thumbnail`` and ``body``.
    """
    page = 1
    all_articles = []
    while True:
        response = fetch_articles(page)["response"]
        articles = response["results"]
        if not articles:
            break
        all_articles.extend(_article_to_record(article) for article in articles)
        page += 1
        if page > response["pages"]:
            break
    return all_articles

# Pull every page of results, then tabulate the records in a DataFrame
recipes = get_recipes()
df = pd.DataFrame(data=recipes)

In [None]:
# Preview the first five fetched rows
df.head(n=5)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Set global DPI and default size for all plots
plt.rcParams['figure.dpi'] = 150
plt.rcParams['figure.figsize'] = (8, 6)


def _plot_top_counts(counts, title, xlabel, ylabel, color):
    """Draw one bar chart of a value-count Series with slanted tick labels."""
    fig, ax = plt.subplots()
    counts.plot(kind='bar', color=color, title=title, ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()


# Split the comma-joined 'tags' strings into lists. Values that are not
# strings (e.g. lists produced by an earlier run of this cell) are left
# untouched, so re-running the cell does not wipe the column: `.str.split`
# returns NaN for non-string values. Empty pieces are dropped so an article
# without tags yields [] rather than [''].
df['tags'] = df['tags'].map(
    lambda value: [piece for piece in value.split(', ') if piece]
    if isinstance(value, str) else value
)

# Count occurrences of authors and tags
author_counts = df['author'].value_counts().head(10)  # Top 10 authors
all_tags = [tag for tags_list in df['tags'].dropna() for tag in tags_list if tag]
tags_counts = pd.Series(all_tags).value_counts().head(10)  # Top 10 tags

# Plot bar chart for top 10 authors
_plot_top_counts(
    author_counts,
    title='Top 10 Authors by Article Count',
    xlabel='Authors',
    ylabel='Number of Articles',
    color='skyblue',
)

# Plot bar chart for top 10 tags
_plot_top_counts(
    tags_counts,
    title='Top 10 Tags by Count',
    xlabel='Tags',
    ylabel='Number of Occurrences',
    color='orange',
)


In [None]:
# Build the text handed to the classifier:
#   "<title>: <summary> TAGS: <tag, tag, ...>"
# The space before "TAGS:" keeps the summary's last word from fusing with the
# marker. Tags are joined when stored as a list, passed through when still a
# plain string, and contribute nothing otherwise (e.g. NaN).
df['short_context'] = (
    df['title'] + ': ' + df['summary'] + ' TAGS: '
    + df['tags'].map(
        lambda tags: ', '.join(tags) if isinstance(tags, (list, tuple))
        else tags if isinstance(tags, str)
        else ''
    )
)
df.head()

In [None]:
from transformers import pipeline
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os

# Initialize zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Broad cuisine categories used as candidate labels. The label text itself is
# what the model scores, so spelling matters: "Dessert" (the course), not
# "Desert".
cuisine_labels = [
    "Italian", "Indian", "Asian", "Mexican", "European",
    "American", "Mediterranean",
    "Middle Eastern", "African", "Dessert", "Other"
]

def classify_batch(batch_texts):
    """Return the top-scoring cuisine label for each text in the batch.

    Args:
        batch_texts: List of strings to classify against ``cuisine_labels``.

    Returns:
        A list with one label per input text, in input order. An empty batch
        returns an empty list without calling the model.
    """
    if len(batch_texts) == 0:
        return []
    results = classifier(batch_texts, cuisine_labels, multi_label=False)
    # NOTE(review): some transformers versions appear to return a bare dict
    # rather than a one-element list when given a single input; normalise so
    # the comprehension below always iterates over result dicts.
    if isinstance(results, dict):
        results = [results]
    return [result['labels'][0] for result in results]

# Size the chunks from the CPU count. `os.cpu_count()` may return None when
# the count cannot be determined, so fall back to 1 to keep the division valid.
num_threads = os.cpu_count() or 1
print(f"Detected {num_threads} CPU cores; chunks are classified sequentially.")

# Split the data into chunks so the progress bar advances once per chunk
chunk_size = len(df) // num_threads + 1
chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
print(f"Splitting {len(df)} rows into chunks of len = {chunk_size}.")


# Classify the chunks one after another (no thread pool is used) with a
# loading bar; results are appended in row order.
predicted_cuisines = []
for chunk in tqdm(chunks, desc="Processing chunks"):
    predicted_cuisines.extend(classify_batch(chunk['short_context'].tolist()))

# Add the predictions to the DataFrame
df['cuisine'] = predicted_cuisines

In [None]:
# Preview each title next to its predicted cuisine
df.loc[:, ['title', 'cuisine']].head()


In [None]:
import matplotlib.pyplot as plt

# Tally how many recipes were assigned to each cuisine
cuisine_counts = df['cuisine'].value_counts()

# Render the tallies as a high-resolution bar chart
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
cuisine_counts.plot(kind='bar', color='skyblue', title='Recipe Counts by Cuisine', ax=ax)
ax.set_xlabel('Cuisine')
ax.set_ylabel('Number of Recipes')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
from pathlib import Path

# Write the enriched recipes to disk. The output directory is created first so
# the cell also works on a checkout that has no ../data folder yet.
output_path = Path("../data/guardian_recipes.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
# Report the row count and path that were actually written.
print(f"Saved {len(df)} recipes to '{output_path}'")