In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import random
from urllib.parse import urljoin
import re
import json
from datetime import datetime

In [3]:
class MarmitonScraper:
    """Scraper for marmiton.org recipe search results and recipe pages.

    Typical use: ``scrape_all`` collects recipe URLs per search term, fetches
    each recipe page, and returns a list of flat dicts that ``save_to_csv``
    can write to disk.
    """

    def __init__(self):
        """Create an HTTP session that sends a browser-like User-Agent."""
        self.base_url = "https://www.marmiton.org"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def get_recipe_urls(self, category="plat-principal", max_pages=3):
        """Collect recipe URLs from the search result pages for ``category``.

        Args:
            category: Search term sent as the ``aqt`` query parameter.
            max_pages: Number of result pages to request, starting at page 1.

        Returns:
            A list of absolute recipe URLs without duplicates, in the order
            they were first seen. Pages that fail (network error or HTTP
            error status) are reported on stdout and skipped.
        """
        urls = []
        for page in range(1, max_pages + 1):
            try:
                # Let requests build the query string so the search term is
                # URL-encoded correctly.
                response = self.session.get(
                    f"{self.base_url}/recettes/recherche.aspx",
                    params={'aqt': category, 'page': page},
                    timeout=10,
                )
                # Do not parse an error page (403/404/5xx) as a result page.
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                links = soup.find_all('a', href=re.compile(r'/recettes/recette_'))

                page_urls = [urljoin(self.base_url, link.get('href')) for link in links if link.get('href')]
                urls.extend(page_urls)
                print(f"✓ Page {page} - {len(page_urls)} URLs trouvées")
                # Random pause between result pages.
                time.sleep(random.uniform(1, 2))
            except Exception as e:
                print(f"❌ Erreur page {page}: {e}")
        # dict.fromkeys removes duplicates while keeping first-seen order,
        # so repeated runs return the URLs in a stable order.
        return list(dict.fromkeys(urls))

    def extract_recipe_details(self, url):
        """Fetch one recipe page and return its data as a flat dict.

        Args:
            url: Absolute URL of the recipe page.

        Returns:
            A dict with keys ``url``, ``titre``, ``type_recette``, ``cuisine``,
            ``ingredients``, ``etapes`` and ``date_scraping``; or ``None`` if
            the page could not be fetched or parsed (the error is printed).
            ``ingredients`` and ``etapes`` hold the ``str()`` of a Python list.
        """
        try:
            response = self.session.get(url, timeout=10)
            # An HTTP error page must not be recorded as a recipe.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            title = self._extract_title(soup)

            # Classification fields used for the summary statistics.
            return {
                'url': url,
                'titre': title,
                'type_recette': self._classify_type(title),
                'cuisine': self._classify_cuisine(title),
                'ingredients': str(self._extract_ingredients(soup)),
                'etapes': str(self._extract_steps(soup)),
                'date_scraping': datetime.now().isoformat()
            }
        except Exception as e:
            # Best-effort scraping: report the failure and let the caller skip it.
            print(f"❌ Erreur recette {url}: {e}")
            return None

    def _extract_title(self, soup):
        """Return the stripped text of the first ``<h1>``, or ``"Sans titre"``."""
        title = soup.find('h1')
        return title.text.strip() if title else "Sans titre"

    def _classify_type(self, title):
        """Guess the course type from keywords contained in the title.

        Matching is a case-insensitive substring test; dessert keywords are
        checked before starter keywords, and ``'plat'`` is the fallback.
        """
        title = title.lower()
        if any(word in title for word in ('tarte', 'gâteau', 'mousse', 'dessert')):
            return 'dessert'
        if any(word in title for word in ('entrée', 'salade', 'soupe')):
            return 'entree'
        return 'plat'

    def _classify_cuisine(self, title):
        """Return ``'italienne'`` if the title mentions pizza or pasta, else ``'française'``."""
        title = title.lower()
        if 'pizza' in title or 'pasta' in title:
            return 'italienne'
        return 'française'

    def _extract_ingredients(self, soup):
        """Return ``"<quantity> <name>"`` strings for the page's ingredients.

        Names and quantities are selected separately and paired by position;
        ``zip`` stops at the shorter list.
        NOTE(review): this assumes every ingredient has both a name span and a
        quantity span; otherwise pairs would shift. Confirm against the
        page markup.
        """
        names = soup.select('span[class*="IngredientName"]')
        qtys = soup.select('span[class*="IngredientQuantity"]')
        return [f"{q.text.strip()} {n.text.strip()}" for n, q in zip(names, qtys)]

    def _extract_steps(self, soup):
        """Return the stripped text of every ``div`` whose class contains ``RecipeStep``."""
        items = soup.select('div[class*="RecipeStep"]')
        return [item.get_text().strip() for item in items]

    def scrape_all(self, categories=('plat-principal',), max_pages=1):
        """Scrape every recipe found for each search term in ``categories``.

        Args:
            categories: Iterable of search terms; a single string is treated
                as one term rather than iterated character by character.
            max_pages: Number of result pages to read per search term.

        Returns:
            A list of recipe dicts (see ``extract_recipe_details``). A URL
            returned by several search terms is scraped only once; recipes
            that failed to download are left out.
        """
        if isinstance(categories, str):
            categories = [categories]

        all_results = []
        seen = set()  # URLs already attempted under an earlier search term
        for cat in categories:
            print(f"\n📂 Catégorie : {cat}")
            urls = [u for u in self.get_recipe_urls(cat, max_pages) if u not in seen]
            seen.update(urls)
            for i, url in enumerate(urls, 1):
                print(f"  [{i}/{len(urls)}] Scraping...")
                data = self.extract_recipe_details(url)
                if data:
                    all_results.append(data)
                # Random pause between recipe pages.
                time.sleep(random.uniform(0.5, 1.5))
        return all_results

    def save_to_csv(self, recipes, filename='marmiton.csv'):
        """Write ``recipes`` to ``filename`` as CSV and return the DataFrame.

        Args:
            recipes: List of recipe dicts.
            filename: Output path; written as UTF-8 with a BOM (``utf-8-sig``).

        Returns:
            The DataFrame that was written, or an empty DataFrame (and no file)
            when ``recipes`` is empty.
        """
        if not recipes:
            print("Aucune donnée à sauvegarder.")
            return pd.DataFrame()
        df = pd.DataFrame(recipes)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"\n✅ {len(df)} recettes sauvegardées dans {filename}")
        return df

if __name__ == "__main__":
    # Driver: scrape three search terms, save the result to CSV, then print
    # a short summary of what was collected.
    scraper = MarmitonScraper()

    print("🚀 Démarrage du scraping Marmiton...")

    # 1. Run the scraping (10 result pages per search term).
    recipes = scraper.scrape_all(
        categories=['plat-principal', 'entree', 'dessert'],
        max_pages=10
    )

    # 2. Save to CSV and keep the returned DataFrame for the statistics.
    df = scraper.save_to_csv(recipes)

    # 3. Print statistics only when something was collected.
    if not df.empty:
        print("\n📊 Statistiques:")
        print(f"  - Total recettes collectées: {len(recipes)}")

        if 'type_recette' in df.columns:
            print(f"  - Types: {df['type_recette'].value_counts().to_dict()}")
        if 'cuisine' in df.columns:
            print(f"  - Cuisines: {df['cuisine'].value_counts().to_dict()}")
        # Same guard as the other columns, so a missing column cannot raise KeyError.
        if 'titre' in df.columns:
            print(f"  - Top 5 premières recettes :\n{df['titre'].head()}")
    else:
        print("⚠ Aucune donnée n'a pu être extraite.")

🚀 Démarrage du scraping Marmiton...

📂 Catégorie : plat-principal
✓ Page 1 - 21 URLs trouvées
✓ Page 2 - 21 URLs trouvées
✓ Page 3 - 21 URLs trouvées
✓ Page 4 - 21 URLs trouvées
✓ Page 5 - 21 URLs trouvées
✓ Page 6 - 21 URLs trouvées
✓ Page 7 - 21 URLs trouvées
✓ Page 8 - 21 URLs trouvées
✓ Page 9 - 21 URLs trouvées
✓ Page 10 - 21 URLs trouvées
  [1/156] Scraping...
  [2/156] Scraping...
  [3/156] Scraping...
  [4/156] Scraping...
  [5/156] Scraping...
  [6/156] Scraping...
  [7/156] Scraping...
  [8/156] Scraping...
  [9/156] Scraping...
  [10/156] Scraping...
  [11/156] Scraping...
  [12/156] Scraping...
  [13/156] Scraping...
  [14/156] Scraping...
  [15/156] Scraping...
  [16/156] Scraping...
  [17/156] Scraping...
  [18/156] Scraping...
  [19/156] Scraping...
  [20/156] Scraping...
  [21/156] Scraping...
  [22/156] Scraping...
  [23/156] Scraping...
  [24/156] Scraping...
  [25/156] Scraping...
  [26/156] Scraping...
  [27/156] Scraping...
  [