In [1]:
import pandas as pd
import random
from faker import Faker
from groq import Groq
from tqdm import tqdm
import time
import re
import os
from dotenv import load_dotenv

In [2]:
# === CONFIGURATION ===
fake = Faker()
random.seed(42)
Faker.seed(42)

# Charger vos avis réels
df_real = pd.read_csv("../Data/Raw/pokhara_reviews_real.csv")

# Renommer la colonne "total review" → "total_review"
df_real.rename(columns={'total review': 'total_review'}, inplace=True)

real_reviews = df_real['review'].dropna().astype(str).tolist()

In [3]:
#Charger la clé API
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# === GROQ API (100% GRATUIT) ===
client = Groq(
    api_key=GROQ_API_KEY
)

In [4]:
#Fonction pour générer les reviews synthétiques 
def generate_synthetic_review_batch(batch_size=10):
    """Génère un batch d'avis avec Groq (Llama3 ultra-rapide)"""
    reviews = []
    
    examples = random.sample(real_reviews, k=min(3, len(real_reviews)))
    example_text = "\n---\n".join([f"Review: {r}" for r in examples])
    
    prompt = f"""You are a tourist writing short reviews (50-120 words) about Pokhara city, Nepal.
                    Focus on: city life, Phewa Lake, Himalayan views, paragliding, boating, local culture.
                    Include a balanced mix of positive, neutral, and negative aspects—some reviews should highlight frustrations, disappointments, or drawbacks alongside (or instead of) the highlights.
                    DO NOT mention hotel names.

                    Style examples from real tourists (note the candid mix of praise and criticism):
                    {example_text}
                    ---

                    Write exactly {batch_size} different reviews about Pokhara.
                    Vary the overall tone: roughly 60% positive, 25% mixed/neutral, 15% negative.
                    Each: 50-80 words, first person, authentic tourist voice.
                    Add 1-2 relevant emojis to about half of the reviews (positive ones can have more emojis; neutral/mixed sparingly; negative reviews usually none). Use emojis naturally, not excessively.
                    Format:
                    Review 1: [text]
                    Review 2: [text]
                    ..."""

    try:
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.9,
            max_tokens=2000,
            top_p=0.95
        )
        
        output = response.choices[0].message.content
        
        # Parser
        matches = re.findall(r"Review \d+:(.*?)(?=Review \d+:|$)", output, re.DOTALL | re.IGNORECASE)
        
        for review_text in matches:
            cleaned = review_text.strip().replace("\n", " ").strip()
            if 30 < len(cleaned) < 500:
                reviews.append(cleaned)
        
        return reviews
        
    except Exception as e:
        print(f"⚠️ Erreur: {e}")
        return []

In [5]:
import os
import time
import random
import pandas as pd
from tqdm import tqdm
from openai import OpenAI


target_synthetic = 5000
batch_size = 8
output_file = "../Data/Real and synthetic/pokhara_reviews_final.csv"
temp_file = "../Data/Real and synthetic/pokhara_reviews_temp.csv"  # Sauvegarde intermédiaire

# Crée les dossiers
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# === GÉNÉRATION SÉCURISÉE ===
all_synth_reviews = []

# Charge les avis déjà générés (si redémarrage)
if os.path.exists(temp_file):
    df_temp = pd.read_csv(temp_file)
    all_synth_reviews = df_temp['review'].tolist()
    print(f"Reprise : {len(all_synth_reviews)} avis déjà générés")
else:
    print("Démarrage depuis zéro")

print("Génération avec Groq (sauvegarde progressive)...\n")

for i in tqdm(range(len(all_synth_reviews), target_synthetic, batch_size)):
    try:
        revs = generate_synthetic_review_batch(batch_size)  # Ta fonction
        all_synth_reviews.extend(revs)
        
        # === SAUVEGARDE TEMPORAIRE TOUS LES 100 AVIS ===
        if len(all_synth_reviews) % 100 == 0 or len(all_synth_reviews) >= target_synthetic:
            df_temp = pd.DataFrame({
                'review': all_synth_reviews,
                'source': 'synthetic_groq'
            })
            df_temp.to_csv(temp_file, index=False)
            print(f" Sauvegarde intermédiaire : {len(all_synth_reviews)} avis")
        
        time.sleep(0.5)
        
    except Exception as e:
        if "429" in str(e) or "rate limit" in str(e).lower():
            wait = 300  # 5 min
            try:
                wait = int(str(e).split("try again in ")[1].split("s")[0])
            except:
                pass
            print(f" Rate limit ! Attente {wait}s...")
            time.sleep(wait + 10)
            continue
        else:
            print(f" Erreur inattendue : {e}")
            time.sleep(5)
            continue

    if len(all_synth_reviews) >= target_synthetic:
        break

print(f"\n Génération terminée : {len(all_synth_reviews)} avis synthétiques")

# === Création df_synth ===
data_synth = []
for i, review in enumerate(all_synth_reviews[:target_synthetic]):
    data_synth.append({
        'ID': 937 + i,
        'location': 'Pokhara',
        'total_review': random.randint(200, 2500),
        'review': review,
        'source': 'synthetic_groq'
    })
df_synth = pd.DataFrame(data_synth)

# === Fusion finale ===
df_real['source'] = 'real'
df_final = pd.concat([
    df_real[['ID', 'location', 'total_review', 'review', 'source']],
    df_synth
], ignore_index=True)

# === Export final ===
df_final.to_csv(output_file, index=False)
print(f"\n FICHIER FINAL SAUVEGARDÉ : {output_file}")
print(f" Total : {len(df_final)} lignes")

Reprise : 4400 avis déjà générés
Génération avec Groq (sauvegarde progressive)...



 32%|███▏      | 24/75 [00:42<01:26,  1.70s/it]

 Sauvegarde intermédiaire : 4600 avis


 65%|██████▌   | 49/75 [01:58<01:30,  3.49s/it]

 Sauvegarde intermédiaire : 4800 avis


 99%|█████████▊| 74/75 [03:29<00:03,  3.66s/it]

 Sauvegarde intermédiaire : 5000 avis


 99%|█████████▊| 74/75 [03:33<00:02,  2.88s/it]


 Génération terminée : 5000 avis synthétiques

 FICHIER FINAL SAUVEGARDÉ : ../Data/Real and synthetic/pokhara_reviews_final.csv
 Total : 5936 lignes



