In [None]:
# --- Installation des dépendances ---
!pip install beautifulsoup4 requests pandas nltk pyarabic

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
import pyarabic.araby as araby
nltk.download('stopwords')

# --- Simple scraping function ---
def scrape_articles(urls, max_per_site=50, timeout=10):
    """Collect paragraph texts from a list of web pages.

    Each URL is fetched and parsed with BeautifulSoup's ``html.parser``.
    The first ``max_per_site`` ``<p>`` elements of the page are inspected and
    the ones whose stripped text is longer than 50 characters are kept.
    A URL that cannot be fetched (network error, timeout, HTTP error status)
    is reported on stdout and skipped, so one bad site never aborts the run.

    Args:
        urls: Iterable of page URLs to fetch.
        max_per_site: Maximum number of ``<p>`` elements inspected per URL.
            The cap is applied before the length filter, so fewer
            paragraphs than this may be kept.
        timeout: Seconds to wait for the server before giving up on a URL.

    Returns:
        A list of paragraph strings, in the order the URLs were given.
    """
    # Same headers for every request, so build them once.
    headers = {'User-Agent': 'Mozilla/5.0'}
    texts = []
    for url in urls:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            # Do not harvest paragraphs from 4xx/5xx error pages.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Échec du téléchargement de {url} : {exc}")
            continue

        # Parse the raw bytes rather than response.text: when the server sends
        # no charset, requests falls back to ISO-8859-1, which garbles Arabic.
        # BeautifulSoup detects the encoding from the document itself.
        soup = BeautifulSoup(response.content, 'html.parser')
        for p in soup.find_all('p')[:max_per_site]:
            text = p.get_text(strip=True)
            if len(text) > 50:  # keep only substantial paragraphs
                texts.append(text)
    return texts

# Arabic-language political news pages to scrape
urls = [
    "https://www.aljazeera.net/politics/",
    "https://arabic.cnn.com/politics",
    "https://www.hespress.com/politique/"
]

# Inspect up to 100 <p> elements per page, then report how many paragraphs
# were kept in total.
raw_texts = scrape_articles(urls, max_per_site=100)
print(f"Nombre de paragraphes collectés : {len(raw_texts)}")

# --- Manual score assignment (simplified example) ---
# Manual labelling is only simulated here. In practice it is done by hand or
# semi-automatically.
import random

# Keywords are matched as plain substrings of the paragraph text.
# A hit in keywords_high raises the simulated score; a hit in keywords_low
# lowers it.
keywords_high = ["انتخابات", "حكومة", "برلمان", "أخنوش", "بايدن", "فلسطين", "إسرائيل"]
keywords_low = ["كرة", "مباراة", "فن", "موسيقى"]

data = []
for text in raw_texts[:200]:  # cap the example at 200 paragraphs
    score = 5.0  # neutral baseline
    if any(k in text for k in keywords_high):
        score += random.uniform(2, 5)
    if any(k in text for k in keywords_low):
        score -= random.uniform(2, 4)
    score = max(0, min(10, score))  # clamp to the 0-10 scale
    data.append({"text": text, "score": round(score, 1)})

# Name the columns explicitly: if scraping returned nothing, the frame still
# has "text" and "score" columns, so later column lookups do not raise
# KeyError on an empty dataset.
df = pd.DataFrame(data, columns=["text", "score"])
df.to_csv("arabic_politics_dataset.csv", index=False)
print(df.head(10))

# --- Preprocessing pipeline ---
def _normalize_arabic(text):
    """Strip tashkeel (diacritics) and apply "tasheel" hamza normalization."""
    text = araby.strip_tashkeel(text)
    return araby.normalize_hamza(text, method="tasheel")


# Tokens are compared with the stop list *after* normalization, so the stop
# list must hold normalized forms too. Otherwise any stop word written with a
# hamza form or diacritics could never match. The raw forms are kept as well,
# so nothing that matched before stops matching.
_raw_stop_words = set(stopwords.words('arabic'))
stop_words = _raw_stop_words | {_normalize_arabic(word) for word in _raw_stop_words}
stemmer = ISRIStemmer()


def preprocess_arabic(text):
    """Clean, tokenize, filter and stem one Arabic text.

    Steps, in order:
      1. strip diacritics and normalize hamza forms;
      2. drop every character outside U+0600-U+06FF that is not whitespace,
         then collapse whitespace runs to a single space;
      3. tokenize with ``araby.tokenize``;
      4. discard stop words and tokens of length 2 or less;
      5. stem the remaining tokens with the ISRI stemmer.

    Args:
        text: Raw text string.

    Returns:
        The stemmed tokens joined by single spaces (possibly an empty string).
    """
    # Cleaning
    text = _normalize_arabic(text)
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # keep only the Arabic block + whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization
    tokens = araby.tokenize(text)

    # Stop-word removal + stemming (the length filter runs before stemming)
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words and len(token) > 2]

    return " ".join(tokens)

# Run the preprocessing pipeline on every paragraph and store the result in a
# new column next to the raw text.
df['clean_text'] = df['text'].apply(preprocess_arabic)
# Save the enriched dataset to a separate CSV file (row index omitted).
df.to_csv("arabic_politics_dataset_clean.csv", index=False)
print("Dataset nettoyé sauvegardé !")
# Preview raw text, cleaned text and score side by side.
print(df[['text', 'clean_text', 'score']].head())