In [1]:
import os
import json
import base64
import re
import shutil
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Télécharger les ressources nécessaires pour NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

In [3]:
class TextCleaner:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
    
    def clean_text(self, text):
        text = re.sub(r'\s+', ' ', text).strip()  # Supprimer les espaces inutiles
        text = re.sub(r'\d+', '', text)  # Supprimer les nombres
        text = text.lower()

        doc = self.nlp(text)
        tokens = [token.lemma_ for token in doc if not token.is_stop]  # Lemmatisation et suppression des stopwords
        
        return " ".join(tokens)

In [4]:
"""Classe représentant un post du forum, incluant le titre, l'URL, 
    la description et les solutions associées."""
class Post:
    def __init__(self, title, url, description, solutions):
        self.title = title
        self.url = url
        self.description = description
        self.solutions = solutions

In [5]:
import json
import os
import base64
import hashlib
import re

saved_images = {}  # Dictionary to avoid duplicate images

class ImageProcessor:
    def __init__(self, output_dir):
        """Initialize the image processor and create the output directory."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def save_images(self, base64_images, post_id, sol_idx):
        """Decode and save Base64 images as unique PNG files."""
        filenames = []
        for img_idx, base64_img in enumerate(base64_images):
            try:
                if not base64_img.strip():  # Ignore empty strings
                    continue  

                # Generate a unique hash to avoid duplicates
                img_hash = hashlib.md5(base64_img.encode()).hexdigest()[:8]
                img_filename = f"post{post_id}_sol{sol_idx}_img{img_idx}_{img_hash}.png"

                # Avoid duplicate storage
                if base64_img in saved_images:
                    filenames.append(saved_images[base64_img])
                    continue

                # Decode and save image
                img_data = base64.b64decode(base64_img)
                img_path = os.path.join(self.output_dir, img_filename)

                with open(img_path, "wb") as img_file:
                    img_file.write(img_data)

                filenames.append(img_filename)
                saved_images[base64_img] = img_filename  # Save reference

            except Exception as e:
                print(f"⚠️ Error decoding image {img_idx}: {e}")

        return filenames

class ForumPostProcessor:
    def __init__(self, input_file, output_file, output_dir):
        """Initialize the ForumPostProcessor."""
        self.input_file = input_file
        self.output_file = output_file
        self.image_processor = ImageProcessor(output_dir)

    def process_text_with_images(self, text, images):
        """
        Replaces each `[ image _ ]` placeholder with numbered `[image_1]`, `[image_2]`, etc.
        Ensures placeholders match image order. If extra images exist, appends them at the end.
        """
        placeholders = re.findall(r"\[ ?image ?_ ?\]", text)
        num_placeholders = len(placeholders)
        num_images = len(images)

        # Step 1: Replace placeholders with indexed image references
        for i in range(min(num_placeholders, num_images)):
            text = re.sub(r"\[ ?image ?_ ?\]", f"[image_{i+1}]", text, 1)

        # Step 2: If extra images exist, append them at the end
        if num_images > num_placeholders:
            extra_images = " ".join(f"[image_{i+1}]" for i in range(num_placeholders, num_images))
            text = text.strip() + " " + extra_images

        return text.strip()

    def process_post(self, post, post_idx):
        """Processes a single forum post."""
        description = post.get("description", {})
        if isinstance(description, str):  
            description = {"text": description, "images": []}

        description_text = description.get("text", "")
        description_images = description.get("images", [])

        extracted_description_images = self.image_processor.save_images(description_images, post_idx, "desc")
        description_text = self.process_text_with_images(description_text, extracted_description_images)

        solutions = []
        for sol_idx, solution in enumerate(post.get("solutions", [])):
            solution_text = solution.get("text", "")
            solution_images = solution.get("images", [])

            extracted_solution_images = self.image_processor.save_images(solution_images, post_idx, sol_idx)
            solution_text = self.process_text_with_images(solution_text, extracted_solution_images)

            solutions.append({"text": solution_text, "images": extracted_solution_images})

        return {
            "title": post["title"],
            "url": post["url"],
            "description": {"text": description_text, "images": extracted_description_images},
            "solutions": solutions
        }

    def process(self):
        """Loads, processes, and saves forum posts."""
        with open(self.input_file, 'r', encoding='utf-8') as f:
            forum_posts = json.load(f)

        processed_posts = [self.process_post(post, idx) for idx, post in enumerate(forum_posts)]

        with open(self.output_file, "w", encoding="utf-8") as f:
            json.dump(processed_posts, f, ensure_ascii=False, indent=4)

        print(f"✅ Processing complete! Data saved to {self.output_file}")

# 🔹 Execute Processing
input_json_file = "/kaggle/input/preprocessed-data/pre-processed_data.json"
output_json_file = "/kaggle/working/processed_forum_data_cleaned.json"
output_dir = "/kaggle/working/images"

processor = ForumPostProcessor(input_json_file, output_json_file, output_dir)
processor.process()


✅ Processing complete! Data saved to /kaggle/working/processed_forum_data_cleaned.json


In [6]:
!zip -r /kaggle/working/images.zip /kaggle/working/images


  adding: kaggle/working/images/ (stored 0%)
  adding: kaggle/working/images/post3050_sol0_img0_91077037.png (deflated 13%)
  adding: kaggle/working/images/post149_sol6_img1_bd20e4a3.png (deflated 6%)
  adding: kaggle/working/images/post2835_sol1_img0_a9b55704.png (deflated 2%)
  adding: kaggle/working/images/post6776_sol1_img0_22e65eea.png (deflated 2%)
  adding: kaggle/working/images/post3166_sol5_img1_d30e6d14.png (deflated 5%)
  adding: kaggle/working/images/post1953_sol4_img0_99d47cac.png (deflated 1%)
  adding: kaggle/working/images/post1643_sol11_img0_91a5aa4f.png (deflated 10%)
  adding: kaggle/working/images/post1355_sol5_img1_af43ce3a.png (deflated 8%)
  adding: kaggle/working/images/post605_sol11_img0_dec8f1a8.png (deflated 2%)
  adding: kaggle/working/images/post1703_sol5_img0_3376735a.png (deflated 6%)
  adding: kaggle/working/images/post2132_sol0_img1_c96b0830.png (deflated 3%)
  adding: kaggle/working/images/post2984_sol3_img0_ab518c3e.png (deflated 5%)
  ad