<a href="https://colab.research.google.com/github/ludoveltz/hackaton_projet_W3D5/blob/main/Analyser_fiche_produit_LV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Installation des d√©pendances
!pip install transformers
!pip install torch
!pip install pandas
!pip install numpy



In [15]:
class SolweigIzarLVAnalyzer:
    def __init__(self):
        # Patterns d'extraction am√©lior√©s
        self.patterns = {
            "nom": r"^(.*?)(?:\d+(?:\s*\d+)*[,\s]*00\s*‚Ç¨|$)",
            "prix": r"(\d+(?:\s|\u202f)*\d*)(?:[,\s]*00)?\s*‚Ç¨",
            "dimensions": r"(\d+(?:\.\d+)?)\s*x\s*(\d+(?:\.\d+)?)\s*x\s*(\d+(?:\.\d+)?)\s*cm",
            "poids": r"(?:Poids\s*:\s*)?(\d+(?:[,.]\d+)?)\s*kg",
            "bandouliere_longueur": r"Longueur.*?:\s*(\d+)(?:\s*-\s*(\d+))?\s*cm",
            "poches": r"(?:Poche[s]?\s+[^‚Ä¢]+)",
            "reference": r'(?:R√©f√©rence|Ref\.?)\s*:?\s*([A-Z]\d{5}|[A-Z]{2}\d{4})',
            "url": r"https://[^\s]+",
            "collection": r"(?:Collection|√âdition)\s+([^‚Ä¢\n,]+)",
            "collaboration": r"(?:x|√ó)\s+([^‚Ä¢\n,]+)"
        }

        # Dictionnaires de reconnaissance
        self.keywords = {
            "doublure": ["doublure en tissu", "doublure en microfibre", "doublure textile"],
            "metal": ["m√©tal dor√©", "palladium", "ruth√©nium", "m√©tal argent√©"],
            "fermeture": ["fermeture √† glissi√®re", "zip", "fermeture magn√©tique"],
            "securite": ["cadenas", "serrure S-lock", "serrure"],
            "poignees": ["poign√©es en cuir", "poign√©es roul√©es", "anses"],
            "toile": ["Toile Monogram", "Damier Azur", "Damier Graphite"],
            "cuir": ["cuir de vache", "cuir naturel", "cuir grain√©", "cuir roul√©"],
            "occasions": ["quotidien", "ville", "voyage", "soir√©e"]
        }

    def extract_clean_value(self, text, pattern_list):
        """Extrait la premi√®re correspondance des patterns donn√©s"""
        for pattern in pattern_list:
            if match := re.search(pattern, text, re.I):
                return match.group(0).strip()
        return None

    def split_features(self, text):
        """S√©pare les caract√©ristiques par le symbole ‚Ä¢"""
        return [item.strip() for item in text.split('‚Ä¢') if item.strip()]

    def extract_product_info(self, text):
        text = ' '.join(text.split())  # Normalisation du texte
        features = self.split_features(text)

        data = {
            'nom': None,
            'reference': None,
            'prix': None,
            'collection': None,
            'collaboration': None,
            'url': None,
            'longueur': None,
            'hauteur': None,
            'largeur': None,
            'poids': None,
            'toile_principale': None,
            'cuir': None,
            'doublure': None,
            'metal': None,
            'type_fermeture': [],
            'nombre_poches': None,
            'types_poches': [],
            'bandouliere_presence': False,
            'bandouliere_amovible': False,
            'bandouliere_ajustable': False,
            'bandouliere_longueur_min': None,
            'bandouliere_longueur_max': None,
            'poignees_type': None,
            'poignees_nombre': None,
            'elements_securite': [],
            'contenance': [],
            'pays_production': None,
            'type_sac': None,
            'occasions': [],
            'serie_limitee': False
        }

        # Extraction du nom (premi√®re partie avant le prix)
        if nom_match := re.match(self.patterns["nom"], text):
            data['nom'] = nom_match.group(1).strip()

        # Prix
        if prix_match := re.search(self.patterns["prix"], text):
            data['prix'] = float(prix_match.group(1).replace(' ', '').replace('\u202f', ''))

        # Dimensions
        if dim_match := re.search(self.patterns["dimensions"], text):
            data['longueur'] = float(dim_match.group(1))
            data['hauteur'] = float(dim_match.group(2))
            data['largeur'] = float(dim_match.group(3))

        # Traitement des caract√©ristiques individuelles
        for feature in features:
            feature = feature.strip().lower()

            # Doublure
            if "doublure" in feature:
                data['doublure'] = self.extract_clean_value(feature, self.keywords["doublure"])

            # M√©tal
            elif "m√©tal" in feature:
                data['metal'] = self.extract_clean_value(feature, self.keywords["metal"])

            # Fermeture
            elif "fermeture" in feature:
                if fermeture := self.extract_clean_value(feature, self.keywords["fermeture"]):
                    data['type_fermeture'].append(fermeture)

            # Poches
            elif "poche" in feature:
                data['types_poches'].append(feature)
                data['nombre_poches'] = len(data['types_poches'])

            # Bandouli√®re
            elif "bandouli√®re" in feature:
                data['bandouliere_presence'] = True
                data['bandouliere_amovible'] = "amovible" in feature
                data['bandouliere_ajustable'] = "ajustable" in feature
                if longueur_match := re.search(self.patterns["bandouliere_longueur"], feature):
                    data['bandouliere_longueur_min'] = float(longueur_match.group(1))
                    if longueur_match.group(2):
                        data['bandouliere_longueur_max'] = float(longueur_match.group(2))

            # Contenance
            elif "peut contenir" in feature:
                data['contenance'] = [item.strip() for item in feature.split(':')[1].split(',')]

            # √âl√©ments de s√©curit√©
            for securite in self.keywords["securite"]:
                if securite.lower() in feature:
                    data['elements_securite'].append(securite)

        # Collection et collaboration
        if coll_match := re.search(self.patterns["collection"], text):
            data['collection'] = coll_match.group(1).strip()
        if collab_match := re.search(self.patterns["collaboration"], text):
            data['collaboration'] = collab_match.group(1).strip()

        # Pays de production
        if "france" in text.lower():
            data['pays_production'] = "France"

        return data

class SolweigIzarDataManager:
    def __init__(self):
        drive.mount('/content/drive')
        self.base_path = "/content/drive/MyDrive/Solweig_Izar/LV_DNA_Dataset"
        self.main_csv = "lv_dna_master.csv"
        os.makedirs(self.base_path, exist_ok=True)

    def update_dataset(self, new_data):
        df = pd.DataFrame([new_data])
        csv_path = f"{self.base_path}/{self.main_csv}"

        if os.path.exists(csv_path):
            existing_df = pd.read_csv(csv_path)
            df = pd.concat([existing_df, df], ignore_index=True)

        df.to_csv(csv_path, index=False)
        print(f"‚úÖ Dataset mis √† jour: {csv_path}")
        return df

def process_new_products():
    analyzer = SolweigIzarLVAnalyzer()
    data_manager = SolweigIzarDataManager()

    uploaded = files.upload()

    for filename, content in uploaded.items():
        if filename.endswith('.pdf'):
            print(f"\nüîÑ Analyse de {filename}...")
            reader = PdfReader(io.BytesIO(content))
            text = " ".join([page.extract_text() for page in reader.pages])

            product_data = analyzer.extract_product_info(text)

            # Affichage structur√© des r√©sultats
            print("\nüìä Donn√©es extraites:")
            for category, value in product_data.items():
                if value:  # Affiche uniquement les valeurs non nulles
                    print(f"{category}: {value}")

            dataset = data_manager.update_dataset(product_data)

    return dataset

# Lancement
dataset = process_new_products()




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving Sac Speedy BandouliSÃåre 30.pdf to Sac Speedy BandouliSÃåre 30.pdf
Saving Pochette M‚Äötis.pdf to Pochette M‚Äötis.pdf

üîÑ Analyse de Sac Speedy BandouliSÃåre 30.pdf...

üìä Donn√©es extraites:
nom: Sac Speedy Bandouli√®re
prix: 1600.0
collaboration: voyageurs des ann√©es 30. Le mod√®le d√©voile un design caract√©ristique
longueur: 30.0
hauteur: 21.0
largeur: 17.0
doublure: doublure en tissu
metal: m√©tal dor√©
type_fermeture: ['fermeture √† glissi√®re']
nombre_poches: 1
types_poches: ['poche plate int√©rieure']
bandouliere_presence: True
elements_securite: ['cadenas', 'cadenas']
pays_production: France
‚úÖ Dataset mis √† jour: /content/drive/MyDrive/Solweig_Izar/LV_DNA_Dataset/lv_dna_master.csv

üîÑ Analyse de Pochette M‚Äötis.pdf...

üìä Donn√©es extraites:
nom: Pochette M√©tis
prix: 2200.0
collaboration: 19 x 7 cm (Longueur x Hauteur x Largeur)
longueur: 25.0
hauteur: 19.0
largeur: 7.0
doublure: doublure en tissu
metal: m√©tal dor√©
type_fermeture: ['fermeture √† glissi√