In [None]:
from google.colab import drive
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
from typing import Dict, List

# Mount Google Drive at /content/drive so the XML export stored there can be read.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Path of the XML export on the mounted Drive; handed to MaxidomXMLParser further down.
data_path = '/content/drive/MyDrive/maxidom_contest/export_fid_spool_clear_regexp5-utf8.xml'

In [None]:
class MaxidomXMLParser:
    """Turn a shop catalogue XML document into pandas DataFrames.

    The parser reads ``category``, ``offer`` and ``promo`` elements and exposes
    them as flat tables, plus a few helpers for inspecting the ``param_*``
    columns produced by :meth:`parse_offers_with_params`.

    Accepted sources:
      * ``str`` naming an existing file - read as UTF-8 and parsed lazily;
      * ``os.PathLike`` - must name an existing file;
      * ``str`` / ``bytes`` holding the XML document itself - parsed at once.

    A leading BOM and leading whitespace are removed before parsing in every
    case, because the XML parser rejects anything in front of the declaration.
    """

    # Prefix given to columns created from <param> elements.
    _PARAM_PREFIX = 'param_'

    # Timestamp layouts tried in order; the first one is the layout the parser
    # has always accepted, the rest are additional tolerated variants.
    _DATE_FORMATS = (
        '%Y-%m-%d %H:%M',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M',
        '%Y-%m-%d',
    )

    def __init__(self, xml_source):
        """Prepare the source for parsing.

        Args:
            xml_source: file path (``str`` or ``os.PathLike``) or XML text
                (``str`` / ``bytes`` / ``bytearray``).

        Raises:
            FileNotFoundError: an ``os.PathLike`` does not name a file.
            TypeError: ``xml_source`` is of an unsupported type.
            xml.etree.ElementTree.ParseError: inline XML text is malformed.
        """
        import os
        import io

        self.xml_source = xml_source
        self.is_file = False
        self.cleaned_file = None
        # Always defined, so later lookups never hit a missing attribute.
        self.root = None

        source = xml_source
        if isinstance(source, os.PathLike):
            source = os.fsdecode(source)
            if not os.path.isfile(source):
                raise FileNotFoundError(f"XML file not found: {source}")

        if isinstance(source, str):
            if os.path.isfile(source):
                self.is_file = True
                print("Подготовка файла для парсинга...")
                # NOTE: the whole file is held in memory; only the element
                # tree is built lazily.
                with open(source, 'r', encoding='utf-8') as f:
                    content = f.read()
                self.cleaned_file = io.StringIO(content.lstrip('\ufeff').lstrip())
            else:
                self.root = ET.fromstring(source.lstrip('\ufeff').lstrip())
        elif isinstance(source, (bytes, bytearray)):
            raw = bytes(source)
            if raw.startswith(b'\xef\xbb\xbf'):
                raw = raw[3:]
            self.root = ET.fromstring(raw.lstrip())
        else:
            raise TypeError(
                "xml_source must be a file path or XML text (str/bytes), "
                f"got {type(xml_source).__name__}"
            )

    def _get_root(self):
        """Return the document root, building and caching the tree on first use."""
        if self.root is None and self.is_file:
            try:
                self.cleaned_file.seek(0)
                self.root = ET.parse(self.cleaned_file).getroot()
            except ET.ParseError as e:
                print(f"Ошибка парсинга: {e}")
                raise
        return self.root

    def _iter_elements(self, tag: str):
        """Yield every element named ``tag`` found anywhere in the document.

        A tree that is already in memory (inline XML, or a file whose root was
        cached by :meth:`_get_root`) is walked directly. Otherwise the file
        buffer is streamed with ``iterparse`` and each element is cleared once
        the consumer asks for the next one.

        Both paths search the whole document, so file and inline sources
        return the same rows.
        """
        if self.root is not None:
            yield from self.root.iter(tag)
            return
        if not self.is_file:
            return

        self.cleaned_file.seek(0)
        try:
            for _event, elem in ET.iterparse(self.cleaned_file, events=('end',)):
                if elem.tag == tag:
                    yield elem
                    # The consumer is done with the element by the time the
                    # generator resumes, so its children can be released.
                    elem.clear()
        except ET.ParseError as e:
            print(f"Ошибка парсинга: {e}")
            raise

    def _param_columns(self, df: pd.DataFrame) -> List[str]:
        """Return the column labels of ``df`` that start with ``param_``."""
        return [
            col for col in df.columns
            if isinstance(col, str) and col.startswith(self._PARAM_PREFIX)
        ]

    def parse_categories(self) -> pd.DataFrame:
        """Return one row per ``category`` element.

        Columns: ``category_id`` (``id`` attribute), ``parent_id``
        (``parentId`` attribute or ``None``) and ``name`` (text of the ``NAME``
        child or ``None``).
        """
        categories_data = []
        for category in self._iter_elements('category'):
            name_elem = category.find('NAME')
            categories_data.append({
                'category_id': category.get('id'),
                'parent_id': category.get('parentId', None),
                'name': name_elem.text if name_elem is not None else None
            })
        return pd.DataFrame(categories_data)

    def parse_offers(self, chunk_size: int = None) -> pd.DataFrame:
        """Return one row per ``offer`` element.

        Args:
            chunk_size: when truthy, a progress line is printed every
                ``chunk_size`` offers.

        Returns:
            DataFrame with the fixed offer fields plus a ``params`` column
            holding a dict of the offer's ``param`` elements.
        """
        offers_data = []
        for count, offer in enumerate(self._iter_elements('offer'), start=1):
            offers_data.append(self._parse_offer_element(offer))
            if chunk_size and count % chunk_size == 0:
                print(f"Обработано {count} товаров...")
        return pd.DataFrame(offers_data)

    def _parse_offer_element(self, offer):
        """Convert a single ``offer`` element into a flat dict.

        Numeric fields (``price``, ``local_delivery_cost``, ``weight``) fall
        back to ``0.0`` when missing or unparsable. ``params`` maps each
        ``param`` name to its text, with the ``unit`` attribute appended when
        both text and unit are present.
        """
        offer_data = {
            'offer_id': offer.get('id'),
            'available': offer.get('available'),
            'url': self._get_text(offer, 'url'),
            'price': self._safe_float(self._get_text(offer, 'price', '0')),
            'currency': self._get_text(offer, 'currencyId'),
            'category_id': self._get_text(offer, 'categoryId'),
            'picture': self._get_text(offer, 'picture'),
            'sales_notes': self._get_text(offer, 'sales_notes'),
            'delivery': self._get_text(offer, 'delivery'),
            'local_delivery_cost': self._safe_float(self._get_text(offer, 'local_delivery_cost', '0')),
            'name': self._get_text(offer, 'name'),
            'vendor': self._get_text(offer, 'vendor'),
            'country_of_origin': self._get_text(offer, 'country_of_origin'),
            'description': self._get_text(offer, 'description'),
            'market_description': self._get_text(offer, 'market_description'),
            'weight': self._safe_float(self._get_text(offer, 'weight', '0'))
        }

        params = {}
        for param in offer.findall('param'):
            param_name = param.get('name')
            param_value = param.text
            param_unit = param.get('unit', '')

            # Only decorate real values; an empty <param unit="..."/> stays
            # None instead of turning into the string "None <unit>".
            if param_unit and param_value is not None:
                params[param_name] = f"{param_value} {param_unit}"
            else:
                params[param_name] = param_value

        offer_data['params'] = params
        return offer_data

    def parse_offers_with_params(self, chunk_size: int = None) -> pd.DataFrame:
        """Return the offers table with ``params`` expanded into columns.

        Every distinct param name becomes a ``param_<name>`` column. Columns
        appear in the order the names are first seen in the document, so the
        layout is the same on every run. Offers lacking a param get ``None``.

        Args:
            chunk_size: forwarded to :meth:`parse_offers`.
        """
        df_offers = self.parse_offers(chunk_size=chunk_size)

        if df_offers.empty:
            return df_offers

        param_dicts = [
            params if isinstance(params, dict) else {}
            for params in df_offers['params']
        ]
        # dict.fromkeys de-duplicates while keeping first-seen order.
        param_names = list(dict.fromkeys(
            name for params in param_dicts for name in params
        ))
        params_dict = {
            f'{self._PARAM_PREFIX}{name}': [params.get(name) for params in param_dicts]
            for name in param_names
        }

        df_params = pd.DataFrame(params_dict, index=df_offers.index)

        df_offers = df_offers.drop('params', axis=1)
        return pd.concat([df_offers, df_params], axis=1)

    def _promo_rows(self, promo) -> List[Dict]:
        """Build one row per ``purchase/product`` of a ``promo`` element.

        Promo-level fields are repeated on every product row. A promo without
        a ``purchase`` child produces no rows.
        """
        purchase = promo.find('purchase')
        if purchase is None:
            return []

        shared = {
            'promo_id': promo.get('id'),
            'promo_type': promo.get('type'),
            'start_date': self._parse_date(self._get_text(promo, 'start-date')),
            'end_date': self._parse_date(self._get_text(promo, 'end-date')),
            'description': self._get_text(promo, 'description'),
            'url': self._get_text(promo, 'url'),
        }

        rows = []
        for product in purchase.findall('product'):
            discount_elem = product.find('discount-price')
            row = dict(shared)
            row['offer_id'] = product.get('offer-id')
            row['discount_price'] = self._safe_float(
                discount_elem.text if discount_elem is not None else '0'
            )
            row['currency'] = discount_elem.get('currency') if discount_elem is not None else None
            rows.append(row)
        return rows

    def parse_promos(self, promo_xml_file: str = None, promo_xml_string: str = None) -> pd.DataFrame:
        """Parse promos from a separate XML document.

        Args:
            promo_xml_file: path to a UTF-8 XML file; when it names an existing
                file its content replaces ``promo_xml_string``.
            promo_xml_string: XML text whose root directly contains ``promo``
                elements.

        Returns:
            One row per promoted product, or an empty DataFrame when no XML
            text is available.
        """
        if promo_xml_file:
            import os
            if os.path.isfile(promo_xml_file):
                with open(promo_xml_file, 'r', encoding='utf-8') as f:
                    promo_xml_string = f.read().lstrip('\ufeff').lstrip()

        if not promo_xml_string:
            return pd.DataFrame()

        promo_root = ET.fromstring(promo_xml_string)

        promos_data = []
        for promo in promo_root.findall('promo'):
            promos_data.extend(self._promo_rows(promo))
        return pd.DataFrame(promos_data)

    def get_shop_info(self) -> Dict:
        """Return ``name``, ``company`` and ``url`` of the ``shop`` element and
        the root's ``date`` attribute as ``catalog_date``."""
        root = self._get_root()
        shop = root.find('shop') if root is not None else None

        return {
            'name': self._get_text(shop, 'name'),
            'company': self._get_text(shop, 'company'),
            'url': self._get_text(shop, 'url'),
            'catalog_date': root.get('date') if root is not None else None
        }

    def _get_text(self, element, tag: str, default: str = '') -> str:
        """Return the text of child ``tag`` (or of ``element`` itself when
        ``tag`` is empty); ``default`` when the node or its text is missing."""
        if element is None:
            return default
        child = element.find(tag) if tag else element
        if child is None:
            return default
        return child.text if child.text else default

    def _safe_float(self, value: str) -> float:
        """Convert ``value`` to float, returning ``0.0`` for empty or
        unparsable input."""
        try:
            return float(value) if value else 0.0
        except (TypeError, ValueError):
            return 0.0

    def _parse_date(self, date_str: str):
        """Parse a timestamp string into ``datetime``.

        Tries each layout in ``_DATE_FORMATS``; returns ``None`` for empty,
        non-string or unrecognised input.
        """
        if not date_str or not isinstance(date_str, str):
            return None
        text = date_str.strip()
        for fmt in self._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        return None

    def parse_promos_from_main_xml(self) -> pd.DataFrame:
        """Return one row per promoted product found in the main document.

        ``promo`` elements are located anywhere in the document, regardless of
        whether the parser was built from a file or from inline XML.
        """
        promos_data = []
        for promo in self._iter_elements('promo'):
            promos_data.extend(self._promo_rows(promo))
        return pd.DataFrame(promos_data)

    def get_stats(self, df: pd.DataFrame) -> Dict:
        """Summarise a parsed table.

        Returns:
            ``{'total_rows': 0}`` for an empty frame; otherwise row/column
            counts and deep memory usage in MiB, plus - when ``param_*``
            columns exist - their count and the non-null count of each
            populated one, keyed by the name without the prefix.
        """
        if df.empty:
            return {'total_rows': 0}

        stats = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024
        }

        param_cols = self._param_columns(df)
        if param_cols:
            prefix_len = len(self._PARAM_PREFIX)
            stats['param_columns'] = len(param_cols)
            stats['params_with_data'] = {}
            for col in param_cols:
                non_null = int(df[col].notna().sum())
                if non_null > 0:
                    # Slice the prefix off; str.replace would also eat any
                    # later "param_" inside the parameter's own name.
                    stats['params_with_data'][col[prefix_len:]] = non_null

        return stats

    def clean_empty_params(self, df: pd.DataFrame, threshold: float = 0.01) -> pd.DataFrame:
        """Drop ``param_*`` columns whose share of non-null values is below
        ``threshold`` and return the resulting frame.

        An empty frame has no meaningful fill rate, so nothing is dropped.
        """
        cols_to_drop = []
        total_rows = len(df)
        if total_rows:
            for col in self._param_columns(df):
                fill_rate = df[col].notna().sum() / total_rows
                if fill_rate < threshold:
                    cols_to_drop.append(col)

        print(f"Удалено {len(cols_to_drop)} колонок с заполненностью < {threshold*100}%")
        return df.drop(columns=cols_to_drop)

    def get_params_by_category(self, df_offers: pd.DataFrame, df_categories: pd.DataFrame = None) -> pd.DataFrame:
        """Describe which params are populated in each category.

        Args:
            df_offers: expanded offers table (needs ``category_id`` and the
                ``param_*`` columns).
            df_categories: optional categories table used to add
                ``category_name``.

        Returns:
            One row per category with ``offers_count``, ``params_count`` and
            ``params`` (the first five populated param names), sorted by
            ``offers_count`` descending. An empty input gives an empty frame
            with those columns.
        """
        if df_offers.empty:
            return pd.DataFrame(
                columns=['category_id', 'offers_count', 'params_count', 'params']
            )

        param_cols = self._param_columns(df_offers)
        prefix_len = len(self._PARAM_PREFIX)

        category_stats = []
        for cat_id in df_offers['category_id'].unique():
            cat_offers = df_offers[df_offers['category_id'] == cat_id]

            non_empty_params = [
                col[prefix_len:] for col in param_cols
                if cat_offers[col].notna().any()
            ]

            stat = {
                'category_id': cat_id,
                'offers_count': len(cat_offers),
                'params_count': len(non_empty_params),
                'params': ', '.join(non_empty_params[:5])
            }

            if df_categories is not None:
                cat_name = df_categories[df_categories['category_id'] == cat_id]['name'].values
                if len(cat_name) > 0:
                    stat['category_name'] = cat_name[0]

            category_stats.append(stat)

        return pd.DataFrame(category_stats).sort_values('offers_count', ascending=False)

In [None]:
# Build the parser from the export path; for an existing file the constructor only
# loads the text into memory - the XML tree is parsed later, on first use.
parser = MaxidomXMLParser(data_path)

Подготовка файла для парсинга...


In [None]:
# Shop-level metadata: name, company, url and the catalogue date.
shop_info = parser.get_shop_info()
print("Информация о магазине:", shop_info, "", sep="\n")

Информация о магазине:
{'name': 'www.maxidom.ru', 'company': 'www.maxidom.ru', 'url': 'https://www.maxidom.ru', 'catalog_date': '2025-10-29 20:23'}



In [None]:
# Category tree as a flat table (id, parent id, name); preview the first rows.
df_categories = parser.parse_categories()
print(f"Категории: {len(df_categories)} записей", df_categories.head(), "", sep="\n")

Категории: 3231 записей
  category_id parent_id                             name
0       22906      None         Товары для сада и отдыха
1       22920     22906                  Садовая техника
2       22937     22920             Садовые измельчители
3       34677     22920                          Райдеры
4       22943     22920  Садовые пылесосы и воздуходувки



In [None]:
# All offers, parsed without progress output; params stay packed in one dict column.
df_offers = parser.parse_offers()
print(f"Товары: {len(df_offers)} записей", df_offers.head(), "", sep="\n")

Товары: 64490 записей
   offer_id available                                                url  \
0   5448030      true  https://www.maxidom.ru/catalog/bumaga-dlya-ofi...   
1   3799878      true  https://www.maxidom.ru/catalog/sredstva-posle-...   
2   4167573      true  https://www.maxidom.ru/catalog/polipropilen-so...   
3   4191207      true  https://www.maxidom.ru/catalog/semena-tsvetov/...   
4  12213432      true  https://www.maxidom.ru/catalog/semena-tsvetov/...   

    price currency category_id  \
0   379.0      RUB       29663   
1   629.0      RUB       30487   
2  1294.0      RUB       27832   
3    32.0      RUB       24074   
4    28.0      RUB       24074   

                                             picture  \
0  https://www.maxidom.ru/upload/iblock/d01/d018b...   
1  https://www.maxidom.ru/upload/iblock/f5a/f5acb...   
2  https://www.maxidom.ru/upload/iblock/93c/93cb1...   
3  https://www.maxidom.ru/upload/iblock/0f5/0f5e3...   
4  https://www.maxidom.ru/upload/ibl

In [None]:
# Same offers table, this time printing a progress line every 5000 offers.
df_offers = parser.parse_offers(chunk_size=5000)
print(f"Товары: {len(df_offers)} записей", df_offers.head(), "", sep="\n")

Обработано 5000 товаров...
Обработано 10000 товаров...
Обработано 15000 товаров...
Обработано 20000 товаров...
Обработано 25000 товаров...
Обработано 30000 товаров...
Обработано 35000 товаров...
Обработано 40000 товаров...
Обработано 45000 товаров...
Обработано 50000 товаров...
Обработано 55000 товаров...
Обработано 60000 товаров...
Товары: 64490 записей
   offer_id available                                                url  \
0   5448030      true  https://www.maxidom.ru/catalog/bumaga-dlya-ofi...   
1   3799878      true  https://www.maxidom.ru/catalog/sredstva-posle-...   
2   4167573      true  https://www.maxidom.ru/catalog/polipropilen-so...   
3   4191207      true  https://www.maxidom.ru/catalog/semena-tsvetov/...   
4  12213432      true  https://www.maxidom.ru/catalog/semena-tsvetov/...   

    price currency category_id  \
0   379.0      RUB       29663   
1   629.0      RUB       30487   
2  1294.0      RUB       27832   
3    32.0      RUB       24074   
4    28.0      R

In [None]:
# Expand each offer's params dict into individual param_* columns.
# The announcement is printed first so it precedes the parser's progress lines.
print("Разворачивание параметров...")
df_offers_expanded = parser.parse_offers_with_params(chunk_size=5000)
print(
    f"Товары с параметрами: {len(df_offers_expanded)} записей",
    df_offers_expanded.head(),
    "",
    sep="\n",
)

Разворачивание параметров...
Обработано 5000 товаров...
Обработано 10000 товаров...
Обработано 15000 товаров...
Обработано 20000 товаров...
Обработано 25000 товаров...
Обработано 30000 товаров...
Обработано 35000 товаров...
Обработано 40000 товаров...
Обработано 45000 товаров...
Обработано 50000 товаров...
Обработано 55000 товаров...
Обработано 60000 товаров...
Товары с параметрами: 64490 записей
   offer_id available                                                url  \
0   5448030      true  https://www.maxidom.ru/catalog/bumaga-dlya-ofi...   
1   3799878      true  https://www.maxidom.ru/catalog/sredstva-posle-...   
2   4167573      true  https://www.maxidom.ru/catalog/polipropilen-so...   
3   4191207      true  https://www.maxidom.ru/catalog/semena-tsvetov/...   
4  12213432      true  https://www.maxidom.ru/catalog/semena-tsvetov/...   

    price currency category_id  \
0   379.0      RUB       29663   
1   629.0      RUB       30487   
2  1294.0      RUB       27832   
3    32

In [None]:
# Promo table: one row per promoted product found in the main export.
print("Парсинг промо-акций...")
df_promos = parser.parse_promos_from_main_xml()
print(f"Промо-акции: {len(df_promos)} записей", df_promos.head(), "", sep="\n")

Парсинг промо-акций...
Промо-акции: 64437 записей
  promo_id  promo_type start_date   end_date      description  \
0        1  bonus card 2020-02-01 2024-04-15  Скидка по карте   
1        1  bonus card 2020-02-01 2024-04-15  Скидка по карте   
2        1  bonus card 2020-02-01 2024-04-15  Скидка по карте   
3        1  bonus card 2020-02-01 2024-04-15  Скидка по карте   
4        1  bonus card 2020-02-01 2024-04-15  Скидка по карте   

                             url  offer_id  discount_price currency  
0  https://www.maxidom.ru/loyal/  10782143          206.91      RUB  
1  https://www.maxidom.ru/loyal/   9049296          989.01      RUB  
2  https://www.maxidom.ru/loyal/  10984655          281.16      RUB  
3  https://www.maxidom.ru/loyal/  12340266         1475.10      RUB  
4  https://www.maxidom.ru/loyal/  11142449          513.81      RUB  



In [None]:
# Persist the three tables as CSV in the current working directory.
# index=False omits the DataFrame index; utf-8-sig prepends a BOM to each file.
df_categories.to_csv('categories.csv', index=False, encoding='utf-8-sig')
df_promos.to_csv('promos.csv', index=False, encoding='utf-8-sig')
df_offers_expanded.to_csv('offers_expanded.csv', index=False, encoding='utf-8-sig')