In [1]:
import pickle
import pandas as pd
import re

In [4]:
class CustomerDataExtractor:
    """Flatten nested customer -> order -> item records into one row per item.

    The orders file is read with ``pickle`` and is expected to hold an
    iterable of dict-like customer records; the VIP file is a text file
    with one integer customer id per line. ``transform()`` joins the two
    and returns a ``pandas.DataFrame`` with the columns listed in
    ``OUTPUT_COLUMNS``.
    """

    # Maps the category code stored on an item to its display name; any
    # other value falls back to 'Misc' in transform().
    CATEGORY_MAP = {1: 'Electronics', 2: 'Apparel', 3: 'Books', 4: 'Home Goods'}

    # Column order of the frame returned by transform(). Declared up front
    # so that an empty result still carries the full schema (and can be
    # sorted without a KeyError).
    OUTPUT_COLUMNS = [
        'customer_id',
        'customer_name',
        'registration_date',
        'is_vip',
        'order_id',
        'order_date',
        'product_id',
        'product_name',
        'category',
        'unit_price',
        'item_quantity',
        'total_item_price',
        'total_order_value_percentage',
    ]

    def __init__(self, orders_file: str, vip_file: str):
        """Store the input paths; no file is opened until ``load_data()``.

        Args:
            orders_file: Path to the pickled customer/order data.
            vip_file: Path to the text file of VIP customer ids.
        """
        self.orders_file = orders_file
        self.vip_file = vip_file

    def load_data(self):
        """Read both input files.

        Returns:
            A ``(customer_list, vip_set)`` tuple: the unpickled object from
            ``orders_file`` and a ``set`` of ints parsed from ``vip_file``.
            Lines of the VIP file that are not purely decimal digits (after
            stripping whitespace) are ignored.
        """
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only point orders_file at data from a trusted source.
        with open(self.orders_file, 'rb') as f:
            customer_list = pickle.load(f)

        vip_set = set()
        with open(self.vip_file, 'r') as f:
            for line in f:
                line = line.strip()
                # isdecimal() only accepts characters int() can parse;
                # isdigit() also accepts e.g. superscript digits, for which
                # int() raises ValueError.
                if line.isdecimal():
                    vip_set.add(int(line))

        return customer_list, vip_set

    def clean_price(self, price_val):
        """Convert a raw price to ``float``, returning 0.0 when unusable.

        Strings have every character other than digits and '.' removed
        before conversion (so currency symbols, thousands separators and
        signs are dropped). Non-string values go through ``float()``
        directly. Anything that cannot be converted, and NaN, yields 0.0.
        """
        if isinstance(price_val, str):
            digits = re.sub(r'[^\d.]', '', price_val)
            try:
                # float('') and malformed text such as '1.2.3' raise
                # ValueError, which maps to the 0.0 fallback.
                return float(digits)
            except ValueError:
                return 0.0

        try:
            price = float(price_val)
        except (TypeError, ValueError):
            return 0.0
        # NaN is the only float that is not equal to itself. Letting it
        # through would turn the order total and every percentage of the
        # order into NaN.
        if price != price:
            return 0.0
        return price

    def clean_quantity(self, qty_val):
        """Convert a raw quantity with ``int()``, returning 0 when that fails.

        ``OverflowError`` is caught alongside ``TypeError``/``ValueError``
        because ``int()`` raises it for infinite floats.
        """
        try:
            return int(qty_val)
        except (TypeError, ValueError, OverflowError):
            return 0

    def transform(self) -> pd.DataFrame:
        """Build the flat per-item DataFrame from the loaded data.

        Customers, orders and items whose id (``id``, ``order_id``,
        ``item_id`` respectively) cannot be converted to ``int`` are
        skipped. The order total used for the percentage column is summed
        over every item of the order, including items that are later
        skipped for a bad ``item_id``.

        Returns:
            A DataFrame with the columns in ``OUTPUT_COLUMNS``, sorted by
            customer id, order id and product id with a fresh 0..n-1 index.
            If no rows were produced the frame is empty but still has all
            columns.
        """
        customer_list, vip_set = self.load_data()
        rows = []

        for customer in customer_list:
            try:
                cust_id = int(customer.get('id'))
            except (TypeError, ValueError, OverflowError):
                continue

            name = customer.get('name', '')
            reg_date = pd.to_datetime(customer.get('registration_date'), errors='coerce')
            is_vip = cust_id in vip_set

            # `or []` also covers a key that is present but set to None.
            for order in customer.get('orders') or []:
                try:
                    order_id = int(order.get('order_id'))
                except (TypeError, ValueError, OverflowError):
                    continue

                order_date = pd.to_datetime(order.get('order_date'), errors='coerce')

                # Clean each item's price/quantity once and reuse the
                # result for both the order total and the output row.
                priced_items = []
                for item in order.get('items') or []:
                    unit_price = self.clean_price(item.get('price'))
                    item_qty = self.clean_quantity(item.get('quantity'))
                    priced_items.append((item, unit_price, item_qty, unit_price * item_qty))

                order_total = sum(total for _, _, _, total in priced_items)

                for item, unit_price, item_qty, total_price in priced_items:
                    try:
                        prod_id = int(item.get('item_id'))
                    except (TypeError, ValueError, OverflowError):
                        continue

                    # Guard against division by zero for empty/zero-value orders.
                    if order_total != 0:
                        pct = (total_price / order_total) * 100
                    else:
                        pct = 0.0

                    category = self.CATEGORY_MAP.get(item.get('category'), 'Misc')

                    rows.append({
                        'customer_id': cust_id,
                        'customer_name': name,
                        'registration_date': reg_date,
                        'is_vip': is_vip,
                        'order_id': order_id,
                        'order_date': order_date,
                        'product_id': prod_id,
                        'product_name': item.get('product_name', ''),
                        'category': category,
                        'unit_price': unit_price,
                        'item_quantity': item_qty,
                        'total_item_price': total_price,
                        'total_order_value_percentage': pct,
                    })

        # Passing columns explicitly keeps the schema when `rows` is empty;
        # without it the sort below raises KeyError on a column-less frame.
        df = pd.DataFrame(rows, columns=self.OUTPUT_COLUMNS)
        df = df.sort_values(['customer_id', 'order_id', 'product_id']).reset_index(drop=True)
        return df

In [5]:
if __name__ == '__main__':
    # Script entry point: read the pickled orders and the VIP id list from
    # the working directory, flatten them, and write the result as CSV
    # (without the DataFrame index).
    orders_path, vip_path = 'customer_orders.pkl', 'vip_customers.txt'
    extractor = CustomerDataExtractor(orders_file=orders_path, vip_file=vip_path)
    final_df = extractor.transform()
    final_df.to_csv('customer_orders_final.csv', index=False)