In [1]:
%pip install faker
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random
from textblob import TextBlob

# Initialize Faker and set seeds for reproducibility
fake = Faker()
np.random.seed(42)
random.seed(42)

# ====================== DATA CONFIGURATION ======================
PRODUCT_CATEGORIES = ['Laptop', 'Smartphone', 'Headphones', 'Tablet', 'Camera']
BRANDS = {
    'Laptop': ['Dell', 'HP', 'Lenovo', 'Apple', 'Asus'],
    'Smartphone': ['Samsung', 'Apple', 'Google', 'OnePlus', 'Xiaomi'],
    'Headphones': ['Sony', 'Bose', 'JBL', 'Apple', 'Sennheiser'],
    'Tablet': ['Apple', 'Samsung', 'Microsoft', 'Amazon', 'Lenovo'],
    'Camera': ['Canon', 'Nikon', 'Sony', 'Fujifilm', 'GoPro']
}

REVIEW_TEMPLATES = {
    "Laptop": {
        "positive": [
            "Absolutely loving my new {brand} {model}! The {feature} works flawlessly, getting {hours}h battery life. Perfect for {use_case}!",
            "Upgraded from my old laptop and the difference is incredible. {spec} handles {task} smoothly. Only wish {minor_issue}.",
            "Best tech purchase this year! The {feature} is revolutionary. {specific_praise}."
        ],
        "negative": [
            "Disappointed with {brand} {model}. {component} started {issue} within {days} days. {consequence}!",
            "Overheats during {task}. Can't even {basic_use} without fan noise. Avoid!",
            "{brand} quality declined. {feature} feels cheap and {specific_issue}."
        ]
    },
    "Smartphone": {
        "positive": [
            "Camera is amazing! {feature} takes stunning {photo_type} photos. Battery lasts {hours}h easily.",
            "Upgrade worth every penny. {specific_praise} makes {common_task} effortless. {casual_remark}",
            "Best {brand} phone yet. {feature} works perfectly. {specific_benefit}."
        ],
        "negative": [
            "Screen developed {issue} after {days}d. {brand} support was {support_exp}. Never again!",
            "Battery dies in {hours}h with light use. {frustration_phrase}",
            "{brand} {model} keeps {recurring_issue}. Should've bought {alternative}."
        ]
    }
}

PRODUCT_SPECS = {
    'Laptop': {
        'models': ['XPS 13', 'Spectre x360', 'ThinkPad X1', 'MacBook Pro', 'ZenBook'],
        'specs': ['i7-1260P', 'Ryzen 7 6800U', '32GB DDR5', '1TB SSD', 'RTX 3050'],
        'features': ['4K touchscreen', 'backlit keyboard', 'fingerprint reader', 'Thunderbolt 4']
    },
    'Smartphone': {
        'models': ['Galaxy S23', 'iPhone 15', 'Pixel 8', 'Nord 3', 'Redmi Note 12'],
        'specs': ['Snapdragon 8 Gen2', 'A16 Bionic', '50MP camera', '120Hz AMOLED'],
        'features': ['5G', 'IP68 rating', 'wireless charging', 'under-display fingerprint']
    }
}

# ====================== DATA GENERATION FUNCTIONS ======================

def generate_products(num_products=200):
    products = []
    for _ in range(num_products):
        category = np.random.choice(PRODUCT_CATEGORIES)
        brand = np.random.choice(BRANDS[category])
        cost = round(np.random.uniform(150, 2000), 2)
        
        products.append({
            'product_id': fake.unique.bothify(text='PROD-#####'),
            'product_name': f"{brand} {np.random.choice(PRODUCT_SPECS.get(category, {}).get('models', ['']))}",
            'category': category,
            'brand': brand,
            'cost_price': cost,
            'base_price': round(cost * np.random.uniform(1.3, 2.2), 2),
            'specs': np.random.choice(PRODUCT_SPECS.get(category, {}).get('specs', ['']))
        })
    return pd.DataFrame(products)

def generate_users(num_users=1000):
    users = []
    for _ in range(num_users):
        users.append({
            'user_id': fake.unique.bothify(text='USER-#####'),
            'name': fake.name(),
            'email': fake.email(),
            'location': fake.country(),
            'signup_date': fake.date_between(start_date='-2y', end_date='today')
        })
    return pd.DataFrame(users)

def generate_review(product_row):
    category = product_row['category']
    brand = product_row['brand']
    
    # Rating distribution
    rating = np.random.choice([5,4,3,2,1], p=[0.35,0.3,0.2,0.1,0.05])
    sentiment = "positive" if rating >=4 else "negative" if rating <=2 else "neutral"
    
    # Get template
    template_group = REVIEW_TEMPLATES.get(category, {})
    templates = template_group.get(sentiment, [])
    
    if not templates:
        return rating, "It's okay. Does what I need."
    
    template = random.choice(templates)
    
    # Fill template
    replacements = {
        'brand': brand,
        'model': product_row['product_name'].split()[-1],
        'feature': np.random.choice(PRODUCT_SPECS.get(category, {}).get('features', [''])),
        'hours': random.randint(4, 18),
        'days': random.randint(3, 60),
        'task': random.choice(['gaming', 'video editing', 'multitasking']),
        'use_case': random.choice(['work', 'studying', 'content creation']),
        'issue': random.choice(['cracking', 'overheating', 'malfunctioning']),
        'consequence': random.choice(['Lost data', 'Missed deadline', 'Frustrating experience']),
        'spec': np.random.choice(PRODUCT_SPECS.get(category, {}).get('specs', [''])),
        'minor_issue': random.choice(['the weight was a bit lighter', 'the screen was a bit brighter']),
        'component': random.choice(['screen', 'battery', 'keyboard']),
        'basic_use': random.choice(['browsing', 'watching videos']),
        'specific_issue': random.choice(['feels flimsy', 'is unresponsive']),
        'specific_praise': random.choice(['the performance is top-notch', 'the design is sleek']),
        'specific_benefit': random.choice(['it saves a lot of time', 'it is very user-friendly']),
        'photo_type': random.choice(['portrait', 'landscape']),
        'support_exp': random.choice(['helpful', 'unhelpful']),
        'frustration_phrase': random.choice(['Very frustrating!', 'Not worth the money.']),
        'recurring_issue': random.choice(['freezing', 'restarting']),
        'alternative': random.choice(['another brand', 'a different model']),
        'casual_remark': random.choice(['Highly recommend!', 'Would buy again.']),
        'common_task': random.choice(['browsing', 'texting', 'calling'])
    }
    
    review = template.format(**replacements)
    
    # Add natural language elements
    if random.random() > 0.5:
        review += random.choice([' BTW, ', ' FYI, ', ' PS: ']) + fake.sentence()
    
    return rating, review

def generate_transactions(products, users, days=90):
    transactions = []
    current_date = datetime.now()
    
    for _ in range(5000):  # Generate 5000 events
        user = users.sample(1).iloc[0]
        product = products.sample(1).iloc[0]
        event_date = current_date - timedelta(days=random.randint(0, days))
        
        event_type = np.random.choice(
            ['browse', 'purchase', 'review'],
            p=[0.6, 0.3, 0.1]
        )
        
        transaction = {
            'event_id': fake.unique.bothify(text='EVENT-#####'),
            'user_id': user['user_id'],
            'product_id': product['product_id'],
            'event_type': event_type,
            'timestamp': event_date.strftime('%Y-%m-%d %H:%M:%S'),
            'session_sec': np.random.randint(30, 1800) if event_type == 'browse' else None,
            'quantity': np.random.randint(1, 3) if event_type == 'purchase' else None,
            'price_paid': round(product['base_price'] * np.random.uniform(0.85, 1.15), 2) if event_type == 'purchase' else None,
            'rating': None,
            'review': None
        }
        
        if event_type == 'review':
            rating, review = generate_review(product)
            transaction.update({
                'rating': rating,
                'review': review
            })
        
        transactions.append(transaction)
    
    return pd.DataFrame(transactions)

def generate_inventory(products):
    inventory = []
    for _, product in products.iterrows():
        inventory.append({
            'product_id': product['product_id'],
            'current_stock': np.random.randint(0, 100),
            'restock_level': np.random.randint(20, 50),
            'lead_time': np.random.choice([3, 7, 14], p=[0.6, 0.3, 0.1]),
            'last_restock': fake.date_between(start_date='-30d', end_date='today')
        })
    return pd.DataFrame(inventory)

def generate_competitor_prices(products):
    competitors = ['Amazon', 'Walmart', 'BestBuy', 'Newegg', 'Target']
    prices = []
    for _, product in products.iterrows():
        for competitor in competitors:
            prices.append({
                'product_id': product['product_id'],
                'competitor': competitor,
                'price': round(product['base_price'] * np.random.uniform(0.85, 1.25), 2),
                'timestamp': fake.date_time_between(start_date='-30d', end_date='now')
            })
    return pd.DataFrame(prices)

# ====================== GENERATE ALL DATASETS ======================
print("Generating datasets...")
products_df = generate_products()
users_df = generate_users()
transactions_df = generate_transactions(products_df, users_df)
inventory_df = generate_inventory(products_df)
competitor_prices_df = generate_competitor_prices(products_df)

# ====================== SAVE TO CSV ======================
datasets = {
    'products': products_df,
    'users': users_df,
    'transactions': transactions_df,
    'inventory': inventory_df,
    'competitor_prices': competitor_prices_df
}

for name, df in datasets.items():
    df.to_csv(f'{name}.csv', index=False)
    print(f"Saved {len(df)} records to {name}.csv")

print("\nSample review data:")
print(transactions_df[transactions_df['review'].notnull()][['product_id', 'rating', 'review']].head(3).to_string(index=False))

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Generating datasets...
Saved 200 records to products.csv
Saved 1000 records to users.csv
Saved 5000 records to transactions.csv
Saved 200 records to inventory.csv
Saved 1000 records to competitor_prices.csv

Sample review data:
product_id  rating                                                                                                                                         review
PROD-60315     4.0                                                                                                                   It's okay. Does what I need.
PROD-75942     5.0                                                                                                                   It's okay. Does what I need.
P