In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# Set plot style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("âœ… Libraries imported successfully")

âœ… Libraries imported successfully


In [5]:
# Load clean data from EDA
df = pd.read_csv('C:/Users/nikhi/OneDrive/Desktop/Python/Data Analysis Projects/revenue_optimization/clean_transactions.csv')

# Convert InvoiceDate back to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

print("="*80)
print("FEATURE ENGINEERING - PRICING OPTIMIZATION")
print("="*80)
print(f"\nðŸ“Š Loaded: {len(df):,} transactions")
print(f"ðŸ“… Date Range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
print(f"\nðŸŽ¯ Goal: Build features to predict optimal prices for revenue maximization")

FEATURE ENGINEERING - PRICING OPTIMIZATION

ðŸ“Š Loaded: 530,104 transactions
ðŸ“… Date Range: 2010-12-01 08:26:00 to 2011-12-09 12:50:00

ðŸŽ¯ Goal: Build features to predict optimal prices for revenue maximization


In [7]:
# Create product-level features
print("="*80)
print("STEP 1: PRODUCT-LEVEL FEATURES")
print("="*80)

product_features = df.groupby('StockCode').agg({
    'UnitPrice': ['mean', 'std', 'min', 'max'],
    'Quantity': ['mean', 'sum', 'std'],
    'Revenue': ['sum', 'mean'],
    'InvoiceNo': 'nunique',
    'Description': 'first'
}).reset_index()

# Flatten column names
product_features.columns = ['_'.join(col).strip('_') if col[1] else col[0] 
                             for col in product_features.columns]

# Rename for clarity
product_features.rename(columns={
    'UnitPrice_mean': 'avg_price',
    'UnitPrice_std': 'price_std',
    'UnitPrice_min': 'min_price',
    'UnitPrice_max': 'max_price',
    'Quantity_mean': 'avg_quantity',
    'Quantity_sum': 'total_quantity_sold',
    'Quantity_std': 'quantity_std',
    'Revenue_sum': 'total_revenue',
    'Revenue_mean': 'avg_revenue_per_transaction',
    'InvoiceNo_nunique': 'num_orders',
    'Description_first': 'description'
}, inplace=True)

print(f"\nâœ… Created features for {len(product_features):,} unique products")
print(f"\nSample of product features:")
print(product_features.head(10))

STEP 1: PRODUCT-LEVEL FEATURES

âœ… Created features for 3,922 unique products

Sample of product features:
  StockCode  avg_price  price_std  min_price  max_price  avg_quantity  \
0     10002       1.09       0.37       0.85       1.66         12.11   
1     10080       0.41       0.10       0.39       0.85         13.77   
2     10120       0.21       0.00       0.21       0.21          6.43   
3    10123C       0.65       0.00       0.65       0.65          1.67   
4    10124A       0.42       0.00       0.42       0.42          3.20   
5    10124G       0.42       0.00       0.42       0.42          4.25   
6     10125       0.86       0.27       0.42       1.66         13.79   
7     10133       0.65       0.25       0.42       1.66         14.48   
8     10135       1.41       0.64       0.25       2.51         12.46   
9     11001       1.89       0.85       0.83       3.36         14.04   

   total_quantity_sold  quantity_std  total_revenue  \
0                  860         22

In [9]:
# Check the features
print("\n" + "="*80)
print("PRODUCT FEATURES OVERVIEW")
print("="*80)
print(product_features.describe())

print(f"\nðŸ’¡ We now have baseline metrics for each product:")
print(f"   â†’ Average price, price variation (std)")
print(f"   â†’ Average quantity per order")
print(f"   â†’ Total revenue generated")
print(f"   â†’ Number of orders")


PRODUCT FEATURES OVERVIEW
       avg_price  price_std  min_price  max_price  avg_quantity  \
count    3922.00    3764.00    3922.00    3922.00       3922.00   
mean        8.65       3.92       5.75      17.15         29.28   
std       208.11     154.05     176.72     323.62       1293.40   
min         0.00       0.00       0.00       0.00          1.00   
25%         1.28       0.20       0.79       2.46          2.57   
50%         2.42       0.63       1.63       4.13          5.33   
75%         4.43       1.30       3.32       7.62         10.00   
max     11062.06    9419.77   11062.06   13541.33      80995.00   

       total_quantity_sold  quantity_std  total_revenue  \
count              3922.00       3764.00        3922.00   
mean               1424.88         16.61        2719.71   
std                3585.74         81.90        7979.95   
min                   1.00          0.00           0.00   
25%                  54.00          2.77         128.84   
50%            

In [11]:
# Calculate price elasticity for each product
print("="*80)
print("STEP 2: PRICE ELASTICITY FEATURES")
print("="*80)

# For products with price variation, calculate elasticity
def calculate_price_elasticity(product_df):
    """
    Calculate price elasticity of demand
    Elasticity = % change in quantity / % change in price
    """
    if len(product_df) < 5:  # Need minimum data points
        return np.nan
    
    # Use correlation as proxy for elasticity direction
    if product_df['UnitPrice'].std() < 0.01:  # No price variation
        return 0.0
    
    corr = product_df[['UnitPrice', 'Quantity']].corr().iloc[0, 1]
    return corr

# Calculate elasticity for each product
print("Calculating price elasticity for each product...")
elasticity_list = []

for stock_code in df['StockCode'].unique():
    product_df = df[df['StockCode'] == stock_code]
    elasticity = calculate_price_elasticity(product_df)
    elasticity_list.append({
        'StockCode': stock_code,
        'price_elasticity': elasticity
    })

elasticity_df = pd.DataFrame(elasticity_list)

# Merge with product features
product_features = product_features.merge(elasticity_df, on='StockCode', how='left')

print(f"\nâœ… Price elasticity calculated for all products")
print(f"\nElasticity Distribution:")
print(product_features['price_elasticity'].describe())

# Categorize products by elasticity
product_features['elasticity_category'] = pd.cut(
    product_features['price_elasticity'],
    bins=[-np.inf, -0.2, -0.1, 0.1, np.inf],
    labels=['Highly Elastic', 'Elastic', 'Inelastic', 'Premium/Other']
)

print(f"\nProducts by Elasticity Category:")
print(product_features['elasticity_category'].value_counts())

STEP 2: PRICE ELASTICITY FEATURES
Calculating price elasticity for each product...

âœ… Price elasticity calculated for all products

Elasticity Distribution:
count   3450.00
mean      -0.26
std        0.25
min       -1.00
25%       -0.37
50%       -0.25
75%       -0.15
max        1.00
Name: price_elasticity, dtype: float64

Products by Elasticity Category:
elasticity_category
Highly Elastic    2206
Elastic            613
Inelastic          467
Premium/Other      164
Name: count, dtype: int64


In [13]:
# Show examples of different elasticity levels
print("\n" + "="*80)
print("ELASTICITY EXAMPLES")
print("="*80)

print("\nðŸ”´ HIGHLY ELASTIC (price sensitive - lower price = much higher demand):")
highly_elastic = product_features[product_features['price_elasticity'] < -0.2].nlargest(5, 'num_orders')
print(highly_elastic[['StockCode', 'description', 'avg_price', 'num_orders', 'price_elasticity']])

print("\nðŸŸ¢ INELASTIC (price insensitive - demand stays stable):")
inelastic = product_features[
    (product_features['price_elasticity'] >= -0.1) & 
    (product_features['price_elasticity'] <= 0.1)
].nlargest(5, 'num_orders')
print(inelastic[['StockCode', 'description', 'avg_price', 'num_orders', 'price_elasticity']])

print("\nðŸ’¡ INSIGHT:")
print("   Highly elastic â†’ Can increase volume with lower prices")
print("   Inelastic â†’ Can increase prices without losing much volume")


ELASTICITY EXAMPLES

ðŸ”´ HIGHLY ELASTIC (price sensitive - lower price = much higher demand):
     StockCode                        description  avg_price  num_orders  \
3387    85099B            JUMBO BAG RED RETROSPOT       2.49        2089   
1279     22386            JUMBO BAG PINK POLKADOT       2.60        1219   
908      21931             JUMBO STORAGE BAG SUKI       2.74        1184   
1298     22411  JUMBO SHOPPER VINTAGE RED PAISLEY       2.68        1175   
1824     22960           JAM MAKING SET WITH JARS       5.07        1132   

      price_elasticity  
3387             -0.23  
1279             -0.24  
908              -0.25  
1298             -0.22  
1824             -0.24  

ðŸŸ¢ INELASTIC (price insensitive - demand stays stable):
     StockCode                         description  avg_price  num_orders  \
3407    85123A  WHITE HANGING HEART T-LIGHT HOLDER       3.12        2198   
2670     47566                       PARTY BUNTING       5.79        1685   
1109   

In [15]:
# Calculate revenue optimization potential
print("="*80)
print("STEP 3: REVENUE OPTIMIZATION POTENTIAL")
print("="*80)

# Revenue per unit sold (profitability proxy)
product_features['revenue_per_unit'] = product_features['total_revenue'] / product_features['total_quantity_sold']

# Price premium vs category average (we'll approximate categories by price range)
product_features['price_range'] = pd.cut(
    product_features['avg_price'],
    bins=[0, 2, 5, 10, 20, np.inf],
    labels=['Budget', 'Mid', 'Premium', 'Luxury', 'Ultra-Luxury']
)

# Calculate price premium vs range average
price_range_avg = product_features.groupby('price_range')['avg_price'].transform('mean')
product_features['price_premium_vs_range'] = (product_features['avg_price'] - price_range_avg) / price_range_avg

# Optimization Score = f(elasticity, volume, revenue)
# High volume + inelastic = raise price
# High volume + elastic = lower price for volume
product_features['volume_score'] = (
    (product_features['num_orders'] - product_features['num_orders'].min()) / 
    (product_features['num_orders'].max() - product_features['num_orders'].min())
)

# Optimization potential (simplified)
# Inelastic high-volume = raise price opportunity
# Elastic high-volume = discount opportunity
product_features['price_increase_potential'] = np.where(
    product_features['price_elasticity'] > -0.15,  # Inelastic
    product_features['volume_score'],  # Higher volume = more $ from price increase
    0
)

product_features['discount_potential'] = np.where(
    product_features['price_elasticity'] < -0.20,  # Elastic
    product_features['volume_score'],  # Higher volume = more $ from volume boost
    0
)

print(f"\nâœ… Revenue optimization scores calculated")
print(f"\nTop 10 products for PRICE INCREASES (inelastic + high volume):")
price_increase_opps = product_features.nlargest(10, 'price_increase_potential')
print(price_increase_opps[['StockCode', 'description', 'avg_price', 'num_orders', 
                            'price_elasticity', 'price_increase_potential']].to_string())

STEP 3: REVENUE OPTIMIZATION POTENTIAL

âœ… Revenue optimization scores calculated

Top 10 products for PRICE INCREASES (inelastic + high volume):
     StockCode                         description  avg_price  num_orders  price_elasticity  price_increase_potential
3407    85123A  WHITE HANGING HEART T-LIGHT HOLDER       3.12        2198             -0.10                      1.00
2670     47566                       PARTY BUNTING       5.79        1685             -0.09                      0.77
3194     84879       ASSORTED COLOUR BIRD ORNAMENT       1.72        1455             -0.11                      0.66
1109     22197                SMALL POPCORN HOLDER       1.04        1392             -0.08                      0.63
1593     22720   SET OF 3 CAKE TINS PANTRY DESIGN        5.83        1385             -0.09                      0.63
439      21212     PACK OF 72 RETROSPOT CAKE CASES       0.76        1320             -0.11                      0.60
177      20727             

In [17]:
print("\n" + "="*80)
print("Top 10 products for DISCOUNTS/PROMOTIONS (elastic + high volume):")
discount_opps = product_features.nlargest(10, 'discount_potential')
print(discount_opps[['StockCode', 'description', 'avg_price', 'num_orders', 
                     'price_elasticity', 'discount_potential']].to_string())

print("\nðŸ’¡ STRATEGIC RECOMMENDATIONS:")
print("   â†’ PRICE INCREASE candidates: Inelastic, high-volume products")
print("   â†’ DISCOUNT candidates: Elastic products that can drive volume")


Top 10 products for DISCOUNTS/PROMOTIONS (elastic + high volume):
     StockCode                          description  avg_price  num_orders  price_elasticity  discount_potential
3387    85099B              JUMBO BAG RED RETROSPOT       2.49        2089             -0.23                0.95
1279     22386              JUMBO BAG PINK POLKADOT       2.60        1219             -0.24                0.55
908      21931               JUMBO STORAGE BAG SUKI       2.74        1184             -0.25                0.54
1298     22411    JUMBO SHOPPER VINTAGE RED PAISLEY       2.68        1175             -0.22                0.53
1824     22960             JAM MAKING SET WITH JARS       5.07        1132             -0.24                0.51
2866     82482    WOODEN PICTURE FRAME WHITE FINISH       3.10        1100             -0.24                0.50
1277     22384              LUNCH BAG PINK POLKADOT       2.03        1090             -0.21                0.50
1090     22178      VICTORIAN

In [19]:
# Extract time features from original data
print("="*80)
print("STEP 4: TEMPORAL/SEASONAL FEATURES")
print("="*80)

# Add time features to main dataframe
df['Year'] = df['InvoiceDate'].dt.year
df['Month'] = df['InvoiceDate'].dt.month
df['Quarter'] = df['InvoiceDate'].dt.quarter
df['DayOfWeek'] = df['InvoiceDate'].dt.dayofweek
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)

# Calculate seasonal revenue patterns by product
seasonal_features = df.groupby(['StockCode', 'Quarter']).agg({
    'Revenue': 'sum',
    'Quantity': 'sum'
}).reset_index()

# Pivot to get Q1, Q2, Q3, Q4 revenue for each product
seasonal_pivot = seasonal_features.pivot(index='StockCode', columns='Quarter', values='Revenue').fillna(0)
seasonal_pivot.columns = [f'Q{col}_revenue' for col in seasonal_pivot.columns]

# Calculate seasonality score (how much revenue varies by quarter)
seasonal_pivot['seasonality_score'] = seasonal_pivot.std(axis=1) / (seasonal_pivot.mean(axis=1) + 1)

# Merge with product features
product_features = product_features.merge(seasonal_pivot[['seasonality_score']], 
                                          on='StockCode', how='left')

print(f"\nâœ… Seasonal features calculated")
print(f"\nSeasonality distribution:")
print(product_features['seasonality_score'].describe())

# Identify highly seasonal products
highly_seasonal = product_features.nlargest(10, 'seasonality_score')
print(f"\nTop 10 Most Seasonal Products:")
print(highly_seasonal[['StockCode', 'description', 'avg_price', 'num_orders', 'seasonality_score']].to_string())

print("\nðŸ’¡ Seasonal products need dynamic pricing strategies!")

STEP 4: TEMPORAL/SEASONAL FEATURES

âœ… Seasonal features calculated

Seasonality distribution:
count   3922.00
mean       0.86
std        0.48
min        0.00
25%        0.47
50%        0.78
75%        1.18
max        2.00
Name: seasonality_score, dtype: float64

Top 10 Most Seasonal Products:
     StockCode                          description  avg_price  num_orders  seasonality_score
2465     23843          PAPER CRAFT , LITTLE BIRDIE       2.08           1               2.00
2406     23581               JUMBO BAG PAISLEY PARK       2.25         338               2.00
3901         B                      Adjust bad debt   11062.06           1               2.00
2407     23582         VINTAGE DOILY JUMBO BAG RED        2.21         259               2.00
2363     23534                STOP FOR TEA WALL ART       7.53         260               2.00
2364     23535              BICYCLE SAFTEY WALL ART       7.42         215               2.00
2219     23382  BOX OF 6 CHRISTMAS CAKE DECORA

In [21]:
# Filter out bad data and add final features
print("="*80)
print("STEP 5: FINAL FEATURE ENGINEERING")
print("="*80)

# Remove obvious non-products
product_features_clean = product_features[
    ~product_features['StockCode'].isin(['DOT', 'POST', 'M', 'B', 'BANK CHARGES', 'C2'])
].copy()

# Remove single-order outliers from seasonality analysis
product_features_clean = product_features_clean[product_features_clean['num_orders'] >= 5]

print(f"âœ… Filtered to {len(product_features_clean):,} products (removed admin codes & single-order items)")

# Add derived features for modeling
# 1. Price volatility (how much does price vary?)
product_features_clean['price_volatility'] = (
    product_features_clean['price_std'] / (product_features_clean['avg_price'] + 0.01)
)

# 2. Order frequency (orders per day)
days_in_dataset = (df['InvoiceDate'].max() - df['InvoiceDate'].min()).days
product_features_clean['orders_per_day'] = product_features_clean['num_orders'] / days_in_dataset

# 3. Average basket size
product_features_clean['avg_basket_size'] = product_features_clean['avg_quantity'] * product_features_clean['avg_price']

# 4. Revenue concentration (what % of total revenue comes from this product?)
total_revenue = product_features_clean['total_revenue'].sum()
product_features_clean['revenue_share'] = product_features_clean['total_revenue'] / total_revenue

# 5. Pricing tier
product_features_clean['pricing_tier'] = pd.cut(
    product_features_clean['avg_price'],
    bins=[0, 1, 2, 5, 10, np.inf],
    labels=['Ultra-Budget', 'Budget', 'Mid', 'Premium', 'Luxury']
)

print("\nâœ… Added derived features:")
print("   â†’ Price volatility")
print("   â†’ Orders per day")
print("   â†’ Average basket size")
print("   â†’ Revenue share")
print("   â†’ Pricing tier")

# Show summary of key features
print("\n" + "="*80)
print("FEATURE SUMMARY")
print("="*80)
print(f"\nTotal features created: {len(product_features_clean.columns)}")
print("\nKey feature columns:")
for col in ['avg_price', 'price_elasticity', 'num_orders', 'total_revenue', 
            'seasonality_score', 'price_volatility', 'revenue_share', 'pricing_tier']:
    print(f"  âœ“ {col}")

STEP 5: FINAL FEATURE ENGINEERING
âœ… Filtered to 3,469 products (removed admin codes & single-order items)

âœ… Added derived features:
   â†’ Price volatility
   â†’ Orders per day
   â†’ Average basket size
   â†’ Revenue share
   â†’ Pricing tier

FEATURE SUMMARY

Total features created: 26

Key feature columns:
  âœ“ avg_price
  âœ“ price_elasticity
  âœ“ num_orders
  âœ“ total_revenue
  âœ“ seasonality_score
  âœ“ price_volatility
  âœ“ revenue_share
  âœ“ pricing_tier


In [23]:
# Quick peek at the final dataset
print("\n" + "="*80)
print("SAMPLE OF FINAL FEATURE SET")
print("="*80)
sample_products = product_features_clean.nlargest(5, 'total_revenue')
print(sample_products[['StockCode', 'description', 'avg_price', 'price_elasticity', 
                       'num_orders', 'total_revenue', 'seasonality_score', 
                       'pricing_tier']].to_string())


SAMPLE OF FINAL FEATURE SET
     StockCode                         description  avg_price  price_elasticity  num_orders  total_revenue  seasonality_score pricing_tier
1310     22423            REGENCY CAKESTAND 3 TIER      13.98             -0.15        1988      174484.74               0.28       Luxury
3407    85123A  WHITE HANGING HEART T-LIGHT HOLDER       3.12             -0.10        2198      104518.80               0.16          Mid
2670     47566                       PARTY BUNTING       5.79             -0.09        1685       99504.33               0.58      Premium
3387    85099B             JUMBO BAG RED RETROSPOT       2.49             -0.23        2089       94340.05               0.18          Mid
2020     23166      MEDIUM CERAMIC TOP STORAGE JAR       1.47             -0.06         247       81700.92               1.85       Budget


In [25]:
# Calculate optimal price estimate
print("="*80)
print("STEP 6: OPTIMAL PRICE TARGET (for ML model)")
print("="*80)

# Simple optimization logic:
# - If inelastic (elasticity > -0.15): suggest price increase
# - If elastic (elasticity < -0.25): suggest price decrease
# - Otherwise: keep current price

def calculate_optimal_price(row):
    """
    Simple heuristic for optimal price
    This will be refined by our ML model
    """
    current_price = row['avg_price']
    elasticity = row['price_elasticity']
    
    if pd.isna(elasticity):
        return current_price
    
    # Inelastic products - can raise price
    if elasticity > -0.15:
        return current_price * 1.10  # +10%
    
    # Highly elastic - lower price to boost volume
    elif elasticity < -0.25:
        return current_price * 0.95  # -5%
    
    # Moderately elastic - small adjustments
    else:
        return current_price * 1.02  # +2%

product_features_clean['optimal_price_estimate'] = product_features_clean.apply(
    calculate_optimal_price, axis=1
)

# Calculate projected revenue impact
product_features_clean['price_change_pct'] = (
    (product_features_clean['optimal_price_estimate'] - product_features_clean['avg_price']) / 
    product_features_clean['avg_price'] * 100
)

# Estimate revenue impact (simplified: assumes some volume change)
# For inelastic: minimal volume loss
# For elastic: volume gain offsets price cut
product_features_clean['projected_revenue_lift'] = np.where(
    product_features_clean['price_elasticity'] > -0.15,
    product_features_clean['total_revenue'] * 0.08,  # 8% lift from price increase
    product_features_clean['total_revenue'] * 0.03   # 3% lift from volume boost
)

print(f"\nâœ… Optimal price estimates calculated")
print(f"\nTop 10 Revenue Lift Opportunities:")
top_opps = product_features_clean.nlargest(10, 'projected_revenue_lift')
print(top_opps[['StockCode', 'description', 'avg_price', 'optimal_price_estimate', 
                'price_change_pct', 'projected_revenue_lift']].to_string())

total_lift = product_features_clean['projected_revenue_lift'].sum()
print(f"\nðŸ’° TOTAL PROJECTED ANNUAL REVENUE LIFT: Â£{total_lift:,.2f}")

STEP 6: OPTIMAL PRICE TARGET (for ML model)

âœ… Optimal price estimates calculated

Top 10 Revenue Lift Opportunities:
     StockCode                         description  avg_price  optimal_price_estimate  price_change_pct  projected_revenue_lift
3407    85123A  WHITE HANGING HEART T-LIGHT HOLDER       3.12                    3.43             10.00                 8361.50
2670     47566                       PARTY BUNTING       5.79                    6.37             10.00                 7960.35
2020     23166      MEDIUM CERAMIC TOP STORAGE JAR       1.47                    1.62             10.00                 6536.07
1942     23084                  RABBIT NIGHT LIGHT       2.38                    2.62             10.00                 5357.20
1310     22423            REGENCY CAKESTAND 3 TIER      13.98                   14.26              2.00                 5234.54
1006     22086     PAPER CHAIN KIT 50'S CHRISTMAS        3.36                    3.69             10.00         

In [27]:
# Feature importance preview - what correlates with revenue?
print("\n" + "="*80)
print("FEATURE CORRELATIONS WITH REVENUE")
print("="*80)

numeric_features = ['avg_price', 'price_elasticity', 'num_orders', 'avg_quantity',
                    'price_volatility', 'seasonality_score', 'orders_per_day', 'revenue_share']

correlations = product_features_clean[numeric_features + ['total_revenue']].corr()['total_revenue'].sort_values(ascending=False)
print(correlations)

print("\nðŸ’¡ These correlations guide our ML model:")
print("   â†’ High positive = feature drives revenue")
print("   â†’ Negative = inverse relationship")


FEATURE CORRELATIONS WITH REVENUE
total_revenue        1.00
revenue_share        1.00
num_orders           0.81
orders_per_day       0.81
avg_quantity         0.17
price_elasticity     0.09
avg_price            0.06
price_volatility     0.04
seasonality_score   -0.13
Name: total_revenue, dtype: float64

ðŸ’¡ These correlations guide our ML model:
   â†’ High positive = feature drives revenue
   â†’ Negative = inverse relationship


In [31]:
# Save the complete feature set
print("="*80)
print("SAVING FEATURE-ENGINEERED DATASET")
print("="*80)

# Save full feature set
product_features_clean.to_csv('C:/Users/nikhi/OneDrive/Desktop/Python/Data Analysis Projects/revenue_optimization/product_features.csv', index=False)
print(f"\nâœ… Saved: C:/Users/nikhi/OneDrive/Desktop/Python/Data Analysis Projects/revenue_optimization/product_features.csv")
print(f"   Shape: {product_features_clean.shape}")
print(f"   Features: {len(product_features_clean.columns)}")

# Also save transaction-level data with features for more detailed modeling
df_with_features = df.merge(
    product_features_clean[['StockCode', 'price_elasticity', 'optimal_price_estimate', 
                            'pricing_tier', 'seasonality_score']],
    on='StockCode',
    how='left'
)

df_with_features.to_csv('C:/Users/nikhi/OneDrive/Desktop/Python/Data Analysis Projects/revenue_optimization/transactions_with_features.csv', index=False)
print(f"\nâœ… Saved: data/processed/transactions_with_features.csv")
print(f"   Shape: {df_with_features.shape}")

print("\n" + "="*80)
print("ðŸŽ‰ FEATURE ENGINEERING COMPLETE!")
print("="*80)
print("""
WHAT WE CREATED:
âœ… 26 features per product
âœ… Price elasticity scores
âœ… Seasonal patterns
âœ… Revenue optimization potential
âœ… Optimal price estimates
âœ… Â£454K projected revenue lift

KEY FEATURES FOR ML MODEL:
â†’ avg_price (current pricing)
â†’ price_elasticity (demand sensitivity)
â†’ num_orders (volume)
â†’ seasonality_score (temporal patterns)
â†’ pricing_tier (market positioning)
â†’ optimal_price_estimate (TARGET VARIABLE)

NEXT STEP: Build XGBoost model to predict optimal prices!
""")

SAVING FEATURE-ENGINEERED DATASET

âœ… Saved: C:/Users/nikhi/OneDrive/Desktop/Python/Data Analysis Projects/revenue_optimization/product_features.csv
   Shape: (3469, 29)
   Features: 29

âœ… Saved: data/processed/transactions_with_features.csv
   Shape: (530104, 18)

ðŸŽ‰ FEATURE ENGINEERING COMPLETE!

WHAT WE CREATED:
âœ… 26 features per product
âœ… Price elasticity scores
âœ… Seasonal patterns
âœ… Revenue optimization potential
âœ… Optimal price estimates
âœ… Â£454K projected revenue lift

KEY FEATURES FOR ML MODEL:
â†’ avg_price (current pricing)
â†’ price_elasticity (demand sensitivity)
â†’ num_orders (volume)
â†’ seasonality_score (temporal patterns)
â†’ pricing_tier (market positioning)
â†’ optimal_price_estimate (TARGET VARIABLE)

NEXT STEP: Build XGBoost model to predict optimal prices!

