# 07 - Geographical Analysis

This notebook performs comprehensive geographical analysis to complement the basic geographical insights in the EDA notebook.

## Objectives
- Analyze sales patterns by country
- Calculate country-level metrics (revenue, customers, transactions)
- Identify top-performing markets
- Analyze geographical concentration (Pareto analysis)
- Compare country-specific trends
- Visualize geographical distribution

## Phase 2 Requirements
- ✅ Country-level aggregation
- ✅ Top markets identification
- ✅ Geographical concentration analysis
- ✅ Business insights for stock allocation


In [None]:
# Load required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# Notebook-wide display settings.
# NOTE(review): this silences *all* warnings for the rest of the notebook,
# including pandas deprecation / copy warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

print("=" * 80)
print("GEOGRAPHICAL ANALYSIS")
print("=" * 80)

# Load data
# project_root is resolved two directory levels above the current working
# directory, so the path only resolves when the kernel's cwd is the folder
# this notebook is expected to run from.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
data_path = os.path.join(project_root, 'data', 'raw', 'Online Retail.csv')

raw_df = pd.read_csv(data_path, encoding='latin-1')
# Unparseable dates become NaT and are dropped by the mask below.
raw_df['InvoiceDate'] = pd.to_datetime(raw_df['InvoiceDate'], errors='coerce')

# Build one combined row filter instead of chaining slices, then take an
# explicit copy so the TotalPrice assignment below writes to an independent
# frame rather than to a view of a filtered slice.
valid_rows = (
    ~raw_df['InvoiceNo'].astype(str).str.startswith('C')  # presumably cancellations — verify against data dictionary
    & (raw_df['Quantity'] > 0)
    & (raw_df['UnitPrice'] > 0)
    & raw_df['Description'].notna()
    & raw_df['InvoiceDate'].notna()
)
df = raw_df.loc[valid_rows].copy()
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

print(f"\nDataset loaded: {df.shape[0]:,} transactions")
print(f"Unique countries: {df['Country'].nunique()}")
print(f"Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")


## Step 1: Country-Level Aggregation

Calculate key metrics for each country.


In [None]:
# Country-level aggregation
print("=" * 80)
print("COUNTRY-LEVEL METRICS")
print("=" * 80)

# Named aggregation binds every output column to its source column and
# function, so the column names cannot drift out of sync with the agg spec.
# NOTE(review): TransactionCount / AvgTransactionValue are computed per row of
# df; rows look like invoice lines (InvoiceNo is counted separately with
# nunique), so these are line-item figures, not per-invoice figures — confirm.
country_stats = df.groupby('Country').agg(
    TotalRevenue=('TotalPrice', 'sum'),
    AvgTransactionValue=('TotalPrice', 'mean'),
    TransactionCount=('TotalPrice', 'count'),
    TotalQuantity=('Quantity', 'sum'),
    UniqueInvoices=('InvoiceNo', 'nunique'),
    UniqueCustomers=('CustomerID', 'nunique'),
    FirstPurchase=('InvoiceDate', 'min'),
    LastPurchase=('InvoiceDate', 'max'),
).reset_index()

# Calculate additional metrics
# nunique() skips missing CustomerID values, so a country whose rows all lack
# a CustomerID has UniqueCustomers == 0. Dividing by 0 would yield inf and
# distort later rankings/medians; use NaN ("not computable") instead.
# NOTE(review): TotalRevenue still includes rows without a CustomerID, so the
# per-customer ratios are upper-biased for countries with anonymous sales.
identified_customers = country_stats['UniqueCustomers'].replace(0, np.nan)
country_stats['RevenuePerCustomer'] = country_stats['TotalRevenue'] / identified_customers
country_stats['TransactionsPerCustomer'] = country_stats['TransactionCount'] / identified_customers
country_stats['RevenueShare'] = (country_stats['TotalRevenue'] / country_stats['TotalRevenue'].sum() * 100).round(2)

# Sort by revenue
country_stats = country_stats.sort_values('TotalRevenue', ascending=False).reset_index(drop=True)

print(f"\nTotal Countries: {len(country_stats)}")
print("\nTop 10 Countries by Revenue:")
print(country_stats.head(10)[['Country', 'TotalRevenue', 'UniqueCustomers', 'UniqueInvoices', 'RevenueShare']].to_string(index=False))

# Summary statistics
top_country = country_stats.iloc[0]
print("\nGeographical Summary:")
print(f"  Total Revenue: £{country_stats['TotalRevenue'].sum():,.2f}")
print(f"  Top country: {top_country['Country']} (£{top_country['TotalRevenue']:,.2f}, {top_country['RevenueShare']:.1f}%)")
print(f"  Top 5 countries: {country_stats.head(5)['RevenueShare'].sum():.1f}% of total revenue")
print(f"  Top 10 countries: {country_stats.head(10)['RevenueShare'].sum():.1f}% of total revenue")


## Step 2: Geographical Concentration Analysis

Analyze market concentration using Pareto analysis.


In [None]:
# Pareto analysis
print("=" * 80)
print("PARETO ANALYSIS - MARKET CONCENTRATION")
print("=" * 80)

# Calculate cumulative revenue share
country_stats_sorted = country_stats.sort_values('TotalRevenue', ascending=False).reset_index(drop=True)
n_countries = len(country_stats_sorted)
country_stats_sorted['CumulativeRevenue'] = country_stats_sorted['TotalRevenue'].cumsum()
country_stats_sorted['CumulativeRevenueShare'] = (country_stats_sorted['CumulativeRevenue'] / country_stats_sorted['TotalRevenue'].sum() * 100).round(2)
country_stats_sorted['CumulativeCountryCount'] = range(1, n_countries + 1)
country_stats_sorted['CumulativeCountryShare'] = (country_stats_sorted['CumulativeCountryCount'] / n_countries * 100).round(2)

# Find 80/20 point
# The number of countries needed to *reach* 80% is every country whose
# cumulative share is still below 80%, plus the one that crosses the line.
# (Counting only rows with share <= 80 would stop one country short whenever
# the threshold is crossed between two rows, e.g. 75% -> 85%.)
p80_count = int((country_stats_sorted['CumulativeRevenueShare'] < 80).sum()) + 1
p80_country_share = p80_count / n_countries * 100

print("\nPareto Analysis (80/20 Rule):")
print(f"  Top {p80_count} countries account for 80% of revenue")
print(f"  This represents {p80_country_share:.1f}% of all countries")
# Only claim strong concentration when the data meets the 80/20 criterion.
if p80_country_share <= 20:
    print("  → Strong market concentration detected")
else:
    print("  → Revenue is spread across more than 20% of countries")

# Visualize Pareto chart
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Geographical Analysis - Market Concentration', fontsize=16, y=0.995)

# Top 15 countries by revenue
top15 = country_stats_sorted.head(15)
axes[0, 0].barh(range(len(top15)), top15['TotalRevenue'], alpha=0.7, color='steelblue', edgecolor='black')
axes[0, 0].set_yticks(range(len(top15)))
axes[0, 0].set_yticklabels(top15['Country'], fontsize=9)
axes[0, 0].set_xlabel('Total Revenue (£)', fontsize=12)
axes[0, 0].set_title('Top 15 Countries by Revenue', fontweight='bold')
axes[0, 0].grid(True, alpha=0.3, axis='x')
axes[0, 0].invert_yaxis()  # largest bar at the top

# Revenue share pie chart (top 10 + others)
top10 = country_stats_sorted.head(10)
others_revenue = country_stats_sorted.iloc[10:]['TotalRevenue'].sum()
pie_data = list(top10['TotalRevenue']) + [others_revenue]
pie_labels = list(top10['Country']) + ['Others']
axes[0, 1].pie(pie_data, labels=pie_labels, autopct='%1.1f%%', startangle=90)
axes[0, 1].set_title('Revenue Distribution (Top 10 + Others)', fontweight='bold')

# Cumulative revenue share
axes[1, 0].plot(country_stats_sorted['CumulativeCountryShare'],
                country_stats_sorted['CumulativeRevenueShare'],
                marker='o', linewidth=2, markersize=6, color='coral')
axes[1, 0].axhline(y=80, color='red', linestyle='--', linewidth=2, label='80% Revenue')
axes[1, 0].axvline(x=p80_country_share, color='red', linestyle='--', linewidth=2)
axes[1, 0].set_xlabel('Cumulative Country Share (%)', fontsize=12)
axes[1, 0].set_ylabel('Cumulative Revenue Share (%)', fontsize=12)
axes[1, 0].set_title('Pareto Chart - Market Concentration', fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].legend()

# Revenue per customer by country (top 15)
axes[1, 1].barh(range(len(top15)), top15['RevenuePerCustomer'], alpha=0.7, color='teal', edgecolor='black')
axes[1, 1].set_yticks(range(len(top15)))
axes[1, 1].set_yticklabels(top15['Country'], fontsize=9)
axes[1, 1].set_xlabel('Revenue per Customer (£)', fontsize=12)
axes[1, 1].set_title('Revenue per Customer (Top 15 Countries)', fontweight='bold')
axes[1, 1].grid(True, alpha=0.3, axis='x')
axes[1, 1].invert_yaxis()

plt.tight_layout()
plt.show()

print("\nTop 5 Countries Detailed:")
# The index was reset after sorting, so idx + 1 is the revenue rank.
for idx, row in country_stats_sorted.head(5).iterrows():
    print(f"\n  {idx+1}. {row['Country']}:")
    print(f"     Revenue: £{row['TotalRevenue']:,.2f} ({row['RevenueShare']:.1f}%)")
    print(f"     Customers: {row['UniqueCustomers']:,}")
    print(f"     Revenue/Customer: £{row['RevenuePerCustomer']:,.2f}")
    print(f"     Transactions/Customer: {row['TransactionsPerCustomer']:.2f}")


In [None]:
# Temporal trends by country
print("=" * 80)
print("TEMPORAL TRENDS BY COUNTRY")
print("=" * 80)

# Derive calendar columns from the invoice timestamp (added to df in place).
invoice_dates = df['InvoiceDate']
df['YearMonth'] = invoice_dates.dt.to_period('M')
df['Year'] = invoice_dates.dt.year
df['Month'] = invoice_dates.dt.month

# Monthly revenue by country (top 5)
top5_countries = country_stats_sorted.head(5)['Country'].tolist()
top5_rows = df[df['Country'].isin(top5_countries)]
monthly_by_country = (
    top5_rows
    .groupby(['YearMonth', 'Country'])['TotalPrice']
    .sum()
    .reset_index()
)
# Periods are turned into 'YYYY-MM' strings, then parsed to timestamps for
# use on the x-axis.
monthly_by_country['YearMonth'] = monthly_by_country['YearMonth'].astype(str)
monthly_by_country['Date'] = pd.to_datetime(monthly_by_country['YearMonth'])

# Visualize
fig, axes = plt.subplots(2, 1, figsize=(16, 10))
fig.suptitle('Temporal Trends by Country', fontsize=16, y=0.995)
trend_ax, heatmap_ax = axes

# Line plot for top 5 countries (drawn in revenue-rank order)
for country in top5_countries:
    is_country = monthly_by_country['Country'] == country
    country_data = monthly_by_country[is_country]
    trend_ax.plot(
        country_data['Date'],
        country_data['TotalPrice'],
        marker='o',
        label=country,
        linewidth=2,
        markersize=4,
    )

trend_ax.set_xlabel('Date', fontsize=12)
trend_ax.set_ylabel('Monthly Revenue (£)', fontsize=12)
trend_ax.set_title('Monthly Revenue Trends - Top 5 Countries', fontweight='bold')
trend_ax.legend()
trend_ax.grid(True, alpha=0.3)

# Heatmap: Monthly revenue by country (top 10)
top10_countries = country_stats_sorted.head(10)['Country'].tolist()
top10_rows = df[df['Country'].isin(top10_countries)]
monthly_top10 = top10_rows.groupby(['YearMonth', 'Country'])['TotalPrice'].sum().reset_index()
# One row per country, one column per month; reindex restores revenue-rank
# row order.
pivot_monthly = (
    monthly_top10
    .pivot(index='Country', columns='YearMonth', values='TotalPrice')
    .reindex(top10_countries)
)

sns.heatmap(
    pivot_monthly,
    annot=False,
    fmt='.0f',
    cmap='YlOrRd',
    ax=heatmap_ax,
    cbar_kws={'label': 'Revenue (£)'},
)
heatmap_ax.set_title('Monthly Revenue Heatmap - Top 10 Countries', fontweight='bold')
heatmap_ax.set_xlabel('Month', fontsize=12)
heatmap_ax.set_ylabel('Country', fontsize=12)

plt.tight_layout()
plt.show()

# Summary by country
print("\nCountry Performance Summary:")
for country in top5_countries:
    country_data = df[df['Country'] == country]
    # Monthly totals are computed once and reused for both the average and the growth figure.
    monthly_revenue = country_data.groupby('YearMonth')['TotalPrice'].sum()

    # Growth compares the last month against the first month that has sales.
    # NOTE(review): if either end of the date range is a partial month this
    # figure will be misleading — confirm the data covers whole months.
    if len(monthly_revenue) > 1:
        growth_pct = (monthly_revenue.iloc[-1] / monthly_revenue.iloc[0] - 1) * 100
    else:
        growth_pct = 0

    print(f"\n  {country}:")
    print(f"    Total Revenue: £{country_data['TotalPrice'].sum():,.2f}")
    print(f"    Avg Monthly Revenue: £{monthly_revenue.mean():,.2f}")
    print(f"    Revenue Growth: {growth_pct:.1f}%")


## Step 4: Business Implications & Stock Allocation Recommendations

Translate geographical insights into stock management recommendations.


In [None]:
# Business implications
print("=" * 80)
print("BUSINESS IMPLICATIONS & STOCK ALLOCATION RECOMMENDATIONS")
print("=" * 80)

print("\n1. MARKET PRIORITIZATION:")
print("   Based on revenue concentration:")
print(f"   - Top 5 countries: {country_stats_sorted.head(5)['RevenueShare'].sum():.1f}% of revenue")
print(f"   - Top 10 countries: {country_stats_sorted.head(10)['RevenueShare'].sum():.1f}% of revenue")
print("   - Recommendation: Focus stock allocation on top 10 markets")

print("\n2. STOCK ALLOCATION BY COUNTRY:")
print("   Recommended stock allocation percentages:")
# Each market is allocated this fraction of its revenue share.
# NOTE(review): the remaining 20% is not assigned anywhere in this notebook.
STOCK_SHARE_OF_REVENUE = 0.8
# enumerate() supplies the rank so the numbering does not depend on the index.
for rank, (_, row) in enumerate(country_stats_sorted.head(10).iterrows(), start=1):
    stock_allocation = row['RevenueShare'] * STOCK_SHARE_OF_REVENUE
    print(f"   {rank}. {row['Country']}: {stock_allocation:.1f}% of total stock")
    print(f"      (Revenue: {row['RevenueShare']:.1f}%, Customers: {row['UniqueCustomers']:,})")

print("\n3. HIGH-VALUE MARKETS:")
# Countries without any identified customer can carry inf / NaN ratios; treat
# them as "not computable" so they neither shift the median nor get selected.
revenue_per_customer = country_stats_sorted['RevenuePerCustomer'].replace([np.inf, -np.inf], np.nan)
high_value = country_stats_sorted[revenue_per_customer > revenue_per_customer.median()]
print(f"   - {len(high_value)} countries with above-median revenue per customer")
print("   - Recommendation: Stock premium products in these markets")
print("   - Top 3 high-value markets:")
# Rank by revenue per customer; the frame itself is ordered by total revenue,
# so a plain head(3) would list the biggest markets, not the highest-value ones.
top_high_value = high_value.sort_values('RevenuePerCustomer', ascending=False).head(3)
for _, row in top_high_value.iterrows():
    print(f"     • {row['Country']}: £{row['RevenuePerCustomer']:,.2f} per customer")

print("\n4. GROWTH MARKETS:")
# Identify countries with increasing trends (simplified)
print("   - Analyze monthly trends to identify growing markets")
print("   - Recommendation: Increase stock allocation for growing markets")
print("   - Monitor top 10 countries for trend changes")

print("\n5. MARKET CONCENTRATION RISK:")
print(f"   - {p80_count} countries account for 80% of revenue")
print("   - Risk: High dependence on few markets")
print("   - Recommendation:")
print("     • Diversify stock across more countries")
print("     • Develop strategies for emerging markets")
print("     • Maintain strong stock levels in top 5 markets")

print("\n6. CUSTOMER EFFICIENCY:")
# Same guard as above: drop inf / NaN ratios before ranking.
transactions_per_customer = country_stats_sorted['TransactionsPerCustomer'].replace([np.inf, -np.inf], np.nan)
efficient_markets = country_stats_sorted[transactions_per_customer.notna()].nlargest(5, 'TransactionsPerCustomer')
print("   - Markets with highest transactions per customer:")
for _, row in efficient_markets.iterrows():
    print(f"     • {row['Country']}: {row['TransactionsPerCustomer']:.2f} transactions/customer")
print("   - Recommendation: Stock products that encourage repeat purchases in these markets")

print("\n" + "=" * 80)
print("GEOGRAPHICAL ANALYSIS COMPLETE")
print("=" * 80)

# Export results
output_path = os.path.join(project_root, 'data', 'processed', 'country_statistics.csv')
# Create the target folder if needed so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
country_stats_sorted.to_csv(output_path, index=False)
print(f"\nResults exported to: {output_path}")
