# Stripe Migration Analysis

This notebook performs a comprehensive analysis of migrating customers to new pricing plans.


## 1. Setup and Imports


In [1]:
import pandas as pd
from pathlib import Path
from typing import List, Optional, Literal
from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel

# Set pandas display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)


## 2. Configuration and Constants


In [2]:
# Guardrail settings
GUARDRAIL_ORG_COUNT = False

# Brand Plans (IN_HOUSE customers)
BRAND_PLANS = {
    "starter": {
        "price": 89,
        "credits": 3560,
        "price_per_credit": 89 / 3560,
        "min_amount": 1,
        "max_org_count": 1,
    },
    "pro": {
        "price": 199,
        "credits": 14925,
        "price_per_credit": 199 / 14925,
        "min_amount": 1,
        "max_org_count": 3,
    },
    "enterprise": {
        "price": 499,
        "credits": 49900,
        "price_per_credit": 499 / 49900,
        "min_amount": 1,
        "max_org_count": 5,
    },
}

# Agency Plans
AGENCY_PLANS = {
    "intro": {
        "price": 89,
        "credits": 2250,
        "price_per_credit": 89 / 2250,
        "min_amount": 1,
        "max_org_count": 10,
    },
    "growth": {
        "price": 199,
        "credits": 12935,
        "price_per_credit": 199 / 12935,
        "min_amount": 1,
        "max_org_count": 30,
    },
    "scale": {
        "price": 499,
        "credits": 37425,
        "price_per_credit": 499 / 37425,
        "min_amount": 1,
        "max_org_count": 50,
    },
}

# Model pricing (credits per prompt)
MODEL_ID_PRICE_MAP = {
    "gpt-4o": 1,
    "chatgpt": 1,
    "sonar": 1,
    "google-ai-overview": 1,
    "llama-3-3-70b-instruct": 0.5,
    "gpt-4o-search": 1,
    "claude-sonnet-4": 2,
    "claude-3-5-haiku": 2,
    "gemini-1-5-flash": 1,
    "deepseek-r1": 1,
    "gemini-2-5-flash": 2,
    "google-ai-mode": 1,
    "grok-2-1212": 2,
    "gpt-3-5-turbo": 1,
}


## 3. Data Models


In [3]:
class CamelCaseModel(BaseModel):
    """Base model for camelCase to snake_case conversion"""
    model_config = ConfigDict(
        alias_generator=to_camel,
        populate_by_name=True,
    )

class Organization(CamelCaseModel):
    id: str
    company_id: str
    model_ids: List[str]
    prompt_limit: int
    prompts_count: int
    chat_interval_in_hours: int

class Company(CamelCaseModel):
    id: str
    name: str
    type: Literal["IN_HOUSE", "AGENCY", "PARTNER"]
    domain: Optional[str] = None
    stripe_customer_id: str
    stripe_subscription_id: str

class SubscriptionItem(CamelCaseModel):
    customer_id: str
    plan_id: str
    mrr_cents: float
    quantity: int
    interval: str
    interval_count: int
    subscription_discounts: List[str]
    discounts: List[str]

class MigrationOutput(BaseModel):
    company_name: str
    company_domain: Optional[str]
    company_type: Literal["IN_HOUSE", "AGENCY", "PARTNER"]
    orgs_count: int
    orgs_count_hf: int
    current_mrr: int
    current_arr: int
    interval: str
    discount: int
    discounts: str
    prompt_usage: int
    prompt_capacity: int
    credits_usage: int
    credits_capacity: int
    plan_name: str
    mrr: int
    mrr_change: int
    arr_change: int
    extra_credits_purchased: int
    surplus_credits: int

print("✓ Models defined")


✓ Models defined


## 4. Calculation Functions


In [4]:
def calculate_credits_usage(row: pd.Series) -> int:
    """Calculate required credits based on model usage and run frequency."""
    runs_per_month = 30
    model_prices = [MODEL_ID_PRICE_MAP.get(mid, 0) for mid in row["model_ids"]]
    return int(sum(model_prices) * row["prompts_count"] * runs_per_month)

def calculate_credits_capacity(row: pd.Series) -> int:
    """Calculate required credits based on prompt capacity and run frequency."""
    runs_per_month = 30
    model_prices = [MODEL_ID_PRICE_MAP.get(mid, 0) for mid in row["model_ids"]]
    return int(sum(model_prices) * row["prompt_limit"] * runs_per_month)

def calculate_coupon_multiplier(coupon_ids: list, coupons_map: dict) -> tuple[float, int, int]:
    """Calculate discount multiplier for coupon IDs (forever or 12+ months)."""
    if not coupon_ids:
        return 1.0, 0, 0
    
    total_percent_off = 0.0
    total_amount_off = 0.0
    long_term_discount_count = 0
    total_discount_count = 0
    
    for coupon_id in coupon_ids:
        coupon = coupons_map.get(coupon_id)
        if not coupon:
            continue
        
        total_discount_count += 1
        
        # Only account for long-term discounts
        duration = coupon.get('duration')
        duration_in_months = coupon.get('duration_in_months')
        
        should_account = (
            duration == 'forever' or 
            (duration == 'repeating' and duration_in_months and duration_in_months >= 12)
        )
        
        if not should_account:
            continue
        
        long_term_discount_count += 1
        
        if coupon.get('percent_off'):
            total_percent_off += coupon['percent_off']
        
        if coupon.get('amount_off'):
            total_amount_off += coupon['amount_off']
    
    multiplier = 1.0 - (total_percent_off / 100.0)
    return max(0.0, multiplier), long_term_discount_count, total_discount_count

def calculate_scenarios_for_company(company_data: pd.Series) -> pd.Series:
    """Calculate migration scenarios for a single company."""
    company_type = company_data["type"]
    current_mrr = company_data["current_mrr"]
    credits_capacity = company_data["credits_capacity"]
    orgs_count = company_data["orgs_count"]

    all_plans = BRAND_PLANS if company_type == "IN_HOUSE" else AGENCY_PLANS

    # Filter plans based on org count
    plans = {
        name: p for name, p in all_plans.items() 
        if not GUARDRAIL_ORG_COUNT or p["max_org_count"] >= orgs_count
    }

    # If no plan fits, use the largest one
    if not plans:
        largest_plan_name = max(all_plans, key=lambda k: all_plans[k]["max_org_count"])
        plans = {largest_plan_name: all_plans[largest_plan_name]}

    # Calculate least cost scenario
    options = []
    for name, plan in plans.items():
        extra_credits_needed = max(0, credits_capacity - plan["credits"])
        if extra_credits_needed > 0 and extra_credits_needed < plan["min_amount"]:
            extra_credits_needed = plan["min_amount"]

        cost = plan["price"] + extra_credits_needed * plan["price_per_credit"]
        total_credits = plan["credits"] + extra_credits_needed

        options.append({
            "plan_name": f"{name} ({plan['credits']})",
            "cost": cost,
            "extra_credits": extra_credits_needed,
            "surplus_credits": total_credits - credits_capacity,
        })

    best_least_cost = min(options, key=lambda x: x["cost"])

    return pd.Series({
        "plan_name": best_least_cost["plan_name"],
        "mrr": int(best_least_cost["cost"]),
        "mrr_change": int(best_least_cost["cost"] - current_mrr),
        "arr_change": int((best_least_cost["cost"] - current_mrr) * 12),
        "extra_credits_purchased": int(best_least_cost["extra_credits"]),
        "surplus_credits": int(best_least_cost["surplus_credits"]),
    })

print("✓ Calculation functions defined")


✓ Calculation functions defined


## 5. Load Data


In [5]:
# Define data paths
base_path = Path.cwd().parent.parent
data_path = base_path / "data"

print(f"Loading data from: {data_path}")


Loading data from: /Users/matevz/dev/peec-ai/stripe-migration-analysis/data


In [6]:
def load_json(file_path: Path):
    """Load JSON and replace NaN with None."""
    df_raw = pd.read_json(file_path)
    records = df_raw.replace({float("nan"): None}).to_dict("records")
    return records

# Load all data files
print("Loading source data...")
companies_raw = load_json(data_path / "processed_companies.json")
orgs_raw = load_json(data_path / "processed_organizations.json")
subs_raw = load_json(data_path / "stripe_subscription_items.json")
coupons_raw = load_json(data_path / "stripe_coupons.json")
prices_raw = load_json(data_path / "stripe_prices.json")
products_raw = load_json(data_path / "stripe_products.json")

print(f"✓ Loaded {len(companies_raw)} companies")
print(f"✓ Loaded {len(orgs_raw)} organizations")
print(f"✓ Loaded {len(subs_raw)} subscription items")
print(f"✓ Loaded {len(coupons_raw)} coupons")
print(f"✓ Loaded {len(prices_raw)} prices")
print(f"✓ Loaded {len(products_raw)} products")


Loading source data...
✓ Loaded 11606 companies
✓ Loaded 3283 organizations
✓ Loaded 1518 subscription items
✓ Loaded 48 coupons
✓ Loaded 177 prices
✓ Loaded 47 products


## 6. Filter and Validate Data


In [7]:
# Filter for active subscriptions
companies_filtered = [
    c for c in companies_raw
    if c["stripeSubscriptionId"]
    and c["stripeCustomerId"]
    and c["stripeSubscriptionStatus"] == "active"
]

print(f"Filtered to {len(companies_filtered)} companies with active subscriptions")

# Validate with Pydantic models
companies = [Company.model_validate(c) for c in companies_filtered]
orgs = [Organization.model_validate(o) for o in orgs_raw]
subs = [SubscriptionItem.model_validate(s) for s in subs_raw]

# Create coupon lookup
coupons_map = {c["id"]: c for c in coupons_raw}

print("✓ Data validated")


Filtered to 1253 companies with active subscriptions
✓ Data validated


## 7. Create DataFrames


In [8]:
# Convert to DataFrames
companies_df = pd.DataFrame([c.model_dump() for c in companies])
orgs_df = pd.DataFrame([o.model_dump() for o in orgs])
subs_df = pd.DataFrame([s.model_dump() for s in subs])
prices_df = pd.DataFrame(prices_raw)
products_df = pd.DataFrame(products_raw)

print("DataFrames created:")
print(f"  Companies: {companies_df.shape}")
print(f"  Organizations: {orgs_df.shape}")
print(f"  Subscriptions: {subs_df.shape}")
print(f"  Prices: {prices_df.shape}")
print(f"  Products: {products_df.shape}")


DataFrames created:
  Companies: (1253, 6)
  Organizations: (3283, 6)
  Subscriptions: (1518, 8)
  Prices: (177, 19)
  Products: (47, 20)


In [9]:
# Quick exploration
companies_df.head()


Unnamed: 0,id,name,type,domain,stripe_customer_id,stripe_subscription_id
0,co_0066473d-9106-4e5f-b13c-7b756c207675,Flying Cat,AGENCY,flyingcatmarketing.com,cus_T6T5U4MOGZ7ntZ,sub_1SDr10KojVEYZPlXy4X5spim
1,co_00e9c907-6659-4829-9a93-558923266790,Wickey,IN_HOUSE,wickey.de,cus_T40lDnXfwWLTJe,sub_1SD0bmKojVEYZPlXA2382RPf
2,co_0158fa71-9257-4339-83c1-5bbc0aef17b9,FeetFinder,IN_HOUSE,feetfinder.com,cus_Sf2TtGmye6JG1d,sub_1RjiSbKojVEYZPlXyPwm2bQC
3,co_018bcf89-5317-4104-88df-9f6e77a52276,TrueClicks,IN_HOUSE,trueclicks.com,cus_SytMawkXSQzCwo,sub_1S2vbmKojVEYZPlX79pt2DcB
4,co_01dbefdf-03bd-4788-90c3-8aeb11c359f7,CommsCo,AGENCY,thecommsco.com,cus_SlmRXxPfcvT5vJ,sub_1RqEu2KojVEYZPlX9mMIMKnT


## 8. Calculate Organization Credits


In [10]:
# Calculate credits for each organization
orgs_df["credits_usage"] = orgs_df.apply(calculate_credits_usage, axis=1)
orgs_df["credits_capacity"] = orgs_df.apply(calculate_credits_capacity, axis=1)

print("✓ Calculated credits for organizations")
orgs_df[["company_id", "prompts_count", "prompt_limit", "credits_usage", "credits_capacity"]].head()


✓ Calculated credits for organizations


Unnamed: 0,company_id,prompts_count,prompt_limit,credits_usage,credits_capacity
0,co_1fea122e-be87-47f8-b459-4bb426706d35,23,30,2760,3600
1,co_f5267b94-1922-4312-8e0d-b3b2b20864fa,21,25,2205,2625
2,co_ae8a374a-3893-4c21-857c-1bea3a469807,34,30,3060,2700
3,co_1f216996-4e82-46a6-9089-97c619ecf16c,10,120,1200,14400
4,co_66c7c3ff-4038-4dcc-b225-89f7fc5e212a,33,55,6930,11550


## 9. Process Subscription Data


In [11]:
# Join subscription items with prices and products
subs_with_product = subs_df.merge(
    prices_df[["id", "product"]],
    left_on="plan_id",
    right_on="id",
    how="left",
    suffixes=("", "_price"),
)

subs_with_product = subs_with_product.merge(
    products_df[["id", "metadata"]],
    left_on="product",
    right_on="id",
    how="left",
    suffixes=("", "_product"),
)

print("✓ Merged subscription data with products")


✓ Merged subscription data with products


In [12]:
# Extract prompt limits from workspace products
def extract_prompt_limit(row):
    metadata = row.get("metadata", {})
    if not isinstance(metadata, dict):
        return 0
    
    product_type = metadata.get("type", "")
    if product_type != "WORKSPACE":
        return 0
    
    prompt_limit_str = metadata.get("promptLimit", "0")
    try:
        return int(prompt_limit_str)
    except (ValueError, TypeError):
        return 0

subs_with_product["prompt_limit_per_item"] = subs_with_product.apply(extract_prompt_limit, axis=1)
subs_with_product["total_prompt_limit"] = (
    subs_with_product["prompt_limit_per_item"] * subs_with_product["quantity"]
)

# Calculate prompt capacity per customer
customer_prompt_capacity = (
    subs_with_product.groupby("customer_id")
    .agg(prompt_capacity=("total_prompt_limit", "sum"))
    .reset_index()
)

print("✓ Calculated prompt capacity per customer")


✓ Calculated prompt capacity per customer


## 10. Aggregate Company Metrics


In [13]:
# Aggregate credits by company
company_credits = (
    orgs_df.groupby("company_id")
    .agg(
        prompt_usage=("prompts_count", "sum"),
        credits_capacity=("credits_capacity", "sum"),
        credits_usage=("credits_usage", "sum"),
    )
    .reset_index()
)

# Count orgs per company
orgs_count_df = orgs_df.groupby("company_id").size().reset_index(name="orgs_count")

# Count high-frequency orgs (more than once a day)
high_freq_orgs_df = (
    orgs_df[
        (orgs_df["chat_interval_in_hours"] < 24)
        & (orgs_df["chat_interval_in_hours"] > 0)
    ]
    .groupby("company_id")
    .size()
    .reset_index(name="orgs_count_hf")
)

print("✓ Aggregated company metrics")


✓ Aggregated company metrics


## 11. Calculate Current MRR with Discounts


In [14]:
# Calculate base MRR
subs_df["base_mrr_cents"] = subs_df["mrr_cents"] * subs_df["quantity"]

# Apply item-level discounts
def apply_item_discount(row):
    item_discounts = row["discounts"]
    multiplier, long_term_count, total_count = calculate_coupon_multiplier(
        item_discounts, coupons_map
    )
    return pd.Series({
        "mrr_after_item_discounts": row["base_mrr_cents"] * multiplier,
        "item_discount_long_term_count": long_term_count,
        "item_discount_total_count": total_count,
    })

item_discount_results = subs_df.apply(apply_item_discount, axis=1)
subs_df["mrr_after_item_discounts"] = item_discount_results["mrr_after_item_discounts"]
subs_df["item_discount_long_term_count"] = item_discount_results["item_discount_long_term_count"]
subs_df["item_discount_total_count"] = item_discount_results["item_discount_total_count"]

print("✓ Applied item-level discounts")


✓ Applied item-level discounts


In [15]:
# Apply subscription-level discounts
subscription_discounts_df = (
    subs_df.groupby("customer_id")["subscription_discounts"]
    .first()
    .reset_index()
)

def get_subscription_multiplier(row):
    sub_discounts = row["subscription_discounts"]
    multiplier, long_term_count, total_count = calculate_coupon_multiplier(
        sub_discounts, coupons_map
    )
    return pd.Series({
        "sub_discount_multiplier": multiplier,
        "sub_discount_long_term_count": long_term_count,
        "sub_discount_total_count": total_count,
    })

sub_discount_results = subscription_discounts_df.apply(get_subscription_multiplier, axis=1)
subscription_discounts_df["sub_discount_multiplier"] = sub_discount_results["sub_discount_multiplier"]
subscription_discounts_df["sub_discount_long_term_count"] = sub_discount_results["sub_discount_long_term_count"]
subscription_discounts_df["sub_discount_total_count"] = sub_discount_results["sub_discount_total_count"]

# Merge back to subs
subs_df = pd.merge(
    subs_df,
    subscription_discounts_df[[
        "customer_id",
        "sub_discount_multiplier",
        "sub_discount_long_term_count",
        "sub_discount_total_count",
    ]],
    on="customer_id",
    how="left",
)

# Apply subscription discounts on top of item discounts
subs_df["discounted_mrr_cents"] = (
    subs_df["mrr_after_item_discounts"] * subs_df["sub_discount_multiplier"]
)

print("✓ Applied subscription-level discounts")


✓ Applied subscription-level discounts


In [16]:
# Aggregate MRR by customer
customer_mrr = (
    subs_df.groupby("customer_id")
    .agg({
        "base_mrr_cents": "sum",
        "discounted_mrr_cents": "sum",
        "item_discount_long_term_count": "sum",
        "item_discount_total_count": "sum",
        "sub_discount_long_term_count": "first",
        "sub_discount_total_count": "first",
    })
    .reset_index()
)

# Calculate discount counts and percentages
customer_mrr["applied_discounts"] = (
    customer_mrr["item_discount_long_term_count"]
    + customer_mrr["sub_discount_long_term_count"]
).astype(int)

customer_mrr["total_discounts"] = (
    customer_mrr["item_discount_total_count"]
    + customer_mrr["sub_discount_total_count"]
).astype(int)

customer_mrr["discounts_formatted"] = customer_mrr.apply(
    lambda row: f"{row['applied_discounts']} ({row['total_discounts']})", axis=1
)

customer_mrr["current_mrr"] = customer_mrr["discounted_mrr_cents"] / 100
customer_mrr["current_arr"] = customer_mrr["current_mrr"] * 12

customer_mrr["discount_pct"] = (
    ((1 - (customer_mrr["discounted_mrr_cents"] / customer_mrr["base_mrr_cents"])) * 100)
    .fillna(0)
    .round(0)
    .astype(int)
)

print("✓ Calculated customer MRR with discounts")
customer_mrr[["customer_id", "current_mrr", "current_arr", "discount_pct", "discounts_formatted"]].head()


✓ Calculated customer MRR with discounts


Unnamed: 0,customer_id,current_mrr,current_arr,discount_pct,discounts_formatted
0,cus_Rg0jFvpFlajive,267.0,3204.0,0,1 (1)
1,cus_RgT8o2F5OOUaSZ,138.0,1656.0,0,0 (0)
2,cus_RkVsLVkfHVfbgw,1300.0,15600.0,0,0 (0)
3,cus_Rle9u38yh4qQwV,350.0,4200.0,0,0 (0)
4,cus_RlnXKGF0oxvbL9,180.0,2160.0,0,0 (0)


## 12. Format Billing Intervals


In [17]:
# Get main subscription interval (highest MRR item)
main_subscription = subs_df.loc[
    subs_df.groupby("customer_id")["base_mrr_cents"].idxmax()
]

def format_interval(row):
    if row["interval_count"] != 1:
        return f"{row['interval']} ({row['interval_count']})"
    return row["interval"]

main_subscription["interval"] = main_subscription.apply(format_interval, axis=1)
customer_interval = main_subscription[["customer_id", "interval"]]

print("✓ Formatted billing intervals")


✓ Formatted billing intervals


## 13. Merge All Data


In [18]:
# Merge all data into a single DataFrame
merged_df = pd.merge(
    companies_df, company_credits, left_on="id", right_on="company_id", how="inner"
)
merged_df = pd.merge(merged_df, orgs_count_df, on="company_id", how="inner")
merged_df = pd.merge(merged_df, high_freq_orgs_df, on="company_id", how="left")
merged_df = pd.merge(
    merged_df,
    customer_mrr[[
        "customer_id",
        "current_mrr",
        "current_arr",
        "discount_pct",
        "discounts_formatted",
    ]],
    left_on="stripe_customer_id",
    right_on="customer_id",
    how="inner",
)
merged_df = pd.merge(merged_df, customer_interval, on="customer_id", how="inner")
merged_df = pd.merge(merged_df, customer_prompt_capacity, on="customer_id", how="left")

print(f"✓ Merged data: {merged_df.shape}")


✓ Merged data: (1212, 19)


In [19]:
# Fill missing values and cast types
merged_df["credits_capacity"] = merged_df["credits_capacity"].fillna(0).astype(int)
merged_df["credits_usage"] = merged_df["credits_usage"].fillna(0).astype(int)
merged_df["current_mrr"] = merged_df["current_mrr"].fillna(0).astype(int)
merged_df["current_arr"] = merged_df["current_arr"].fillna(0).astype(int)
merged_df["discount"] = merged_df["discount_pct"].fillna(0).astype(int)
merged_df["discounts"] = merged_df["discounts_formatted"].fillna("0 (0)")
merged_df["prompt_usage"] = merged_df["prompt_usage"].fillna(0).astype(int)
merged_df["prompt_capacity"] = merged_df["prompt_capacity"].fillna(0).astype(int)
merged_df["orgs_count"] = merged_df["orgs_count"].fillna(0).astype(int)
merged_df["orgs_count_hf"] = merged_df["orgs_count_hf"].fillna(0).astype(int)

print("✓ Cleaned and formatted data")


✓ Cleaned and formatted data


## 14. Calculate Migration Scenarios


In [20]:
print("Calculating migration scenarios for each company...")
scenarios_df = merged_df.apply(calculate_scenarios_for_company, axis=1)

# Combine with merged data
final_df = pd.concat([merged_df, scenarios_df], axis=1)

print("✓ Migration scenarios calculated")


Calculating migration scenarios for each company...
✓ Migration scenarios calculated


## 15. Prepare Final Output


In [21]:
# Rename columns
final_df = final_df.rename(
    columns={
        "name": "company_name",
        "domain": "company_domain",
        "type": "company_type",
    }
)

# Select final columns
output_columns = list(MigrationOutput.model_fields.keys())
final_df = final_df[output_columns]

print(f"Final output shape: {final_df.shape}")
final_df.head(10)


Final output shape: (1212, 20)


Unnamed: 0,company_name,company_domain,company_type,orgs_count,orgs_count_hf,current_mrr,current_arr,interval,discount,discounts,prompt_usage,prompt_capacity,credits_usage,credits_capacity,plan_name,mrr,mrr_change,arr_change,extra_credits_purchased,surplus_credits
0,Flying Cat,flyingcatmarketing.com,AGENCY,7,0,499,5988,month,0,0 (0),197,300,17730,45450,scale (37425),606,107,1284,8025,0
1,Wickey,wickey.de,IN_HOUSE,1,0,199,2388,month,0,0 (0),100,100,9000,9000,pro (14925),199,0,0,0,5925
2,FeetFinder,feetfinder.com,IN_HOUSE,1,0,199,2388,month,0,0 (0),19,100,1710,9000,pro (14925),199,0,0,0,5925
3,TrueClicks,trueclicks.com,IN_HOUSE,1,0,89,1068,month,0,0 (0),9,25,810,2250,starter (3560),89,0,0,0,1310
4,CommsCo,thecommsco.com,AGENCY,4,0,280,3360,month,0,0 (0),76,100,6840,9000,growth (12935),199,-81,-972,0,3935
5,Gear4music,gear4music.com,IN_HOUSE,1,0,199,2388,month,0,0 (0),100,100,9000,9000,pro (14925),199,0,0,0,5925
6,Betmode,betmode.io,IN_HOUSE,1,0,199,2388,month,0,0 (0),31,100,2790,9000,pro (14925),199,0,0,0,5925
7,RivalMind,rivalmind.com,AGENCY,5,0,199,2388,month,0,0 (1),103,30,9270,9000,growth (12935),199,0,0,0,3935
8,Harper James,harperjames.co.uk,IN_HOUSE,1,0,499,5988,month,0,0 (0),189,300,17010,27000,pro (14925),360,-139,-1668,12075,0
9,addmustard,addmustard.com,AGENCY,1,0,199,2388,month,0,0 (0),24,100,2160,2250,intro (2250),89,-110,-1320,0,0


## 16. Analysis Summary


In [22]:
# Key metrics
print("=" * 60)
print("MIGRATION ANALYSIS SUMMARY")
print("=" * 60)
print(f"Total companies analyzed: {len(final_df)}")
print(f"Current total MRR: ${final_df['current_mrr'].sum():,.2f}")
print(f"Current total ARR: ${final_df['current_arr'].sum():,.2f}")
print()
print(f"New total MRR: ${final_df['mrr'].sum():,.2f}")
print(f"New total ARR: ${(final_df['mrr'].sum() * 12):,.2f}")
print()
print(f"Total MRR change: ${final_df['mrr_change'].sum():,.2f}")
print(f"Total ARR change: ${final_df['arr_change'].sum():,.2f}")
print()
print(f"Companies with increased revenue: {(final_df['mrr_change'] > 0).sum()}")
print(f"Companies with decreased revenue: {(final_df['mrr_change'] < 0).sum()}")
print(f"Companies with no change: {(final_df['mrr_change'] == 0).sum()}")
print("=" * 60)


MIGRATION ANALYSIS SUMMARY
Total companies analyzed: 1212
Current total MRR: $241,251.00
Current total ARR: $2,895,309.00

New total MRR: $241,634.00
New total ARR: $2,899,608.00

Total MRR change: $460.00
Total ARR change: $5,308.00

Companies with increased revenue: 130
Companies with decreased revenue: 253
Companies with no change: 829


In [23]:
# Plan distribution
print("\nPlan Distribution:")
print(final_df['plan_name'].value_counts())



Plan Distribution:
plan_name
starter (3560)        461
intro (2250)          259
pro (14925)           230
growth (12935)        214
scale (37425)          39
enterprise (49900)      9
Name: count, dtype: int64


In [24]:
# Company type breakdown
print("\nARR Change by Company Type:")
type_summary = final_df.groupby('company_type').agg({
    'company_name': 'count',
    'current_arr': 'sum',
    'arr_change': 'sum'
}).rename(columns={'company_name': 'count'})
type_summary['new_arr'] = type_summary['current_arr'] + type_summary['arr_change']
print(type_summary)



ARR Change by Company Type:
              count  current_arr  arr_change  new_arr
company_type                                         
AGENCY          511      1631963       89024  1720987
IN_HOUSE        700      1263346      -85852  1177494
PARTNER           1            0        2136     2136


## 17. Top Movers


In [25]:
# Top 10 biggest ARR increases
print("\nTop 10 Biggest ARR Increases:")
top_increases = final_df.nlargest(10, 'arr_change')[[
    'company_name', 'company_type', 'current_arr', 'arr_change', 'plan_name'
]]
print(top_increases.to_string(index=False))



Top 10 Biggest ARR Increases:
     company_name company_type  current_arr  arr_change          plan_name
         primelis       AGENCY        24000      153940      scale (37425)
 Seer Interactive       AGENCY        23587       53940      scale (37425)
          Globant       AGENCY        16200       44280      scale (37425)
             Mito       AGENCY        16200       23040      scale (37425)
     Nerdoptimize       AGENCY        18000       10800      scale (37425)
            Glide     IN_HOUSE         1656       10260 enterprise (49900)
     Growth Plays       AGENCY        29592        8328      scale (37425)
We Communications       AGENCY         2388        6876      scale (37425)
         Peak Ace       AGENCY        15600        5880      scale (37425)
            Attio     IN_HOUSE         1080        4680        pro (14925)


In [26]:
# Top 10 biggest ARR decreases
print("\nTop 10 Biggest ARR Decreases:")
top_decreases = final_df.nsmallest(10, 'arr_change')[[
    'company_name', 'company_type', 'current_arr', 'arr_change', 'plan_name'
]]
print(top_decreases.to_string(index=False))



Top 10 Biggest ARR Decreases:
    company_name company_type  current_arr  arr_change      plan_name
   Butternut Box     IN_HOUSE        51840      -48960    pro (14925)
             DKC       AGENCY        16740      -13416 growth (12935)
       Propellic       AGENCY        25800      -12120  scale (37425)
           rlvnt       AGENCY        11868      -10800   intro (2250)
      Omniscient       AGENCY        15120       -9000  scale (37425)
Limitless Agency       AGENCY        21114       -8522  scale (37425)
      Suchhelden       AGENCY        18720       -7200  scale (37425)
 Evergreen Media       AGENCY        18000       -7200  scale (37425)
         Youtech       AGENCY        10704       -6411 growth (12935)
            GALE       AGENCY        15048       -6408  scale (37425)


## 18. Save Output


In [None]:
output_path = data_path / "migrate.csv"
final_df.to_csv(output_path, index=False)
print(f"✓ Saved output to: {output_path}")


✓ Saved output to: /Users/matevz/dev/peec-ai/stripe-migration-analysis/data/migrate.csv
✓ Saved JSON output to: /Users/matevz/dev/peec-ai/stripe-migration-analysis/data/migrate.json


## 19. Optional: Data Exploration

Use the cells below to explore specific aspects of the data.


In [28]:
# Filter for companies with significant changes
significant_changes = final_df[abs(final_df['arr_change']) > 1000].sort_values(
    'arr_change', ascending=False
)
print(f"Companies with >$1000 ARR change: {len(significant_changes)}")
significant_changes[[
    'company_name', 'company_type', 'orgs_count', 
    'current_arr', 'arr_change', 'plan_name'
]].head(20)


Companies with >$1000 ARR change: 188


Unnamed: 0,company_name,company_type,orgs_count,current_arr,arr_change,plan_name
1111,primelis,AGENCY,75,24000,153940,scale (37425)
979,Seer Interactive,AGENCY,25,23587,53940,scale (37425)
846,Globant,AGENCY,3,16200,44280,scale (37425)
100,Mito,AGENCY,20,16200,23040,scale (37425)
881,Nerdoptimize,AGENCY,8,18000,10800,scale (37425)
468,Glide,IN_HOUSE,3,1656,10260,enterprise (49900)
416,Growth Plays,AGENCY,17,29592,8328,scale (37425)
777,We Communications,AGENCY,8,2388,6876,scale (37425)
254,Peak Ace,AGENCY,13,15600,5880,scale (37425)
575,Attio,IN_HOUSE,2,1080,4680,pro (14925)


In [29]:
# Look at companies with discounts
with_discounts = final_df[final_df['discount'] > 0].sort_values('discount', ascending=False)
print(f"Companies with discounts: {len(with_discounts)}")
with_discounts[[
    'company_name', 'current_arr', 'discount', 'discounts', 'arr_change'
]].head(20)


Companies with discounts: 15


Unnamed: 0,company_name,current_arr,discount,discounts,arr_change
110,test,0,100,1 (1),1350
113,upvotemarketing.com,0,100,1 (1),2388
213,Handl,0,100,1 (1),2880
248,Jacob McMillen,0,100,1 (1),1068
255,test,0,100,1 (1),1068
259,Smarty Marketing,0,100,1 (1),2136
797,Exposure Ninja,0,100,1 (1),2388
858,directpartnerco,0,100,1 (1),2136
929,Lemlist,0,100,1 (1),4430
994,GPT Insights,0,100,1 (1),2388


In [30]:
# Analyze by organization count
print("\nARR Change by Organization Count:")
final_df['org_bucket'] = pd.cut(
    final_df['orgs_count'], 
    bins=[0, 1, 3, 5, 10, 30, 100], 
    labels=['1', '2-3', '4-5', '6-10', '11-30', '30+']
)
org_analysis = final_df.groupby('org_bucket').agg({
    'company_name': 'count',
    'arr_change': ['sum', 'mean', 'median']
})
print(org_analysis)



ARR Change by Organization Count:
           company_name arr_change                      
                  count        sum          mean  median
org_bucket                                              
1                   940     -73955    -78.675532     0.0
2-3                 130       -425     -3.269231     0.0
4-5                  63     -56731   -900.492063  -679.0
6-10                 48     -42986   -895.541667  -249.0
11-30                28      34381   1227.892857 -1341.0
30+                   3     145024  48341.333333  3204.0


  org_analysis = final_df.groupby('org_bucket').agg({
