# Stripe Migration Analysis

This notebook performs a comprehensive analysis of migrating customers to new pricing plans.


## 1. Setup and Imports


In [52]:
import pandas as pd
from pathlib import Path



Company

# Set pandas display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)


## 2. Configuration and Constants


In [53]:
# Guardrail settings
GUARDRAIL_ORG_COUNT = False

# Brand Plans (IN_HOUSE customers)
BRAND_PLANS = {
    "starter": {
        "month": {
            "price": 89,
            "credits": 3560,
            "price_per_credit": 89 / 3560,
        },
        "year": {
            "price": 89 * 10,
            "credits": 3560 * 10,
            "price_per_credit": 89 * 10 / 3560 * 10,
        }
    },
    "pro": {
        "month": {
            "price": 199,
            "credits": 14925,
            "price_per_credit": 199 / 14925,
        },
        "year": {
            "price": 199 * 10,
            "credits": 14925 * 10,
            "price_per_credit": 199 * 10 / 14925 * 10,
        }
    },
    "enterprise": {
        "month": {
            "price": 499,
            "credits": 49900,
            "price_per_credit": 499 / 49900,
        },
        "year": {
            "price": 499 * 10,
            "credits": 49900 * 10,
            "price_per_credit": 499 * 10 / 49900 * 10,
        }
    },
}

# Agency Plans
AGENCY_PLANS = {
    "intro": {
        "month": {
            "price": 89,
            "credits": 2250,
            "price_per_credit": 89 / 2250,
        },
        "year": {
            "price": 89 * 10,
            "credits": 2250 * 10,
            "price_per_credit": 89 * 10 / 2250 * 10,
        }
    },
    "growth": {
        "month": {
            "price": 199,
            "credits": 12935,
            "price_per_credit": 199 / 12935,
        },
        "year": {
            "price": 199 * 10,
            "credits": 12935 * 10,
            "price_per_credit": 199 * 10 / 12935 * 10,
        }
    },
    "scale": {
        "month": {
            "price": 499,
            "credits": 37425,
            "price_per_credit": 499 / 37425,
        },
        "year": {
            "price": 499 * 10,
            "credits": 37425 * 10,
            "price_per_credit": 499 * 10 / 37425 * 10,
        }
    },
}

# Model pricing (credits per prompt)
MODEL_ID_PRICE_MAP = {
    "gpt-4o": 1,
    "chatgpt": 1,
    "sonar": 1,
    "google-ai-overview": 1,
    "llama-3-3-70b-instruct": 0.5,
    "gpt-4o-search": 1,
    "claude-sonnet-4": 2,
    "claude-3-5-haiku": 2,
    "gemini-1-5-flash": 1,
    "deepseek-r1": 1,
    "gemini-2-5-flash": 2,
    "google-ai-mode": 1,
    "grok-2-1212": 2,
    "gpt-3-5-turbo": 1,
}


## 3. Define Models

In [54]:
from typing import List, Literal, Optional

from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class CamelCaseModel(BaseModel):
    """Base model for camelCase to snake_case conversion"""

    model_config = ConfigDict(
        alias_generator=to_camel,
        populate_by_name=True,
    )


class Organization(CamelCaseModel):
    id: str
    company_id: str
    model_ids: List[str]
    prompt_limit: int
    prompts_count: int
    chat_interval_in_hours: int


class Company(CamelCaseModel):
    id: str
    name: str
    type: Literal["IN_HOUSE", "AGENCY", "PARTNER"]
    domain: Optional[str] = None
    stripe_customer_id: str
    stripe_subscription_id: str


class Recurring(CamelCaseModel):
    interval: Literal["month", "year"]
    interval_count: int


class SubscriptionPrice(CamelCaseModel):
    id: str
    unit_amount: int
    currency: str
    product: str
    recurring: Recurring


class SubscriptionItem(CamelCaseModel):
    id: str
    discounts: List[str] = None
    price: SubscriptionPrice


class ItemObj(CamelCaseModel):
    data: List[SubscriptionItem]


class Subscription(CamelCaseModel):
    id: str
    currency: Literal["eur", "usd"]
    customer: str

    items: ItemObj
    discounts: List[dict]


class Product(CamelCaseModel):
    id: str
    name: str
    prompt_limit: Optional[int] = None
    type: Optional[Literal["WORKSPACE", "MODELS", "PROMPTS"]] = None


class MigrationOutput(BaseModel):
    company_name: str
    company_domain: Optional[str]
    company_type: Literal["IN_HOUSE", "AGENCY", "PARTNER"]
    orgs_count: int
    orgs_count_hf: int
    current_mrr: int
    current_arr: int
    interval: str
    discount: int
    discounts: str
    prompt_usage: int
    prompt_capacity: int
    credits_usage: int
    credits_capacity: int
    plan_name: str
    mrr: int
    mrr_change: int
    arr_change: int
    extra_credits_purchased: int
    surplus_credits: int


print("✓ Models defined")


✓ Models defined


## 4. Load Data

In [55]:
def load_json(file_path: Path):
    """Load JSON and replace NaN with None."""
    df_raw = pd.read_json(file_path)
    records = df_raw.replace({float("nan"): None}).to_dict("records")
    return records

# Define data paths
base_path = Path.cwd().parent.parent
data_path = base_path / "data"

print(f"Loading data from: {data_path}")

# Load all data files
print("Loading source data...")
companies_raw = load_json(data_path / "processed_companies.json")
orgs_raw = load_json(data_path / "processed_organizations.json")
subs_raw = load_json(data_path / "stripe_subscriptions.json")
coupons_raw = load_json(data_path / "stripe_coupons.json")
products_raw = load_json(data_path / "stripe_products.json")

print(f"✓ Loaded {len(companies_raw)} companies")
print(f"✓ Loaded {len(orgs_raw)} organizations")
print(f"✓ Loaded {len(subs_raw)} subscription items")
print(f"✓ Loaded {len(coupons_raw)} coupons")
print(f"✓ Loaded {len(products_raw)} products")


Loading data from: /Users/matevz/dev/peec-ai/stripe-migration-analysis/data
Loading source data...
✓ Loaded 11769 companies
✓ Loaded 3389 organizations
✓ Loaded 1296 subscription items
✓ Loaded 48 coupons
✓ Loaded 55 products


## 5. Filter and Valdiate Data

In [56]:
companies_filtered = [
    c
    for c in companies_raw
    if c["stripeSubscriptionId"]
    and c["stripeCustomerId"]
    and c["stripeSubscriptionStatus"] == "active"
]

products_filtered = [p for p in products_raw if p["active"]]

product_mapped = [
    {
        "id": p["id"],
        "name": p["name"],
        "prompt_limit": int(p["metadata"].get("promptLimit"))
        if p["metadata"].get("type") == "WORKSPACE"
        else None,
        "type": p["metadata"].get("type")
        if p["metadata"] and p["metadata"].get("type")
        else None,
    }
    for p in products_raw
]

print(f"Filtered out {len(companies_raw) - len(companies_filtered)} companies")
print(f"Filtered out {len(products_raw) - len(products_filtered)} products")

# Validate with Pydantic models
companies = [Company.model_validate(c) for c in companies_filtered]
orgs = [Organization.model_validate(o) for o in orgs_raw]
products = [Product.model_validate(p) for p in product_mapped]
subs = [Subscription.model_validate(s) for s in subs_raw]

# Create coupon lookup
coupons_map = {c["id"]: c for c in coupons_raw}

print("✓ Data validated")


Filtered out 10507 companies
Filtered out 8 products
✓ Data validated


## 6. Create DataFrames

In [57]:
# Convert to DataFrames
companies_df = pd.DataFrame([c.model_dump() for c in companies])
orgs_df = pd.DataFrame([o.model_dump() for o in orgs])
subs_df = pd.DataFrame([s.model_dump() for s in subs])
products_df = pd.DataFrame([p.model_dump() for p in products])

print("DataFrames created:")
print(f"  Companies: {companies_df.shape}")
print(f"  Organizations: {orgs_df.shape}")
print(f"  Subscriptions: {subs_df.shape}")
print(f"  Products: {products_df.shape}")

DataFrames created:
  Companies: (1262, 6)
  Organizations: (3389, 6)
  Subscriptions: (1296, 5)
  Products: (55, 4)


In [39]:
companies_df.head()

Unnamed: 0,id,name,type,domain,stripe_customer_id,stripe_subscription_id
0,co_0066473d-9106-4e5f-b13c-7b756c207675,Flying Cat,AGENCY,flyingcatmarketing.com,cus_T6T5U4MOGZ7ntZ,sub_1SDr10KojVEYZPlXy4X5spim
1,co_00e9c907-6659-4829-9a93-558923266790,Wickey,IN_HOUSE,wickey.de,cus_T40lDnXfwWLTJe,sub_1SD0bmKojVEYZPlXA2382RPf
2,co_01766115-a8ce-40f5-8dc4-b391fcee3db0,Dot Dash,AGENCY,dotdash.io,cus_T2bSdUiWSeuKVX,sub_1SGhkrKojVEYZPlXgxccGh4K
3,co_018bcf89-5317-4104-88df-9f6e77a52276,TrueClicks,IN_HOUSE,trueclicks.com,cus_SytMawkXSQzCwo,sub_1S2vbmKojVEYZPlX79pt2DcB
4,co_01dbefdf-03bd-4788-90c3-8aeb11c359f7,CommsCo,AGENCY,thecommsco.com,cus_SlmRXxPfcvT5vJ,sub_1RqEu2KojVEYZPlX9mMIMKnT


In [40]:
products_df.head()

Unnamed: 0,id,name,prompt_limit,type
0,prod_TDvhozubflPlYk,Custom Enterprise Plan,,
1,prod_T7qiwyJq7wJzf8,Enterprise Project • 400 Prompts,300.0,WORKSPACE
2,prod_T62Qa3TDUIgPG6,Enterprise Project • 750 Prompts,750.0,WORKSPACE
3,prod_T3t3JRZv3GELfL,Pro Project • 75 Prompts,75.0,WORKSPACE
4,prod_T1py7RrHAcrCKu,Enterprise Add-on (300 Prompts) • Model • clau...,,MODELS


## 7. Calculate Credits Usage

In [42]:
def calculate_credits_usage(row: pd.Series) -> int:
    """Calculate required credits based on model usage and run frequency."""
    runs_per_month = 30
    model_prices = [MODEL_ID_PRICE_MAP.get(mid, 0) for mid in row["model_ids"]]
    return int(sum(model_prices) * row["prompts_count"] * runs_per_month)

def calculate_credits_capacity(row: pd.Series) -> int:
    """Calculate required credits based on prompt capacity and run frequency."""
    runs_per_month = 30
    model_prices = [MODEL_ID_PRICE_MAP.get(mid, 0) for mid in row["model_ids"]]
    return int(sum(model_prices) * row["prompt_limit"] * runs_per_month)

# Calculate credits for each organization
orgs_df["credits_usage"] = orgs_df.apply(calculate_credits_usage, axis=1)
orgs_df["credits_capacity"] = orgs_df.apply(calculate_credits_capacity, axis=1)

print("✓ Calculated credits for organizations")
orgs_df[["id", "company_id", "prompts_count", "prompt_limit", "credits_usage", "credits_capacity"]].head()

✓ Calculated credits for organizations


Unnamed: 0,id,company_id,prompts_count,prompt_limit,credits_usage,credits_capacity
0,20da1ff7-bed2-40e8-a5c0-cade5250e7ba,co_1fea122e-be87-47f8-b459-4bb426706d35,23,30,2760,3600
1,25f8bb17-0754-4840-ada6-40e7a9345f27,co_f5267b94-1922-4312-8e0d-b3b2b20864fa,21,25,2205,2625
2,28b0f80a-4e7a-4936-97b8-838150c78f70,co_ae8a374a-3893-4c21-857c-1bea3a469807,34,30,3060,2700
3,3d08ee7f-b5bd-4324-8524-f0f97ead5245,co_1f216996-4e82-46a6-9089-97c619ecf16c,10,120,1200,14400
4,4e838a3a-eb44-4378-bda1-c94de3357279,co_66c7c3ff-4038-4dcc-b225-89f7fc5e212a,33,55,6930,11550


## 8. Subscription Data

In [None]:
def calculate_purchased_capacity(row: pd.Series) -> int:
    return 0

subs_df["purchased_capacity"] = subs_df.apply(calculate_purchased_capacity, axis=1)

subs_df.head()

Unnamed: 0,id,currency,customer,items,discounts
0,sub_1SKI9JKojVEYZPlXDe8jLv3S,eur,cus_TGpoPN9VB2Fxpl,"{'data': [{'id': 'si_TGpovfaoL8c6Iq', 'discoun...","[{'id': 'di_1SKI8iKojVEYZPlXEW9Gbc6w', 'object..."
1,sub_1SKHoZKojVEYZPlXvLnfZEYo,eur,cus_ShBwH7nGMBOhQ4,"{'data': [{'id': 'si_TGpbfRb7cDQJOb', 'discoun...",[]
2,sub_1SKG0wKojVEYZPlXuwILNA1a,eur,cus_TGnaE64V7EYNxi,"{'data': [{'id': 'si_TGncbukr60WWVW', 'discoun...",[]
3,sub_1SJmRxKojVEYZPlXlmaMUOsV,usd,cus_TALZStXY0oOCLb,"{'data': [{'id': 'si_TGJ45JoabTm4f7', 'discoun...",[]
4,sub_1SJG0eKojVEYZPlXTCdNfkuq,eur,cus_TFlWYSRg3KZEAz,"{'data': [{'id': 'si_TFlXbtQ7wgX8gS', 'discoun...","[{'id': 'di_1SJFzlKojVEYZPlX36Jm3t0b', 'object..."


In [None]:
# Join subscription items with products
subs_with_product = subs_df.merge(
    products_df[["id", "prompt_limit"]],
    left_on="product",
    right_on="id",
    how="left",
    suffixes=("", "_product"),
).drop(columns=["id"])

# len of product is not NaN
print(f"Subs with product: {len(subs_with_product[subs_with_product['product'].notna()])}")
# len of product = NaN
print(f"Subs without product: {len(subs_with_product[subs_with_product['product'].isna()])}")

subs_with_product.head(15)

Subs with product: 1264
Subs without product: 279


Unnamed: 0,customer_id,subscription_id,subscription_item_id,plan_id,interval,interval_count,unit_amount,quantity,subscription_discounts,discounts,product,prompt_limit
0,cus_TGnaE64V7EYNxi,sub_1SKG0wKojVEYZPlXuwILNA1a,si_TGncbukr60WWVW,price_1RCIq2KojVEYZPlXwWt6ANiT,month,1,19900,1,[],[],prod_S6Vr6mr8BJrWGc,100.0
1,cus_TALZStXY0oOCLb,sub_1SJmRxKojVEYZPlXlmaMUOsV,si_TGJ45JoabTm4f7,price_1RQ933KojVEYZPlXctNZu1ym,month,1,8900,1,[],[],prod_S6Vq3DJcPoXe3i,25.0
2,cus_TFlWYSRg3KZEAz,sub_1SJG0eKojVEYZPlXTCdNfkuq,si_TFlXbtQ7wgX8gS,price_1RCIq2KojVEYZPlXwWt6ANiT,month,1,19900,1,[1EISV8ne],[],prod_S6Vr6mr8BJrWGc,100.0
3,cus_TFkTQG75S035eD,sub_1SJEz8KojVEYZPlXWabkGBw4,si_TFl7XB6h88cshT,price_1SJFatKojVEYZPlXejAYwrp0,month,1,4975,4,[1EISV8ne],[],,
4,cus_TFkI1wlz77YHf3,sub_1SJEo4KojVEYZPlX5OK3mrI8,si_TGkePmHAdwR5Qg,price_1RT6IWKojVEYZPlXATN2Aqfh,month,1,9950,2,[],[],prod_SAzUvCNnzl6n71,50.0
5,cus_TFjxft9EK4oHF5,sub_1SJEUZKojVEYZPlX7fCcdWs6,si_TFjy9EMIU0lZLz,price_1RQ933KojVEYZPlXctNZu1ym,month,1,8900,1,[1EISV8ne],[],prod_S6Vq3DJcPoXe3i,25.0
6,cus_TFjnL9EnCnie5l,sub_1SJEJbKojVEYZPlXY5pEzgn6,si_TFjn9aIDDncIoG,price_1SIX30KojVEYZPlX7pvrqelA,month,1,57893,1,[],[],prod_SJLngGx2JOogoI,300.0
7,cus_TFjnL9EnCnie5l,sub_1SJEJbKojVEYZPlXY5pEzgn6,si_TFlOXKoQCIdWoX,price_1SJFrWKojVEYZPlXUy6S61dB,month,1,0,8,[],[],,
8,cus_TFjnL9EnCnie5l,sub_1SJEJbKojVEYZPlXY5pEzgn6,si_TFlOwJ0025piK2,price_1SJFrWKojVEYZPlXhjia0TDg,month,1,0,2,[],[],,
9,cus_TFiW0oljAK4r0J,sub_1SJD5CKojVEYZPlXFtGgz17q,si_TGnQ6LFeG4KOt4,price_1SKFphKojVEYZPlXMKZGZUsy,month,1,11000,4,[1EISV8ne],[],,


## 9. Capacity Data

In [81]:
company_stats_fs = (
    orgs_df.groupby("company_id")
    .agg(
        prompts_count=("prompts_count", "sum"),
        prompt_limit=("prompt_limit", "sum"),
        credits_usage=("credits_usage", "sum"),
        credits_capacity=("credits_capacity", "sum"),
    )
    .reset_index()
)

company_stats_fs.head()

Unnamed: 0,company_id,prompts_count,prompt_limit,credits_usage,credits_capacity
0,co_004d676c-c61c-4888-bcf8-6ef606a156ed,6,25,540,2250
1,co_0066473d-9106-4e5f-b13c-7b756c207675,197,505,17730,45450
2,co_00a903f4-7115-4770-bde3-6e8eb9982243,25,25,2250,2250
3,co_00e9c907-6659-4829-9a93-558923266790,100,100,9000,9000
4,co_00f466ea-1612-4546-9292-ffc03d029c2e,11,25,990,2250


In [82]:
company_stats_stripe = (
    subs_with_product.groupby("customer_id")
    .agg(purchased_capacity=("prompt_limit", "sum"))
    .reset_index()
)

company_stats_stripe.sort_values(by="purchased_capacity", ascending=False).head()

Unnamed: 0,customer_id,purchased_capacity
611,cus_SnrUb7yvjBigBm,2000.0
1018,cus_T62FnIQ8UbNr1U,750.0
163,cus_SODf2O6AF4Y79O,700.0
352,cus_SdQHd8aAE4dqBx,500.0
1106,cus_T7wySch0wevxMW,500.0


In [87]:
# sub_1SCzWiKojVEYZPlXfOvGmgxT
subs_with_product[subs_with_product["subscription_id"] == "sub_1SCzWiKojVEYZPlXfOvGmgxT"]


Unnamed: 0,customer_id,subscription_id,subscription_item_id,plan_id,interval,interval_count,unit_amount,quantity,subscription_discounts,discounts,product,prompt_limit
232,cus_T9I40LbIKjNq4N,sub_1SCzWiKojVEYZPlXfOvGmgxT,si_T9I6mTM4tOM0HX,price_1RnMl7KojVEYZPlXpdBSSuU9,month,1,150000,1,[WPWM5feT],[],prod_SRfFIFqKBnTrlh,
233,cus_T9I40LbIKjNq4N,sub_1SCzWiKojVEYZPlXfOvGmgxT,si_T9IPLfuNFMeAWx,price_1SCzolKojVEYZPlXUvyXdI7I,month,1,0,4,[WPWM5feT],[],,
234,cus_T9I40LbIKjNq4N,sub_1SCzWiKojVEYZPlXfOvGmgxT,si_T9IPoaq2vPKCE9,price_1SCzolKojVEYZPlXpCuAHrPn,month,1,0,4,[WPWM5feT],[],,


In [85]:
company_stats = companies_df.merge(
    company_stats_fs,
    left_on="id",
    right_on="company_id",
    how="left",
    suffixes=("", "_fs"),
).drop(columns=["company_id"])

company_stats = company_stats.merge(
    company_stats_stripe,
    left_on="stripe_customer_id",
    right_on="customer_id",
    how="left",
    suffixes=("", "_stripe"),
).drop(columns=["customer_id"])

company_stats.sort_values(by="credits_capacity", ascending=False).head(10)


Unnamed: 0,id,name,type,domain,stripe_customer_id,stripe_subscription_id,prompts_count,prompt_limit,credits_usage,credits_capacity,purchased_capacity
1140,co_f2f15e87-d509-4ddf-b0af-9b6aaedc467a,primelis,AGENCY,primelis.com,cus_Sjq2VQ1HsswE74,sub_1RoMcHKojVEYZPlXSYGYtZ9O,2848.0,9824.0,320460.0,1121130.0,0.0
1007,co_d76ea89d-7e41-4dcb-853a-ce6e13f6e6fe,Seer Interactive,AGENCY,seerinteractive.com,cus_SMja2jl2tfEgOu,sub_1RS099KojVEYZPlXFfgXNbyb,2603.0,5050.0,249270.0,484500.0,0.0
867,co_b80df044-79f6-42c3-9874-5f67f18dd6a1,Globant,AGENCY,globant.com,cus_SOaE0sWneP9UM1,sub_1RTn4gKojVEYZPlXMcSOIxws,3830.0,4200.0,344700.0,378000.0,0.0
101,co_14b5e3c5-484d-47c0-ad14-d8da057ee787,Mito,AGENCY,mito.hu,cus_SjsMER7unqgZkj,sub_1RqwojKojVEYZPlXwFW5rYZx,1586.0,2525.0,155790.0,245250.0,0.0
426,co_5acd0425-e44f-4c42-8e32-7bc91d7460ff,Growth Plays,AGENCY,growthplays.com,cus_SBWWynndgSlFcq,sub_1RH9TAKojVEYZPlXEclydvvj,1199.0,1575.0,182670.0,237000.0,425.0
82,co_0f649f88-35b0-401c-8fd2-fe9397f1c2bb,Advice Interactive,AGENCY,adviceinteractive.com,cus_SnrUb7yvjBigBm,sub_1RsG1tKojVEYZPlX4GOaya49,1981.0,2050.0,178290.0,184500.0,2000.0
903,co_c289f71e-7309-4007-b8da-a5ff61d405bc,Nerdoptimize,AGENCY,nerdoptimize.com,cus_T9I40LbIKjNq4N,sub_1SCzWiKojVEYZPlXfOvGmgxT,1660.0,2000.0,149400.0,180000.0,0.0
975,co_d29d5af1-a858-4175-9b2a-34dce2590746,TIpi Group,AGENCY,tipigroup.com,cus_S2l0Z95ldGbZrp,sub_1R8fUlKojVEYZPlXWxkgAQLw,1573.0,1630.0,141570.0,146700.0,0.0
63,co_0c8ec0dd-1345-4b98-aae4-aa3216b77692,Create Group,AGENCY,creategroup.me,cus_STNbyxiwmFTzbH,sub_1SFE5DKojVEYZPlX6TSBuNAJ,25.0,1600.0,2250.0,144000.0,
260,co_3b72236b-8e31-40a4-a1e8-0e764d0a66a7,Peak Ace,AGENCY,peakace.agency,cus_RkVsLVkfHVfbgw,sub_1Qr0qTKojVEYZPlXHvzPi4Za,1020.0,1115.0,125670.0,134250.0,0.0
