In [1]:
import pandas as pd
import seaborn as sns
import plotly
import matplotlib.pyplot as plt
import datetime

In [2]:
df = pd.read_csv("../data/subscriptions.csv")
pay_df = pd.read_csv("../data/payments.csv")

email = 'Customer Email'
name = 'Customer Name'
pay_df['cust_id'] = pay_df['Name'] + '-' + pay_df['Email']
df['cust_id'] = df[name] + '-' + df[email]


In [3]:
sos = pay_df[['cust_id', 'Total Spend']].to_dict(orient = 'tight', index=False)
meow = {i[0]: i[1] for i in sos['data']}
df['spent'] = df['cust_id'].map(meow)
df = df[df['spent'] >= 60]

In [4]:
df.shape

(205, 19)

In [5]:
def is_active(row, month):
    started_before = row['Start Date (UTC)'] <= month.start_time
    ended_after = row['Canceled At (UTC)'] >= month.start_time or \
                  pd.isnull(row['Canceled At (UTC)'])
    
    return started_before and ended_after

In [6]:
# Convert relevant date columns to datetime
df['Start Date (UTC)'] = pd.to_datetime(df['Start Date (UTC)'])
df['Canceled At (UTC)'] = pd.to_datetime(df['Canceled At (UTC)'])

# Extract the month and year
df['start_month'] = df['Start Date (UTC)'].dt.to_period('M')
df['cancel_month'] = df['Canceled At (UTC)'].dt.to_period('M')


In [9]:
df[
    (df['cancel_month'].dt.year == 2024) &
    (df['cancel_month'].dt.month == 6) 
][[name, email, 'Amount']]

Unnamed: 0,Customer Name,Customer Email,Amount
203,Karim Kawash,kkawashk@gmail.com,150.0
213,Rajesh Ravindran,rajeshravindran05@gmail.com,80.0
214,J Chen,christinachen0130@hotmail.com,80.0
224,Alina Leon Kozenko,alileoko97@hotmail.com,150.0
241,Avinoam Gal,mertiti@gmail.com,129.0
246,Mrs Prita V Nair,nairprita21@gmail.com,60.0
253,S Markuta,ostaf05sveta@yahoo.com,99.0
264,Ankur Zadoo,ankurzadu@gmail.com,99.0
265,Ankur Zadoo,ankurzadu@gmail.com,99.0
268,Ying Liu,l9664@hotmail.com,60.0


In [10]:
analysis_range = {
    "start": df['start_month'].min(),
    "end": df['start_month'].max()
}
# analysis_range
all_months = pd.period_range(analysis_range["start"], analysis_range["end"])
#all_months

In [13]:
all_months

PeriodIndex(['2023-09', '2023-10', '2023-11', '2023-12', '2024-01', '2024-02',
             '2024-03', '2024-04', '2024-05', '2024-06', '2024-07', '2024-08',
             '2024-09', '2024-10', '2024-11', '2024-12', '2025-01', '2025-02',
             '2025-03', '2025-04', '2025-05', '2025-06', '2025-07'],
            dtype='period[M]')

In [11]:
start_hist = df.start_month.value_counts().sort_index()
first = pd.Series(index = [pd.to_datetime('2023-09').to_period('M')], data=[0])
cancel_hist = pd.concat([first, df.cancel_month.value_counts()]).sort_index()

In [10]:
actives = []
for month in start_hist.index:
    active_amount = df.apply(is_active, args=[month], axis=1)
    actives.append(active_amount.sum())

In [19]:
 df.cancel_month.value_counts().shape

(19,)

In [17]:
len(actives)

23

In [11]:
counts = pd.DataFrame({"idx": start_hist.index,
                       "start_count": start_hist.values,
                       "cancel_count": cancel_hist.values,
                       "actives":actives})

ValueError: All arrays must be of the same length

In [None]:
counts['churn'] = (counts["cancel_count"] / counts["actives"])

In [None]:
plt.plot(counts["idx"].astype(str),counts["cancel_count"], marker='o', linestyle='dashed',color = 'r', label="cancel")
plt.plot(counts["idx"].astype(str),counts["start_count"], marker='o', linestyle='dashed',color = 'b', label="started")
plt.plot(counts["idx"].astype(str),counts["actives"], marker='o', linestyle='dashed',color = 'g', label="active")
plt.legend()
# plt.xlabel("treatment_start_year")
# plt.ylabel("waiting_duration_weeks")
# plt.title("treatment_start_year X waiting_duration_weeks")
plt.show()

In [None]:
counts.describe()