In [458]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [459]:
billingRaw = pd.read_csv('../datasets/billing.csv')
eventsRaw = pd.read_csv('../datasets/events.csv')
sessionsRaw = pd.read_csv('../datasets/sessions.csv')
usersRaw = pd.read_csv('../datasets/users.csv')

In [460]:
billingRaw[:5]

Unnamed: 0,user_id,month,plan_tier,active_seats,mrr,discount_applied,invoices_overdue,support_ticket_count
0,f94d1824-8742-4000-8b6d-39d70958490b,2024-05,free,2,0.0,0,0,0
1,f94d1824-8742-4000-8b6d-39d70958490b,2024-06,free,2,0.0,0,0,0
2,f94d1824-8742-4000-8b6d-39d70958490b,2024-07,free,1,0.0,0,0,0
3,f94d1824-8742-4000-8b6d-39d70958490b,2024-08,free,1,0.0,0,0,0
4,f94d1824-8742-4000-8b6d-39d70958490b,2024-09,free,1,0.0,0,0,0


In [461]:
billingRaw.loc[billingRaw['plan_tier'] == 'free', billingRaw.columns[2]] = 0
billingRaw.loc[billingRaw['plan_tier'] == 'standard', billingRaw.columns[2]] = 1
billingRaw.loc[billingRaw['plan_tier'] == 'premium', billingRaw.columns[2]] = 2
billingRaw['plan_tier'] = pd.to_numeric(billingRaw['plan_tier'], errors='coerce')

In [462]:
billingRaw[:5]

Unnamed: 0,user_id,month,plan_tier,active_seats,mrr,discount_applied,invoices_overdue,support_ticket_count
0,f94d1824-8742-4000-8b6d-39d70958490b,2024-05,0,2,0.0,0,0,0
1,f94d1824-8742-4000-8b6d-39d70958490b,2024-06,0,2,0.0,0,0,0
2,f94d1824-8742-4000-8b6d-39d70958490b,2024-07,0,1,0.0,0,0,0
3,f94d1824-8742-4000-8b6d-39d70958490b,2024-08,0,1,0.0,0,0,0
4,f94d1824-8742-4000-8b6d-39d70958490b,2024-09,0,1,0.0,0,0,0


In [463]:
billingRaw['month'] = pd.to_datetime(billingRaw['month'], format = '%Y-%m')

In [464]:
type(billingRaw['month'])

pandas.core.series.Series

In [465]:
billingRaw

Unnamed: 0,user_id,month,plan_tier,active_seats,mrr,discount_applied,invoices_overdue,support_ticket_count
0,f94d1824-8742-4000-8b6d-39d70958490b,2024-05-01,0,2,0.0,0,0,0
1,f94d1824-8742-4000-8b6d-39d70958490b,2024-06-01,0,2,0.0,0,0,0
2,f94d1824-8742-4000-8b6d-39d70958490b,2024-07-01,0,1,0.0,0,0,0
3,f94d1824-8742-4000-8b6d-39d70958490b,2024-08-01,0,1,0.0,0,0,0
4,f94d1824-8742-4000-8b6d-39d70958490b,2024-09-01,0,1,0.0,0,0,0
...,...,...,...,...,...,...,...,...
1000171,2bebcb3d-bead-492a-97e3-1346173ee638,2025-03-01,0,4,0.0,0,0,0
1000172,2bebcb3d-bead-492a-97e3-1346173ee638,2025-04-01,0,2,0.0,0,0,0
1000173,2bebcb3d-bead-492a-97e3-1346173ee638,2025-05-01,0,2,0.0,0,0,0
1000174,2bebcb3d-bead-492a-97e3-1346173ee638,2025-06-01,0,3,0.0,0,0,0


In [466]:
# pd.plotting.scatter_matrix(billingRaw, alpha=0.8, figsize=(16, 16), diagonal='hist');

In [467]:
upgrade_df = billingRaw.groupby('user_id')['plan_tier'].agg(['first', 'last']).reset_index()
upgrade_df['upgrade'] = (upgrade_df['last'] > upgrade_df['first']).astype(int)
upgrade_df.loc[upgrade_df['last'] < upgrade_df['first'], 'upgrade'] = -1
upgrade_df = upgrade_df.drop(['first', 'last'], axis = 1)

In [468]:
upgrade_df

Unnamed: 0,user_id,upgrade
0,0000093e-7258-43c0-b212-ea7212795ddf,0
1,0003c565-9352-45aa-a448-82b91e57b043,0
2,00044174-f9c1-4412-b9ca-584a21903f92,0
3,0005523a-ffde-4fe8-96d8-c203b1fe2f97,0
4,0006fdbd-c35d-4c0d-a948-e8ee10f279d5,0
...,...,...
99995,fffa9cdd-f19a-4849-ae3b-2302b5b68020,0
99996,fffcb715-b3de-4a0e-813a-df5dd2f833d2,0
99997,fffce54a-cd86-4f93-b529-d92586690bd5,0
99998,fffe77cc-44fd-401f-8fdd-9fa06afc6bc8,0


In [469]:
upgrade_df['upgrade'].value_counts()

upgrade
 0    95170
 1     2848
-1     1982
Name: count, dtype: int64

In [470]:
discount_df = billingRaw.groupby("user_id")["discount_applied"].max().reset_index().drop('user_id', axis = 1)
upgrade_df = pd.concat([upgrade_df, discount_df], axis = 1)

In [471]:
upgrade_df

Unnamed: 0,user_id,upgrade,discount_applied
0,0000093e-7258-43c0-b212-ea7212795ddf,0,0
1,0003c565-9352-45aa-a448-82b91e57b043,0,0
2,00044174-f9c1-4412-b9ca-584a21903f92,0,0
3,0005523a-ffde-4fe8-96d8-c203b1fe2f97,0,0
4,0006fdbd-c35d-4c0d-a948-e8ee10f279d5,0,0
...,...,...,...
99995,fffa9cdd-f19a-4849-ae3b-2302b5b68020,0,0
99996,fffcb715-b3de-4a0e-813a-df5dd2f833d2,0,0
99997,fffce54a-cd86-4f93-b529-d92586690bd5,0,0
99998,fffe77cc-44fd-401f-8fdd-9fa06afc6bc8,0,1


In [472]:
seats_df = billingRaw.groupby('user_id').tail(1)[['user_id', 'active_seats']].drop_duplicates('user_id')
upgrade_df = upgrade_df.merge(seats_df, on='user_id', how='left')

In [473]:
upgrade_df

Unnamed: 0,user_id,upgrade,discount_applied,active_seats
0,0000093e-7258-43c0-b212-ea7212795ddf,0,0,3
1,0003c565-9352-45aa-a448-82b91e57b043,0,0,3
2,00044174-f9c1-4412-b9ca-584a21903f92,0,0,6
3,0005523a-ffde-4fe8-96d8-c203b1fe2f97,0,0,2
4,0006fdbd-c35d-4c0d-a948-e8ee10f279d5,0,0,4
...,...,...,...,...
99995,fffa9cdd-f19a-4849-ae3b-2302b5b68020,0,0,11
99996,fffcb715-b3de-4a0e-813a-df5dd2f833d2,0,0,6
99997,fffce54a-cd86-4f93-b529-d92586690bd5,0,0,21
99998,fffe77cc-44fd-401f-8fdd-9fa06afc6bc8,0,1,46


In [474]:
invoices_df = billingRaw.groupby('user_id').tail(1)[['user_id', 'invoices_overdue']].drop_duplicates('user_id')
upgrade_df = upgrade_df.merge(invoices_df, on='user_id', how='left')

In [475]:
upgrade_df

Unnamed: 0,user_id,upgrade,discount_applied,active_seats,invoices_overdue
0,0000093e-7258-43c0-b212-ea7212795ddf,0,0,3,0
1,0003c565-9352-45aa-a448-82b91e57b043,0,0,3,0
2,00044174-f9c1-4412-b9ca-584a21903f92,0,0,6,0
3,0005523a-ffde-4fe8-96d8-c203b1fe2f97,0,0,2,0
4,0006fdbd-c35d-4c0d-a948-e8ee10f279d5,0,0,4,0
...,...,...,...,...,...
99995,fffa9cdd-f19a-4849-ae3b-2302b5b68020,0,0,11,0
99996,fffcb715-b3de-4a0e-813a-df5dd2f833d2,0,0,6,0
99997,fffce54a-cd86-4f93-b529-d92586690bd5,0,0,21,0
99998,fffe77cc-44fd-401f-8fdd-9fa06afc6bc8,0,1,46,0


In [476]:
support_df = billingRaw.groupby('user_id').tail(1)[['user_id', 'support_ticket_count']].drop_duplicates('user_id')
upgrade_df = upgrade_df.merge(support_df, on='user_id', how='left')

In [477]:
upgrade_df

Unnamed: 0,user_id,upgrade,discount_applied,active_seats,invoices_overdue,support_ticket_count
0,0000093e-7258-43c0-b212-ea7212795ddf,0,0,3,0,0
1,0003c565-9352-45aa-a448-82b91e57b043,0,0,3,0,1
2,00044174-f9c1-4412-b9ca-584a21903f92,0,0,6,0,0
3,0005523a-ffde-4fe8-96d8-c203b1fe2f97,0,0,2,0,0
4,0006fdbd-c35d-4c0d-a948-e8ee10f279d5,0,0,4,0,0
...,...,...,...,...,...,...
99995,fffa9cdd-f19a-4849-ae3b-2302b5b68020,0,0,11,0,0
99996,fffcb715-b3de-4a0e-813a-df5dd2f833d2,0,0,6,0,0
99997,fffce54a-cd86-4f93-b529-d92586690bd5,0,0,21,0,0
99998,fffe77cc-44fd-401f-8fdd-9fa06afc6bc8,0,1,46,0,1


In [478]:
x = upgrade_df.iloc[:, 2:6]
y = upgrade_df.iloc[:, 1]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, stratify=y, random_state = 0)
model = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

          -1       0.05      0.27      0.08       496
           0       0.97      0.66      0.79     23792
           1       0.06      0.53      0.11       712

    accuracy                           0.65     25000
   macro avg       0.36      0.49      0.33     25000
weighted avg       0.93      0.65      0.75     25000



In [479]:
model.coef_

array([[ 0.38280161,  0.00898185, -0.05104691, -0.35145123],
       [-0.9142427 , -0.0016    ,  0.0045006 ,  0.11936181],
       [ 0.53144109, -0.00738184,  0.0465463 ,  0.23208942]])

In [480]:
accuracy_score(ytest, ypred)

0.64896

In [481]:
billingRaw['plan_tier_shift'] = billingRaw.groupby('user_id')['plan_tier'].shift()
billingRaw['tier_change'] = billingRaw['plan_tier'] != billingRaw['plan_tier_shift']
first_rows_idx = billingRaw.groupby('user_id').head(1).index
billingRaw.loc[first_rows_idx, 'tier_change'] = False

first_change_prev_idx = (
    billingRaw[billingRaw['tier_change']]
      .groupby('user_id')
      .head(1)
      .index - 1
)

billingRaw['seats'] = np.nan
billingRaw.loc[first_change_prev_idx, 'seats'] = billingRaw.loc[first_change_prev_idx, 'active_seats']

no_change_mask = billingRaw.groupby('user_id')['tier_change'].transform('sum').eq(0)
last_rows_no_change = billingRaw[no_change_mask].groupby('user_id').tail(1)
billingRaw.loc[last_rows_no_change.index, 'seats'] = last_rows_no_change['active_seats']

seats_before = (
    billingRaw.dropna(subset=['seats'])
              .drop_duplicates('user_id')[['user_id', 'seats']]
              .reset_index(drop=True)
)

In [482]:
upgrade_df = upgrade_df.merge(seats_before, on='user_id', how='left')
upgrade_df = upgrade_df.drop('active_seats', axis = 1)

In [483]:
upgrade_df

Unnamed: 0,user_id,upgrade,discount_applied,invoices_overdue,support_ticket_count,seats
0,0000093e-7258-43c0-b212-ea7212795ddf,0,0,0,0,3.0
1,0003c565-9352-45aa-a448-82b91e57b043,0,0,0,1,3.0
2,00044174-f9c1-4412-b9ca-584a21903f92,0,0,0,0,6.0
3,0005523a-ffde-4fe8-96d8-c203b1fe2f97,0,0,0,0,2.0
4,0006fdbd-c35d-4c0d-a948-e8ee10f279d5,0,0,0,0,4.0
...,...,...,...,...,...,...
99995,fffa9cdd-f19a-4849-ae3b-2302b5b68020,0,0,0,0,11.0
99996,fffcb715-b3de-4a0e-813a-df5dd2f833d2,0,0,0,0,6.0
99997,fffce54a-cd86-4f93-b529-d92586690bd5,0,0,0,0,21.0
99998,fffe77cc-44fd-401f-8fdd-9fa06afc6bc8,0,1,0,1,46.0


In [484]:
x = upgrade_df.iloc[:, 2:6]
y = upgrade_df.iloc[:, 1]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, stratify=y, random_state = 0)
model = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

          -1       0.05      0.28      0.08       496
           0       0.97      0.66      0.79     23792
           1       0.06      0.54      0.11       712

    accuracy                           0.65     25000
   macro avg       0.36      0.49      0.33     25000
weighted avg       0.93      0.65      0.75     25000



In [485]:
model.coef_

array([[ 0.38331318, -0.04997115, -0.34656803,  0.00914277],
       [-0.91307359,  0.0050237 ,  0.09854433, -0.00097921],
       [ 0.52976041,  0.04494746,  0.2480237 , -0.00816357]])

In [486]:
accuracy_score(ytest, ypred)

0.65044

In [487]:
billingRaw['plan_tier_shift'] = billingRaw.groupby('user_id')['plan_tier'].shift()
billingRaw['tier_change'] = billingRaw['plan_tier'] != billingRaw['plan_tier_shift']
first_rows_idx = billingRaw.groupby('user_id').head(1).index
billingRaw.loc[first_rows_idx, 'tier_change'] = False

first_change_prev_idx = (
    billingRaw[billingRaw['tier_change']]
      .groupby('user_id')
      .head(1)
      .index - 1
)

billingRaw['support_tickets'] = np.nan
billingRaw.loc[first_change_prev_idx, 'support_tickets'] = billingRaw.loc[first_change_prev_idx, 'support_ticket_count']

no_change_mask = billingRaw.groupby('user_id')['tier_change'].transform('sum').eq(0)
last_rows_no_change = billingRaw[no_change_mask].groupby('user_id').tail(1)
billingRaw.loc[last_rows_no_change.index, 'support_tickets'] = last_rows_no_change['support_ticket_count']

support_tickets_before = (
    billingRaw.dropna(subset=['support_tickets'])
              .drop_duplicates('user_id')[['user_id', 'support_tickets']]
              .reset_index(drop=True)
)

In [488]:
upgrade_df = upgrade_df.merge(support_tickets_before, on='user_id', how='left')
upgrade_df = upgrade_df.drop('support_ticket_count', axis = 1)

In [520]:
upgrade_df

Unnamed: 0,user_id,upgrade,discount_applied,invoices_overdue,seats,support_tickets
0,0000093e-7258-43c0-b212-ea7212795ddf,0,0,0,3.0,0.0
1,0003c565-9352-45aa-a448-82b91e57b043,0,0,0,3.0,1.0
2,00044174-f9c1-4412-b9ca-584a21903f92,0,0,0,6.0,0.0
3,0005523a-ffde-4fe8-96d8-c203b1fe2f97,0,0,0,2.0,0.0
4,0006fdbd-c35d-4c0d-a948-e8ee10f279d5,0,0,0,4.0,0.0
...,...,...,...,...,...,...
99995,fffa9cdd-f19a-4849-ae3b-2302b5b68020,0,0,0,11.0,0.0
99996,fffcb715-b3de-4a0e-813a-df5dd2f833d2,0,0,0,6.0,0.0
99997,fffce54a-cd86-4f93-b529-d92586690bd5,0,0,0,21.0,0.0
99998,fffe77cc-44fd-401f-8fdd-9fa06afc6bc8,0,1,0,46.0,1.0


In [522]:
x = upgrade_df.iloc[:, 2:6]
y = upgrade_df.iloc[:, 1]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, stratify=y, random_state = 0)
model = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

          -1       0.04      0.27      0.07       496
           0       0.97      0.67      0.79     23792
           1       0.07      0.51      0.12       712

    accuracy                           0.66     25000
   macro avg       0.36      0.48      0.33     25000
weighted avg       0.93      0.66      0.76     25000



In [524]:
model.coef_

array([[ 3.29264936e-01, -3.45444722e-02,  2.93961680e-04,
         1.37757852e-01],
       [-8.93387832e-01,  1.11883251e-04,  3.94891731e-04,
        -3.28147626e-03],
       [ 5.64122896e-01,  3.44325889e-02, -6.88853411e-04,
        -1.34476375e-01]])

In [526]:
accuracy_score(ytest, ypred)

0.65736

In [532]:
nullpred = np.zeros(25000)
accuracy_score(nullpred, ypred)

0.65476

In [538]:
activeUpgrades = upgrade_df[upgrade_df['upgrade'] != 0]
activeUpgrades

Unnamed: 0,user_id,upgrade,discount_applied,invoices_overdue,seats,support_tickets
62,0031b10d-a3fb-4fa6-a27b-ea1ef8d5131b,-1,0,0,8.0,0.0
66,00330be0-cfe8-4ee9-9a88-9e926b9d8ec2,1,1,0,5.0,1.0
81,0046c527-3d82-4cde-9037-c0416584d506,1,0,0,6.0,0.0
95,00503b30-79b8-4aed-aeb5-a91aca3053be,-1,0,0,117.0,2.0
121,0064057d-27b5-4cd8-9595-eba825d272b8,1,1,0,5.0,0.0
...,...,...,...,...,...,...
99822,ff844a5d-2fea-4218-9a7c-aee980e53b2d,-1,1,0,21.0,0.0
99835,ff8bbc6d-6b53-4cfb-89ce-d3eeed8fb238,1,1,0,1.0,0.0
99848,ff96f424-3f6a-4dd4-b865-f31ed17ec71f,-1,0,0,25.0,2.0
99954,ffd9677d-41cc-47ab-8e9d-5c49c10c6548,-1,0,0,356.0,2.0


In [540]:
x = activeUpgrades.iloc[:, 2:6]
y = activeUpgrades.iloc[:, 1]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, stratify=y, random_state = 0)
model = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

          -1       0.58      0.38      0.46       496
           1       0.65      0.81      0.72       712

    accuracy                           0.63      1208
   macro avg       0.61      0.59      0.59      1208
weighted avg       0.62      0.63      0.61      1208



In [542]:
model.coef_

array([[ 0.12519259, -0.06516563, -0.00129051, -0.26985275]])

In [544]:
accuracy_score(ytest, ypred)

0.6316225165562914

In [550]:
nullpred = np.ones(1208)
accuracy_score(nullpred, ypred)

0.7326158940397351

In [554]:
posUpgrades = upgrade_df
posUpgrades['upgrade'] = np.where(posUpgrades['upgrade'] == -1, 0, posUpgrades['upgrade'])

In [558]:
posUpgrades['upgrade'].unique()

array([0, 1])

In [None]:
x = posUpgrades.iloc[:, 2:6]
y = activeUpgrades.iloc[:, 1]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, stratify=y, random_state = 0)
model = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)

print(classification_report(ytest, ypred))