## Importing Packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import statsmodels.api as sm

## Importing Datasets

In [4]:
# Read the four raw CSV exports; the order of the paths matches the order of
# the names on the left-hand side of the unpacking assignment below.
raw_paths = (
    '../datasets/billing.csv',
    '../datasets/events.csv',
    '../datasets/sessions.csv',
    '../datasets/users.csv',
)
billingRaw, eventsRaw, sessionsRaw, usersRaw = map(pd.read_csv, raw_paths)

## Plan tiers to numeric

In [6]:
# Ordinal encoding of the plan tier labels.
PLAN_TIER_CODES = {'free': 0, 'standard': 1, 'premium': 2}

# Address the column by name instead of by position (`columns[2]`), so a
# different column order in the CSV cannot silently overwrite another column.
# Values that are not a known label are passed through unchanged, which keeps
# the cell re-runnable (already-encoded numbers stay as they are); anything
# that still is not numeric afterwards is coerced to NaN by `to_numeric`.
billingRaw['plan_tier'] = pd.to_numeric(
    billingRaw['plan_tier'].map(lambda tier: PLAN_TIER_CODES.get(tier, tier)),
    errors='coerce',
)

## Converting to datetime

In [8]:
# Parse the 'YYYY-MM' strings in `month` into timestamps.
parsed_month = pd.to_datetime(billingRaw['month'], format='%Y-%m')
billingRaw['month'] = parsed_month
# type() reports the container class (a Series), not the element dtype.
type(billingRaw['month'])

pandas.core.series.Series

## Condensed Feature Engineering
The full thought process is documented in billingEDA.ipynb.

In [10]:
def _snapshot_before_first_change(billing, source_col):
    """Mask `source_col` down to one "snapshot" row per user.

    The snapshot row is
      * the user's own row immediately before their first tier change, or
      * the user's last row when they never changed tier.

    Parameters
    ----------
    billing : pandas.DataFrame
        Must contain `user_id`, a boolean `tier_change` column and `source_col`.
    source_col : str
        Column whose value is kept on the snapshot rows.

    Returns
    -------
    pandas.Series
        Aligned with `billing`; holds `source_col` on snapshot rows and NaN
        everywhere else.

    Notes
    -----
    "Previous row" is resolved inside each user's group (group-wise shift)
    rather than with `index - 1`, so the result does not depend on a user's
    rows being contiguous or on the frame having a default RangeIndex.
    """
    user_key = billing['user_id']
    change_flag = billing['tier_change'].astype(int)

    # First change per user: a changed row whose running change count is 1.
    is_first_change = change_flag.eq(1) & change_flag.groupby(user_key).cumsum().eq(1)
    # Shift the marker one row back within the user to land on the row before it.
    precedes_first_change = is_first_change.groupby(user_key).shift(-1, fill_value=False)

    never_changed = change_flag.groupby(user_key).transform('sum').eq(0)
    is_last_row = billing.groupby('user_id').cumcount(ascending=False).eq(0)

    return billing[source_col].where(precedes_first_change | (never_changed & is_last_row))


# NOTE(review): everything below treats each user's rows as already being in
# chronological order (first/last/shift follow row order) — TODO confirm, or
# sort by ['user_id', 'month'] before this cell.

# upgrade: 1 if the last tier is above the first, -1 if below, 0 if equal.
tier_bounds = billingRaw.groupby('user_id')['plan_tier'].agg(['first', 'last']).reset_index()
upgrade_df = tier_bounds[['user_id']].copy()
upgrade_df['upgrade'] = (
    (tier_bounds['last'] > tier_bounds['first']).astype(int)
    - (tier_bounds['last'] < tier_bounds['first']).astype(int)
)

# discount_applied: per-user maximum, joined on the key instead of by position.
discount_df = billingRaw.groupby('user_id')['discount_applied'].max().reset_index()
upgrade_df = upgrade_df.merge(discount_df, on='user_id', how='left')

# Flag rows whose tier differs from the user's previous row. A user's first row
# has no predecessor (its shifted tier is NaN), so it is never a change.
billingRaw['plan_tier_shift'] = billingRaw.groupby('user_id')['plan_tier'].shift()
is_first_row = billingRaw.groupby('user_id').cumcount().eq(0)
billingRaw['tier_change'] = (
    (billingRaw['plan_tier'] != billingRaw['plan_tier_shift']) & ~is_first_row
)

# Seats and support tickets as they stood before the first tier change
# (or on the last row for users whose tier never changed).
billingRaw['seats'] = _snapshot_before_first_change(billingRaw, 'active_seats')
billingRaw['support_tickets'] = _snapshot_before_first_change(billingRaw, 'support_ticket_count')

seats_before = (
    billingRaw.dropna(subset=['seats'])
              .drop_duplicates('user_id')[['user_id', 'seats']]
              .reset_index(drop=True)
)
support_tickets_before = (
    billingRaw.dropna(subset=['support_tickets'])
              .drop_duplicates('user_id')[['user_id', 'support_tickets']]
              .reset_index(drop=True)
)

upgrade_df = upgrade_df.merge(seats_before, on='user_id', how='left')
upgrade_df = upgrade_df.merge(support_tickets_before, on='user_id', how='left')

# Keep only users whose first and last tiers differ.
activeUpgrades = upgrade_df[upgrade_df['upgrade'] != 0]

## Feature Engineering with events.csv
Note: this section is condensed and does not show the full thought process, which is documented in billingEDA.ipynb.

In [12]:
# errors='ignore' keeps this cell re-runnable: on a second run `event_id` is
# already gone and a plain drop would raise KeyError.
eventsRaw = eventsRaw.drop(columns='event_id', errors='ignore')

# One row per user, one column per action type holding that action's count.
action_counts = (
    eventsRaw.groupby('user_id')['action']
             .value_counts()
             .unstack(fill_value=0)
             .reset_index()
)
action_counts['total'] = action_counts[['click', 'complete', 'view']].sum(axis=1)

# Only `total` is used as a feature, so join just that column instead of
# merging the per-action counts and dropping them again afterwards.
df = activeUpgrades.merge(action_counts[['user_id', 'total']], on='user_id', how='left')

# Drops every row with a missing feature, including users that have no rows in
# events (their `total` is NaN after the left join).
df = df.dropna()

# Recode downgrades from -1 to 0 so the target is binary (0 = downgrade, 1 = upgrade).
df['upgrade'] = df['upgrade'].replace(-1, 0)

## Finished dataset
### upgrade
0 is downgrade, 1 is upgrade

### discount_applied
0 if discount not applied, 1 if discount applied

### seats
number of seats BEFORE upgrading or downgrading

### support_tickets
number of support tickets in the month BEFORE upgrading or downgrading

### total
total actions (click, view, complete)

In [26]:
# Bare last expression: renders the engineered feature table (long frames are shown truncated).
df

Unnamed: 0,user_id,upgrade,discount_applied,seats,support_tickets,total
0,0031b10d-a3fb-4fa6-a27b-ea1ef8d5131b,0,0,8.0,0.0,4.0
3,00503b30-79b8-4aed-aeb5-a91aca3053be,0,0,117.0,2.0,2.0
5,007bb89e-1b35-4392-84c9-c4248197157b,1,1,13.0,1.0,1.0
6,0081cb57-7415-430e-a867-929319021a16,1,1,9.0,0.0,5.0
8,00cc5c06-3119-4761-b012-2d6e30054d9b,0,0,89.0,4.0,17.0
...,...,...,...,...,...,...
4822,ff5a14b2-b74b-4c55-ab12-302c6869664a,1,1,8.0,0.0,8.0
4823,ff5d04b7-12b0-4b05-967d-62322e6c83c1,1,1,22.0,1.0,1.0
4825,ff844a5d-2fea-4218-9a7c-aee980e53b2d,0,1,21.0,0.0,2.0
4826,ff8bbc6d-6b53-4cfb-89ce-d3eeed8fb238,1,1,1.0,0.0,3.0


## Fitting logistic regression model

In [28]:
# Target is the second column of `df`; every column after it is a predictor.
y = df.iloc[:, 1]
x = df.iloc[:, 2:]

# Stratified hold-out split with a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0, stratify=y)

# scikit-learn classifier, scored on the held-out rows.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=10000,
    solver='lbfgs',
).fit(xtrain, ytrain)
ypred = model.predict(xtest)

print(classification_report(ytest, ypred))

# statsmodels Logit on the training rows for the coefficient table;
# add_constant supplies the intercept column.
xtrain = sm.add_constant(xtrain)
model = sm.Logit(ytrain, xtrain)
result = model.fit()

print(result.summary())

              precision    recall  f1-score   support

           0       0.70      0.57      0.63       454
           1       0.61      0.73      0.67       412

    accuracy                           0.65       866
   macro avg       0.66      0.65      0.65       866
weighted avg       0.66      0.65      0.65       866

Optimization terminated successfully.
         Current function value: 0.612027
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                upgrade   No. Observations:                 2595
Model:                          Logit   Df Residuals:                     2590
Method:                           MLE   Df Model:                            4
Date:                Sun, 14 Sep 2025   Pseudo R-squ.:                  0.1156
Time:                        23:04:05   Log-Likelihood:                -1588.2
converged:                       True   LL-Null:                       -1795.8
Covariance Type:  