In [72]:
import polars as pl
import numpy as np
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import time

class EventHMM:
    """
    Hidden Markov Model for analyzing app event sequences.

    States represent hidden user behavior patterns.
    Observations are the actual events (login, click, purchase, etc.)
    """

    def __init__(self, n_states: int = 3, state_names: Optional[List[str]] = None):
        """
        Initialize HMM model.

        Args:
            n_states: Number of hidden states
            state_names: Optional names for states (e.g., ['browsing', 'engaged', 'converting'])
        """
        self.n_states = n_states
        self.state_names = state_names or [f"State_{i}" for i in range(n_states)]

        # Model parameters
        self.transition_matrix = None  # A[i,j] = P(state_j | state_i)
        self.emission_matrix = None    # B[i,j] = P(event_j | state_i)
        self.initial_probs = None      # π[i] = P(state_i at t=0)

        # Mappings
        self.event_to_idx = {}
        self.idx_to_event = {}
        self.n_events = 0

        # Training data storage
        self.sequences = []
        self.user_sequences = {}

    def _prepare_data(self, df: pl.DataFrame) -> Dict[str, List[int]]:
        """
        Convert DataFrame to sequences of event indices grouped by user.
        """
        # Get unique events and create mappings
        unique_events = df.select("event").unique().to_series().to_list()
        self.event_to_idx = {event: idx for idx, event in enumerate(unique_events)}
        self.idx_to_event = {idx: event for event, idx in self.event_to_idx.items()}
        self.n_events = len(unique_events)

        print(f"Found {self.n_events} unique events: {unique_events}")

        # Group by user and create sequences
        user_sequences = {}

        # Sort by user and timestamp
        df_sorted = df.sort(["userid", "timestamp"])

        # Group by user
        for user_data in df_sorted.group_by("userid"):
            userid = user_data[0]
            events = user_data[1].select("event").to_series().to_list()

            # Convert events to indices
            event_sequence = [self.event_to_idx[event] for event in events]

            # Only keep sequences with more than 1 event
            if len(event_sequence) > 1:
                user_sequences[userid] = event_sequence

        print(f"Prepared sequences for {len(user_sequences)} users")
        return user_sequences

    def _initialize_parameters(self):
        """
        Initialize HMM parameters randomly.
        """
        # Random initialization with normalization
        self.transition_matrix = np.random.rand(self.n_states, self.n_states)
        self.transition_matrix = self.transition_matrix / self.transition_matrix.sum(axis=1, keepdims=True)

        self.emission_matrix = np.random.rand(self.n_states, self.n_events)
        self.emission_matrix = self.emission_matrix / self.emission_matrix.sum(axis=1, keepdims=True)

        self.initial_probs = np.random.rand(self.n_states)
        self.initial_probs = self.initial_probs / self.initial_probs.sum()

        print("Initialized random parameters")

    def _forward_algorithm(self, sequence: List[int]) -> Tuple[np.ndarray, float]:
        """
        Forward algorithm for computing forward probabilities.
        """
        T = len(sequence)
        alpha = np.zeros((T, self.n_states))

        # Initialization
        alpha[0] = self.initial_probs * self.emission_matrix[:, sequence[0]]

        # Forward pass
        for t in range(1, T):
            for j in range(self.n_states):
                alpha[t, j] = np.sum(alpha[t-1] * self.transition_matrix[:, j]) * \
                             self.emission_matrix[j, sequence[t]]

        # Total probability
        log_likelihood = np.log(np.sum(alpha[T-1]) + 1e-10)

        return alpha, log_likelihood

    def _backward_algorithm(self, sequence: List[int]) -> np.ndarray:
        """
        Backward algorithm for computing backward probabilities.
        """
        T = len(sequence)
        beta = np.zeros((T, self.n_states))

        # Initialization
        beta[T-1] = 1.0

        # Backward pass
        for t in range(T-2, -1, -1):
            for i in range(self.n_states):
                beta[t, i] = np.sum(self.transition_matrix[i] *
                                  self.emission_matrix[:, sequence[t+1]] *
                                  beta[t+1])

        return beta

    def _baum_welch_step(self, sequences: List[List[int]]) -> float:
        """
        One step of Baum-Welch algorithm (EM step).
        """
        # Initialize accumulators
        gamma_sum = np.zeros(self.n_states)
        xi_sum = np.zeros((self.n_states, self.n_states))
        gamma_obs_sum = np.zeros((self.n_states, self.n_events))

        total_log_likelihood = 0.0

        for sequence in sequences:
            if len(sequence) < 2:
                continue

            T = len(sequence)

            # Forward-backward
            alpha, log_likelihood = self._forward_algorithm(sequence)
            beta = self._backward_algorithm(sequence)

            total_log_likelihood += log_likelihood

            # Compute gamma (state probabilities)
            gamma = alpha * beta
            gamma = gamma / (np.sum(gamma, axis=1, keepdims=True) + 1e-10)

            # Compute xi (transition probabilities)
            xi = np.zeros((T-1, self.n_states, self.n_states))
            for t in range(T-1):
                for i in range(self.n_states):
                    for j in range(self.n_states):
                        xi[t, i, j] = alpha[t, i] * self.transition_matrix[i, j] * \
                                     self.emission_matrix[j, sequence[t+1]] * beta[t+1, j]

                # Normalize
                xi_sum_t = np.sum(xi[t])
                if xi_sum_t > 0:
                    xi[t] /= xi_sum_t

            # Accumulate statistics
            gamma_sum += np.sum(gamma, axis=0)
            xi_sum += np.sum(xi, axis=0)

            # Emission accumulation
            for t in range(T):
                gamma_obs_sum[:, sequence[t]] += gamma[t]

        # M-step: Update parameters
        # Initial probabilities (from first time step gammas)
        initial_gamma_sum = np.zeros(self.n_states)
        for sequence in sequences:
            if len(sequence) >= 1:
                alpha, _ = self._forward_algorithm(sequence)
                beta = self._backward_algorithm(sequence)
                gamma_0 = alpha[0] * beta[0]
                gamma_0 /= (np.sum(gamma_0) + 1e-10)
                initial_gamma_sum += gamma_0

        self.initial_probs = initial_gamma_sum / (np.sum(initial_gamma_sum) + 1e-10)

        # Transition matrix
        for i in range(self.n_states):
            row_sum = np.sum(xi_sum[i])
            if row_sum > 0:
                self.transition_matrix[i] = xi_sum[i] / row_sum
            else:
                self.transition_matrix[i] = 1.0 / self.n_states

        # Emission matrix
        for i in range(self.n_states):
            row_sum = gamma_sum[i]
            if row_sum > 0:
                self.emission_matrix[i] = gamma_obs_sum[i] / row_sum
            else:
                self.emission_matrix[i] = 1.0 / self.n_events

        return total_log_likelihood

    def fit(self, df: pl.DataFrame, max_iterations: int = 100, tolerance: float = 1e-2):
        """
        Train the HMM using Baum-Welch algorithm.
        """
        print("Starting HMM training...")

        # Prepare data
        self.user_sequences = self._prepare_data(df)
        sequences = list(self.user_sequences.values())

        # Initialize parameters
        self._initialize_parameters()

        # Training loop
        prev_log_likelihood = float('-inf')

        for iteration in range(max_iterations):
            start_time = time.time()
            log_likelihood = self._baum_welch_step(sequences)
            end_time = time.time()
            elapsed = (end_time - start_time)/60
            print(f"Iteration {iteration + 1}: Log-likelihood = {log_likelihood:.4f}, Elapsed time: {elapsed:.2f}")

            # Check convergence
            if abs(log_likelihood - prev_log_likelihood) < tolerance:
                print(f"Converged after {iteration + 1} iterations")
                break

            prev_log_likelihood = log_likelihood

        print("Training completed!")
        self._print_model_summary()

    def predict_sequence(self, sequence: List[str]) -> List[str]:
        """
        Predict most likely state sequence using Viterbi algorithm.
        """
        # Convert events to indices
        obs_sequence = [self.event_to_idx.get(event, 0) for event in sequence]
        T = len(obs_sequence)

        # Viterbi tables
        delta = np.zeros((T, self.n_states))
        psi = np.zeros((T, self.n_states), dtype=int)

        # Initialization
        delta[0] = self.initial_probs * self.emission_matrix[:, obs_sequence[0]]

        # Forward pass
        for t in range(1, T):
            for j in range(self.n_states):
                trans_probs = delta[t-1] * self.transition_matrix[:, j]
                psi[t, j] = np.argmax(trans_probs)
                delta[t, j] = np.max(trans_probs) * self.emission_matrix[j, obs_sequence[t]]

        # Backward pass - find best path
        states = np.zeros(T, dtype=int)
        states[T-1] = np.argmax(delta[T-1])

        for t in range(T-2, -1, -1):
            states[t] = psi[t+1, states[t+1]]

        # Convert to state names
        return [self.state_names[state] for state in states]

    def get_user_behavior_pattern(self, userid: str) -> Optional[List[str]]:
        """
        Get the predicted behavior pattern for a specific user.
        """
        if userid not in self.user_sequences:
            return None

        sequence = self.user_sequences[userid]
        events = [self.idx_to_event[idx] for idx in sequence]
        return self.predict_sequence(events)

    def _print_model_summary(self):
        """
        Print a summary of the learned model.
        """
        print("\n" + "="*50)
        print("HMM MODEL SUMMARY")
        print("="*50)

        print(f"\nNumber of states: {self.n_states}")
        print(f"Number of events: {self.n_events}")

        print(f"\nInitial State Probabilities:")
        for i, prob in enumerate(self.initial_probs):
            print(f"  {self.state_names[i]}: {prob:.3f}")

        print(f"\nTransition Matrix:")
        print("From \\ To    ", end="")
        for state in self.state_names:
            print(f"{state:>10}", end="")
        print()

        for i, state in enumerate(self.state_names):
            print(f"{state:>10}   ", end="")
            for j in range(self.n_states):
                print(f"{self.transition_matrix[i,j]:>10.3f}", end="")
            print()

        print(f"\nEmission Probabilities (Top 10 events per state):")
        for i, state in enumerate(self.state_names):
            print(f"\n{state}:")
            # Get top 3 events for this state
            top_events = np.argsort(self.emission_matrix[i])[-10:][::-1]
            for event_idx in top_events:
                event_name = self.idx_to_event[event_idx]
                prob = self.emission_matrix[i, event_idx]
                print(f"  {event_name}: {prob:.3f}")

    def plot_model(self):
        """
        Visualize the HMM model.
        """
        #fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

        fig = plt.figure(figsize=(15, 12))
        gs = fig.add_gridspec(3, 2)  # 3 sor, 2 oszlop
        
        ax1 = fig.add_subplot(gs[0, 0])
        ax2 = fig.add_subplot(gs[0, 1])
        ax3 = fig.add_subplot(gs[1, :])  # második sor, mindkét oszlopot lefoglalja
        ax4 = fig.add_subplot(gs[2, :])  # harmadik sor, mindkét oszlopot lefoglalja

        # 1. Initial probabilities
        ax1.bar(self.state_names, self.initial_probs)
        ax1.set_title('Initial State Probabilities')
        ax1.set_ylabel('Probability')

        # 2. Transition matrix heatmap
        sns.heatmap(self.transition_matrix,
                   xticklabels=self.state_names,
                   yticklabels=self.state_names,
                   annot=True, fmt='.3f', cmap='Blues', ax=ax2)
        ax2.set_title('State Transition Matrix')

        # 3. Emission matrix heatmap
        event_names = [self.idx_to_event[i] for i in range(self.n_events)]
        sns.heatmap(self.emission_matrix,
                   xticklabels=event_names,
                   yticklabels=self.state_names,
                   annot=True, fmt='.2f', cmap='Greens', ax=ax3)
        ax3.set_title('Event Emission Probabilities')
        ax3.set_xlabel('Events')

        # 4. Event distribution
        event_counts = np.sum(self.emission_matrix, axis=0)
        ax4.bar(event_names, event_counts)
        ax4.set_title('Overall Event Distribution')
        ax4.set_xlabel('Events')
        ax4.set_ylabel('Total Probability Mass')
        ax4.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()


# Example usage and testing
def create_sample_data():
    """
    Create sample app event data for testing.
    """
    np.random.seed(42)

    events = ['app_open', 'browse_catalog', 'view_product', 'add_to_cart',
              'checkout_start', 'payment', 'purchase_complete', 'app_close']

    countries = ['US', 'UK', 'DE', 'FR', 'CA']

    data = []

    # Generate realistic user sessions
    for userid in range(100):
        # Each user has 1-5 sessions
        n_sessions = np.random.randint(1, 6)

        for session in range(n_sessions):
            country = np.random.choice(countries)
            base_time = session * 3600 + userid * 86400  # Different days/sessions

            # Generate event sequence based on user behavior pattern
            if userid < 30:  # Browsers - mostly browse, rarely purchase
                session_events = ['app_open', 'browse_catalog'] + \
                               np.random.choice(['browse_catalog', 'view_product'],
                                              size=np.random.randint(2, 8)).tolist() + \
                               ['app_close']
            elif userid < 70:  # Shoppers - browse and sometimes buy
                session_events = ['app_open', 'browse_catalog', 'view_product'] + \
                               np.random.choice(['view_product', 'add_to_cart'],
                                              size=np.random.randint(1, 4)).tolist()
                if np.random.random() < 0.3:  # 30% conversion
                    session_events += ['checkout_start', 'payment', 'purchase_complete']
                session_events += ['app_close']
            else:  # Buyers - focused, high conversion
                session_events = ['app_open', 'browse_catalog', 'view_product',
                                'add_to_cart', 'checkout_start']
                if np.random.random() < 0.8:  # 80% conversion
                    session_events += ['payment', 'purchase_complete']
                session_events += ['app_close']

            # Add events with timestamps
            for i, event in enumerate(session_events):
                data.append({
                    'userid': f'user_{userid:03d}',
                    'country': country,
                    'timestamp': base_time + i * 60,  # 1 minute between events
                    'event': event
                })

    return pl.DataFrame(data)



In [30]:
df = pl.read_parquet("data/full_filtered_df.parquet")

In [31]:
df

event,distinct_id,os_version,country_code,date,time_since_first_event,days,hours,minutes,seconds,groups
str,str,str,str,datetime[μs],duration[μs],i32,i32,i32,i32,str
"""secondOrMoreOpening""","""$device:user_37449502161176""","""18.1.1""","""CH""",2025-02-02 12:04:50,13d 2h 31m 12s,13,314,18871,1132272,"""Second"""
"""Homescreen""","""$device:user_37449502161176""","""18.1.1""","""CH""",2025-02-02 12:04:50,13d 2h 31m 12s,13,314,18871,1132272,"""Homescreen"""
"""checkSubscriptionStatus.Error""","""$device:user_37449502161176""","""18.1.1""","""CH""",2025-02-02 12:04:50,13d 2h 31m 12s,13,314,18871,1132272,"""checkSubscriptionStatus.Error"""
"""Branch.Log.Error""","""$device:user_37449502161176""","""18.1.1""","""CH""",2025-02-02 12:04:50,13d 2h 31m 12s,13,314,18871,1132272,"""branch"""
"""RemoteConfigManager.shared.rem…","""$device:user_37449502161176""","""18.1.1""","""CH""",2025-02-02 12:04:50,13d 2h 31m 12s,13,314,18871,1132272,"""Remote"""
…,…,…,…,…,…,…,…,…,…,…
"""VideoPreviewScreen.exportVideo""","""$device:user_22284728513097""","""18.2.1""","""US""",2025-01-23 11:29:07,1d 12m 36s,1,24,1452,87156,"""VideoPreview"""
"""VideoPreviewScreen.SaveSuccess""","""$device:user_22284728513097""","""18.2.1""","""US""",2025-01-23 11:29:09,1d 12m 38s,1,24,1452,87158,"""VideoPreviewScreen.SaveSuccess"""
"""RatingPromptShownImmediately""","""$device:user_22284728513097""","""18.2.1""","""US""",2025-01-23 11:29:09,1d 12m 38s,1,24,1452,87158,"""Rating"""
"""$ae_session""","""$device:user_22284728513097""","""18.2.1""","""US""",2025-01-23 11:29:19,1d 12m 48s,1,24,1452,87168,"""$ae"""


In [32]:
df2 = df.select([pl.col("groups"), pl.col("distinct_id"), pl.col("country_code"), pl.col("seconds")])

In [33]:
df2 = df2.rename({"distinct_id": "userid"})
df2 = df2.rename({"country_code": "country"})
df2 = df2.rename({"seconds": "timestamp"})
df2 = df2.rename({"groups": "event"})
df2 = df2.select(["userid", "event", "country", "timestamp"])

In [34]:
df2

userid,event,country,timestamp
str,str,str,i32
"""$device:user_37449502161176""","""Second""","""CH""",1132272
"""$device:user_37449502161176""","""Homescreen""","""CH""",1132272
"""$device:user_37449502161176""","""checkSubscriptionStatus.Error""","""CH""",1132272
"""$device:user_37449502161176""","""branch""","""CH""",1132272
"""$device:user_37449502161176""","""Remote""","""CH""",1132272
…,…,…,…
"""$device:user_22284728513097""","""VideoPreview""","""US""",87156
"""$device:user_22284728513097""","""VideoPreviewScreen.SaveSuccess""","""US""",87158
"""$device:user_22284728513097""","""Rating""","""US""",87158
"""$device:user_22284728513097""","""$ae""","""US""",87168


In [80]:
def compute_bic(hmm, data, logL):
    """
    hmm: fitted HMM objektum
    data: a megfigyelések (pl. események sorozata)
    """
    N = hmm.n_states
    M = hmm.n_events
    n = len(data)  # teljes adatméret

    # paraméterszám kiszámítása
    k = (N - 1) + N * (N - 1) + N * (M - 1)

    # BIC képlet
    bic = -2 * logL + k * np.log(n)
    return bic

A HMM eredményei a fő eseményeket is tartalmazó adatokon.

In [64]:
# main events
import pickle
if __name__ == "__main__":
    # Create sample data
    states = 3

    print(df2)

    print(f"Sample data shape: {df2.shape}")
    print(f"Unique users: {df2.select('userid').n_unique()}")
    #print(f"Unique events: {df2.select('event').unique().to_series().to_list()}")
    print(f"Date range: {df2.select('timestamp').min().item()} - {df2.select('timestamp').max().item()}")

    # Initialize and train HMM
    print(f"\nInitializing HMM with {states} states...")
    hmm = EventHMM(n_states=states, state_names=[f"State {i+1}" for i in range(states)])

    # Train the model
    hmm.fit(df2, max_iterations=40)

    # Analyze specific user
    sample_user = list(hmm.user_sequences.keys())[1]
    user_pattern = hmm.get_user_behavior_pattern(sample_user)
    if user_pattern:
        print(f"\nUser {sample_user} behavior pattern:")
        user_events = [hmm.idx_to_event[idx] for idx in hmm.user_sequences[sample_user]]
        for event, state in zip(user_events, user_pattern):
            print(f"{event:>18} -> {state}")

    
    with open(f"{states}_hidden_state_hmm.pkl", "wb") as f:
        pickle.dump(hmm, f)

shape: (4_306_330, 4)
┌─────────────────────────────┬────────────────────────────────┬─────────┬───────────┐
│ userid                      ┆ event                          ┆ country ┆ timestamp │
│ ---                         ┆ ---                            ┆ ---     ┆ ---       │
│ str                         ┆ str                            ┆ str     ┆ i32       │
╞═════════════════════════════╪════════════════════════════════╪═════════╪═══════════╡
│ $device:user_37449502161176 ┆ Second                         ┆ CH      ┆ 1132272   │
│ $device:user_37449502161176 ┆ Homescreen                     ┆ CH      ┆ 1132272   │
│ $device:user_37449502161176 ┆ checkSubscriptionStatus.Error  ┆ CH      ┆ 1132272   │
│ $device:user_37449502161176 ┆ branch                         ┆ CH      ┆ 1132272   │
│ $device:user_37449502161176 ┆ Remote                         ┆ CH      ┆ 1132272   │
│ …                           ┆ …                              ┆ …       ┆ …         │
│ $device:user_222847

KeyboardInterrupt: 

In [55]:
 with open(f"{states}_hidden_state_hmm.pkl", "wb") as f:
        pickle.dump(hmm, f)

A main eventeket is tartalmazó HMM eredmények különböző "state" számokra:

Main events:  
3: Log-likelihood =  -350066.2216  BIC: 707632.76  
4: Log-likelihood =  -350018.4482  
5: Log-likelihood =  -350282.2647   
6: Log-likelihood =   
7: Log-likelihood = -348859.2070

A fenti eredmények figyelembe vételével a 3 darab hidden state-el rendelkező HMM illeszkedett a legjobban. 4 vagy további state számok esetén redundás statek jöttek létre.

In [68]:
hmm._print_model_summary()


HMM MODEL SUMMARY

Number of states: 3
Number of events: 162

Initial State Probabilities:
  State 1: 0.393
  State 2: 0.578
  State 3: 0.028

Transition Matrix:
From \ To       State 1   State 2   State 3
   State 1        0.526     0.315     0.160
   State 2        0.198     0.209     0.593
   State 3        0.104     0.606     0.290

Emission Probabilities (Top 10 events per state):

State 1:
  Record: 0.013
  New: 0.013
  account: 0.012
  Regist: 0.012
  reviewTap: 0.012
  Remote: 0.012
  checkSubscriptionStatus.Error: 0.012
  stop: 0.012
  PastedSpeedAndFontsize: 0.012
  salesSentToAgent: 0.012

State 2:
  subtitle: 0.012
  billing_issue_event: 0.012
  type: 0.012
  FootPedalRemoteScreen: 0.012
  PastedSpeedAndFontsize: 0.012
  userDidTakeScreenshotNotification: 0.012
  config_attributes: 0.012
  Select: 0.012
  purchase_abandoned: 0.012
  update: 0.012

State 3:
  trial_started_event: 0.011
  Preview: 0.011
  TrialScreenSubscribeTap: 0.011
  New: 0.011
  subscription: 0.011
  so

### Rövid összefoglaló:

3 állapot:

State 3: trial_started (kezdeti állapot) — innen indulnak a session-ök (100%).

State 1: döntési állapot — főként trial_cancelled (79.4%) és kisebb arányban trial_converted (19.1%).

State 2: lemorzsolódási / retention probléma állapot — főként expiration (74.6%) és billing_issue (12.1%).

Folyamat: Start (S3) → döntés (S1) vagy rögtön retention problémák (S2). A S1 ↔ S2 között ciklikus visszajárás van.

### Kezdő valószínűségek

Minden session State 3-ból indul (1.000) — tehát az adat szerint minden felhasználói folyamat a trial indítással kezdődik.

Mit csinálnak a felhasználók?

State3 → State1 (0.939): a legtöbb user közvetlenül a döntési állapotba lép a trial start után.

State3 → State2 (0.061): kis rész rögtön retention-problémákhoz jut (pl. lejárat, billing issue) anélkül, hogy azonnal döntést hozna.

State1 → State2 (0.987): akik a döntési állapotban vannak (cancel/convert), gyakorlatilag átmennek a retention-problémás állapotba — vagyis a döntés utáni időszakot a lejárat/billing problémák jellemzik.

State2 → State1 (0.745) és State2 → State2 (0.254): aki a retention-problémás állapotban van, gyakran visszakerül a döntési állapotba (pl. ismételt cancel/convert esemény), de jelentős rész ismételt retention jellegű állapotban marad.

### Emissziók — mit jelentenek az állapotok?

State1: 79.4% trial_cancelled, 19.1% trial_converted → ez a kulcs-döntési állapot; döntésnél a cancel dominál.

State2: 74.6% expiration, 12.1% billing_issue → ez a kockázatos állapot, ahonnan a felhasználó gyakran kiesik.

State3: 99.9%+ trial_started → kezdőállapot.

A korábbi statisztikai és gépi tanulásos módszerekhez hasonlóan a Hidden Markow Model is a fő eseményeket olyan nagy súlyozással veszi figyelembe, hogy a többi esemény súlya szinte teljesen nullázódik. A továbbiakban a fő eseményeket leszűrve fogom vizsgálni a felhasználók eseményeit, hogy jobban látszódjon mely események lehetnek a szűk keresztmetszetek vagy okozhatnak állapot változásokat.

A leszűrt dataframe:

In [77]:
main_events = ["billing_issue_event", "cancellation_event", 
              "expiration_event", "initial_purchase_event", 
              "product_change_event", "renewal_event","trial_cancelled_event",
              "trial_converted_event","trial_started_event","uncancellation_event",
              ]
df_without_main_events = df2.filter(~pl.col("event").is_in(main_events))

In [None]:
minor_event_to_drop = ["user", "session", "Show", "$identify", "session",
                      "Second", "identity_alias", "user",
                       "Thank", "$create_alias", "hideReviewTap", "proUserSignIn",
                       "$ae", "device_attributes", "config_attributes", "Tele", "branch", "Rating",
                       "update", "app", "Regist", "Login", "NoRatingBecauseOfNoSceneError",
                       "SettingsScreen", "Whats", "reviewTap", "checkSubscriptionStatus.Error",
                       "fetchRedeemedPromoCodea.Error", "bluetooth", "open", "paywall", "Welcome",
                       "chooseFacebook", "chooseYoutube", "Word", "SortingTap",
                       "script", "create",
                       "TeleprompterDidLoaded", "remove", "PlayerScreen", "product_change_event",
                       "start", "stop", "add", "Font", "Selected", "TrialScreenCloseTap",
                       "caption", "userDidTakeScreenshotNotification", "deeplinkRemoteConnectUsed",
                       "TrialScreenPurchaseSuccess", "documentPicker.Error", "did_receive_asa_attribution", "setup",
                       "connect", "Select", "assigned", "handle", "getPreviousTranscriptionJob.Error",
                       "Settings", "Manage", "SelectPlatform", "ShowRegistrationFromSettings", "TrialScreenCloseConfirmation",
                       "WCSession*ERROR", "trial_started_event", "errorCatched", "didRegister", "AudioCleaningError"
                      ]

df_without_minor_and_main_events = df_without_main_events.filter(~pl.col("event").is_in(minor_event_to_drop))

In [89]:
if __name__ == "__main__":
    states = 3
    print(f"Sample data shape: {df_without_minor_and_main_events.shape}")
    print(f"Unique users: {df_without_minor_and_main_events.select('userid').n_unique()}")
    print(f"Date range: {df_without_minor_and_main_events.select('timestamp').min().item()} - {df_without_minor_and_main_events.select('timestamp').max().item()}")

    # Initialize and train HMM
    print(f"\nInitializing HMM with {states} states...")
    hmm = EventHMM(n_states=states, state_names=[f"State {i+1}" for i in range(states)])

    # Train the model
    hmm.fit(df_without_minor_and_main_events, max_iterations=30)

    # Analyze specific user
    sample_user = list(hmm.user_sequences.keys())[2]
    user_pattern = hmm.get_user_behavior_pattern(sample_user)
    if user_pattern:
        print(f"\nUser {sample_user} behavior pattern:")
        user_events = [hmm.idx_to_event[idx] for idx in hmm.user_sequences[sample_user]]
        for event, state in zip(user_events, user_pattern):
            print(f"{event:>18} -> {state}")

    
    with open(f"{states}_hidden_state_hmm_without_minor_and_main_events.pkl", "wb") as f:
        pickle.dump(hmm, f)

Sample data shape: (1499786, 4)
Unique users: 14949
Date range: 0 - 5032381

Initializing HMM with 3 states...
Starting HMM training...
Found 86 unique events: ['Shortcut', 'subtitle', 'ShowRegistrationFromProfileSettings', 'PresentationRemoteScreen', 'salesSentToAgent', 'Onboarding', 'videorecorder', 'bionic', 'ConnectFacebook', 'PlayerViewControllerSpeechRecognizing', 'purchase', 'Remote', 'deleteRecording', 'Intercom', 'Speech', 'GameControllerManager.Connected', 'ConnectYouTube', 'OpenScript', 'purchase_abandoned', 'setScriptsToSynced.Error', 'freeTrial_start', 'capture', 'AppStorePromoPurchase', 'TrialExtensionScreen', 'Trim', 'transaction_complete', 'Video', 'Could', 'subscription', 'Live', 'Copied', 'ShowReferralScreen', 'SetLexendFont', 'Rephrase', 'type', 'cancel', 'FootPedalRemoteScreen', 'ConnectAccount', 'account', 'videoRecorderRecordingStarted', 'RecordViewController.startSpeechRecognizing', 'FBApp', 'Preview', 'Folder', 'SetOpenDyslexicFont', 'trigger_fire', 'send', 'Hom

Megszűrt események során az eredmények:  
3 state - Log-likelihood: -343473.7914 - BIC: 690687.66  
4 state - Log-likelihood: -343428.2402 - BIC: 691904  
5 state - Log-likelihood: -343416.0812 - BIC: 693217  
6 state - Log-likelihood: -343460.3509 - BIC: 694671

In [88]:
compute_bic(hmm,df_without_minor_and_main_events , -343460.3509)

694671.0557791293