Split the json files

In [3]:
import json
import random
import os

# Set this to your workspace root
# os.chdir(r"Mining media")
print("Current working directory:", os.getcwd())
def split_jsonl_file(input_path, train_path, test_path, test_ratio=0.2):
    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(train_path, 'w', encoding='utf-8') as train_file, \
         open(test_path, 'w', encoding='utf-8') as test_file:
        
        for line in infile:
            if random.random() < test_ratio:
                test_file.write(line)
            else:
                train_file.write(line)

# Example
split_jsonl_file("data/game2_processed/playerLogs_game2_playerbasedlines.jsonl", "data/game2_processed/train.jsonl", "data/game2_processed/test.jsonl", test_ratio=0.2)


Current working directory: c:\Users\rawan\OneDrive\Desktop\Media Mining


Extract features and prepare model dataset

We only extracted 10000 lines for training and 5000 lines for testing (because my laptop couldnt handle more)



In [10]:
import json
import csv
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

def parse_datetime(dt_str):
    """
    Convert ISO format datetime string to datetime object.
    
    Args:
        dt_str (str): Datetime string in ISO format with 'Z' timezone marker
        
    Returns:
        datetime: Parsed datetime object with UTC timezone
    """
    return datetime.fromisoformat(dt_str.replace("Z", "+00:00"))

def extract_features(events, uid):
    """
    Extract features from a user's event history for churn prediction.
    
    The function splits the data into two periods:
    1. Observation period: First 5 days of user activity
    2. Prediction period: Following 10 days
    
    Args:
        events (list): List of event dictionaries containing user activity data
        uid (str): User identifier
        
    Returns:
        dict: Dictionary containing extracted features, or None if no valid events
        
    Features extracted:
        - play_count: Total number of events in observation period
        - active_days: Number of unique days with activity
        - total_reward: Sum of all rewards earned
        - mean_reward: Average reward per event
        - reward_std: Standard deviation of rewards
        - days_since_last_play: Days between last activity and observation end
        - avg_session_gap: Average hours between consecutive events
        - last_7_days_activity: Binary indicator of activity in last week
        - churn: Binary indicator of user churning in prediction period
    """
    if not events:
        return None

    # Sort events chronologically and define time windows
    events = sorted(events, key=lambda x: parse_datetime(x['date']))
    start_time = parse_datetime(events[0]['date'])
    obs_end = start_time + timedelta(days=5)    # End of observation period
    pred_end = obs_end + timedelta(days=10)     # End of prediction period

    # Split events into observation and prediction periods
    obs_events = [e for e in events if parse_datetime(e['date']) < obs_end]
    pred_events = [e for e in events if obs_end <= parse_datetime(e['date']) < pred_end]

    if not obs_events:
        return None

    # Extract basic event information
    times = [parse_datetime(e['date']) for e in obs_events]
    actions = [e.get('event', '') for e in obs_events]
    rewards = [e.get('reward', 0) for e in obs_events]

    # Calculate activity patterns
    active_days = {t.date() for t in times}  # Set of unique active days
    # Calculate time gaps between consecutive events (in hours)
    gaps = [(times[i+1] - times[i]).total_seconds() / 3600 for i in range(len(times)-1)]

    # Construct feature dictionary
    return {
        'uid': uid,
        'play_count': len(obs_events),          # Total number of events
        'active_days': len(active_days),        # Number of unique active days
        'mean_score': np.mean(rewards),        # Average reward per event
        'score_std': np.std(rewards), 
        'best_score': max(rewards),         # Reward variability
        'days_since_last_play': (obs_end - max(times)).days,  # Recency of last activity
        'avg_session_gap': np.mean(gaps) if gaps else 0,      # Average time between events
        'last_7_days_activity': int(any((obs_end - t).days <= 7 for t in times)),  # Recent activity indicator
        'churn': (len(pred_events) == 0)     # Churn label (1 if no activity in prediction period)
    }

## We only extracted 10000 lines for training
def stream_features_to_csv_train(jsonl_path, output_csv):
    """
    Process a JSONL file of user events and write extracted features to CSV.
    
    This function processes the input file line by line to handle large files
    efficiently. For each user, it flattens their event records and extracts
    features for churn prediction.
    
    Args:
        jsonl_path (str): Path to input JSONL file containing user events
        output_csv (str): Path where the output CSV file will be written
        
    Side effects:
        - Creates or overwrites the output CSV file
        - Prints progress messages every 10,000 lines
        - Prints error messages for skipped lines
    """
    with open(jsonl_path, 'r', encoding='utf-8') as infile, \
         open(output_csv, 'w', encoding='utf-8', newline='') as outfile:

        writer = None
        for i, line in enumerate(infile):
            try:
                # Parse each line as a user record
                row = json.loads(line)
                uid = row.get('uid')
                records = row.get('records', [])
                
                # Flatten and normalize event records
                flat_events = []
                for event in records:
                    props = event.get('properties') or {}
                    flat_events.append({
                        'date': event.get('date'),
                        'event': event.get('event', ''),
                        # Extract numeric properties with safe fallbacks
                        'reward': props.get('reward', 0) if isinstance(props, dict) else 0,
                        'package': props.get('package', 0) if isinstance(props, dict) else 0
                    })

                # Extract features and write to CSV
                features = extract_features(flat_events, uid)
                if features:
                    # Initialize CSV writer with headers on first valid record
                    if writer is None:
                        writer = csv.DictWriter(outfile, fieldnames=list(features.keys()))
                        writer.writeheader()
                    writer.writerow(features)

                # Print progress update every 10,000 records
                if (i + 1) % 10000 == 0:
                    print(f"Processed {i+1} lines...")

                if (i + 1) % 20000 == 0:
                    break

            except Exception as e:
                print(f"Skipping line {i} due to error: {e}")
                continue

    print(f"✅ Finished writing features to {output_csv}")

In [27]:
##We only extracted 5000 lines for testing
def stream_features_to_csv_test(jsonl_path, output_csv):
    """
    Process a JSONL file of user events and write extracted features to CSV.
    
    This function processes the input file line by line to handle large files
    efficiently. For each user, it flattens their event records and extracts
    features for churn prediction.
    
    Args:
        jsonl_path (str): Path to input JSONL file containing user events
        output_csv (str): Path where the output CSV file will be written
        
    Side effects:
        - Creates or overwrites the output CSV file
        - Prints progress messages every 10,000 lines
        - Prints error messages for skipped lines
    """
    with open(jsonl_path, 'r', encoding='utf-8') as infile, \
         open(output_csv, 'w', encoding='utf-8', newline='') as outfile:

        writer = None
        for i, line in enumerate(infile):
            try:
                # Parse each line as a user record
                row = json.loads(line)
                uid = row.get('uid')
                records = row.get('records', [])
                
                # Flatten and normalize event records
                flat_events = []
                for event in records:
                    props = event.get('properties') or {}
                    flat_events.append({
                        'date': event.get('date'),
                        'event': event.get('event', ''),
                        # Extract numeric properties with safe fallbacks
                        'reward': props.get('reward', 0) if isinstance(props, dict) else 0,
                        'package': props.get('package', 0) if isinstance(props, dict) else 0
                    })

                # Extract features and write to CSV
                features = extract_features(flat_events, uid)
                if features:
                    # Initialize CSV writer with headers on first valid record
                    if writer is None:
                        writer = csv.DictWriter(outfile, fieldnames=list(features.keys()))
                        writer.writeheader()
                    writer.writerow(features)

                # Print progress update every 10,000 records
                if (i + 1) % 10000 == 0:
                    print(f"Processed {i+1} lines...")

                if (i + 1) % 5000 == 0:
                    break

            except Exception as e:
                print(f"Skipping line {i} due to error: {e}")
                continue

    print(f"✅ Finished writing features to {output_csv}")

In [28]:
stream_features_to_csv_train("data/game2_processed/train.jsonl", "data/game2_processed/train_features.csv")
stream_features_to_csv_test("data/game2_processed/test.jsonl", "data/game2_processed/test_features.csv")

Processed 10000 lines...
Processed 20000 lines...
✅ Finished writing features to data/game2_processed/train_features.csv


In [None]:
ds1 = pd.read_csv("data/game2_processed/train_features.csv")
ds2 = pd.read_csv("data/game2_processed/test_features.csv")

<class 'pandas.core.frame.DataFrame'>


In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

def prepare_model_data(ds1, ds2):
    """Prepare features and labels for modeling"""
    # Features (exclude device ID and churn label)
    feature_cols = [col for col in ds1.columns if col not in ['uid', 'churn']]
    
    # DS1 (Train)
    X_train = ds1[feature_cols]
    y_train = ds1['churn']
    
    # DS2 (Test) 
    X_test = ds2[feature_cols]
    
    # Impute missing values (using train stats)
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    
    # Scale features (using train stats)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, feature_cols

def train_models(X_train, y_train):
    """Train multiple classifiers"""
    models = {
        'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
    }
    
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
    
    return models

def evaluate_models(models, X_test, ds2, feature_cols):
    """Evaluate models on test data"""
    results = {}
    
    for name, model in models.items():
        # Get predicted probabilities
        y_proba = model.predict_proba(X_test)[:, 1]
        ds2[f'{name}_churn_prob'] = y_proba
        
        # Feature importance (for tree models)
        if hasattr(model, 'feature_importances_'):
            importance = pd.Series(model.feature_importances_, index=feature_cols)
            print(f"\n{name} Feature Importance:")
            print(importance.sort_values(ascending=False).head(10))
        
        results[name] = model
    
    # Save test results with predictions
    ds2.to_csv("data/game2_processed/ds2_with_predictions.csv", index=False)
    print("\nSaved test predictions: ../data/game2_processed/ds2_with_predictions.csv")
    
    return results

# Prepare data
X_train, X_test, y_train, feature_cols = prepare_model_data(ds1, ds2)

# Train models
models = train_models(X_train, y_train)

# Evaluate on DS2
model_results = evaluate_models(models, X_test, ds2, feature_cols)

Training Decision Tree...
Training Random Forest...
Training Logistic Regression...

Decision Tree Feature Importance:
days_since_last_play    0.671280
play_count              0.228508
active_days             0.034674
best_score              0.033863
avg_session_gap         0.022337
score_std               0.005749
mean_score              0.003589
last_7_days_activity    0.000000
dtype: float64

Random Forest Feature Importance:
days_since_last_play    0.305510
active_days             0.273786
play_count              0.181291
avg_session_gap         0.098779
best_score              0.060983
score_std               0.044240
mean_score              0.035409
last_7_days_activity    0.000000
dtype: float64

Saved test predictions: ../data/game2_processed/ds2_with_predictions.csv


In [33]:
# Save datasets in JSONL format
import jsonlines

def save_to_jsonl(df, path):
    with jsonlines.open(path, 'w') as writer:
        writer.write_all(df.to_dict('records'))

# Save DS1 (training data)
save_to_jsonl(ds1, "data/game2_processed/ds1_train.jsonl")
print("Saved DS1 (training) as JSONL: ../data/game2_processed/ds1_train.jsonl")

# Save DS2 (test data) - without churn labels if they don't exist
if 'churned' in ds2.columns:
    save_to_jsonl(ds2.drop(columns=['churned']), "data/game2_processed/ds2_test.jsonl")
else:
    save_to_jsonl(ds2, "data/game2_processed/ds2_test.jsonl")
print("Saved DS2 (test) as JSONL: ../data/game2_processed/ds2_test.jsonl")

Saved DS1 (training) as JSONL: ../data/game2_processed/ds1_train.jsonl
Saved DS2 (test) as JSONL: ../data/game2_processed/ds2_test.jsonl


Load GPT-2 model and tokenizer

In [5]:


from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_id)


print(f"Model '{model_id}' and tokenizer loaded successfully with pad token set.")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model 'gpt2' and tokenizer loaded successfully with pad token set.


Generate prompts for LLM churn prediction

In [22]:


prompts = []

for index, user_data in ds2.iterrows():
    user_id = user_data['uid']
    play_count = user_data.get('play_count', 'N/A')
    active_days = user_data.get('active_days', 'N/A')
    days_since_last_play = user_data.get('days_since_last_play', 'N/A')
    mean_score = user_data.get('mean_score', 'N/A')
    best_score = user_data.get('best_score', 'N/A')
    avg_session_gap = user_data.get('avg_session_gap', 'N/A')

    prompt = (
        f"User ID: {user_id}\n"
        f"Play Count: {play_count}\n"
        f"Active Days: {active_days}\n"
        f"Days Since Last Play: {days_since_last_play}\n"
        f"Average Score: {mean_score:.2f}\n"
        f"Best Score: {best_score}\n"
        f"Session Consistency: {avg_session_gap:.2f} hours between games\n\n"
        "Based on this user's gaming patterns, classify if they will churn (stop playing) in the next period.\n"
        "Respond with EXACTLY one word - either 'Churn' or 'NotChurn'."
    )
    prompts.append(prompt)

print(f"Generated {len(prompts)} prompts.")

Generated 5000 prompts.


Generate LLM churn predictions for a subset of users 

Note: for testing we only extracted 500 prompts

to change this please alter this line "for i, prompt in enumerate(prompts[:100]):"

with "for i, prompt in enumerate(prompts):

In [32]:

# First, add the analyze_prediction function at the top of your notebook or before the prediction loop
def analyze_prediction(generated_text, user_data):
    generated_text_lower = generated_text.lower()
    
    # Direct response check
    if generated_text_lower.strip() in ["churn", "notchurn"]:
        return "Churn" if generated_text_lower.strip() == "churn" else "Not Churn"
    
    # Confidence-based analysis
    churn_indicators = ["churn", "stop", "leave", "quit", "abandon"]
    retention_indicators = ["stay", "continue", "engage", "return", "active", "not churn", "notchurn"]
    
    churn_score = sum(1 for word in churn_indicators if word in generated_text_lower)
    retention_score = sum(1 for word in retention_indicators if word in generated_text_lower)
    
    # Add behavioral weights
    if user_data['days_since_last_play'] >= 4:
        churn_score += 1
    if user_data['last_7day_activity'] <= 1:
        churn_score += 1
    if user_data['play_count'] > 10:
        retention_score += 1
    
    return "Churn" if churn_score > retention_score else "Not Churn"

predictions = {}

print(f"Generating predictions for {len(prompts)} users...")

for i, prompt in enumerate(prompts[:500]):
    try:
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)

        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}

        output_sequences = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=10,  # Reduced since we only need a short response
            num_return_sequences=1,
            temperature=0.3,    # Lower temperature for more focused predictions
            do_sample=True,
            top_p=0.9,         # Add nucleus sampling
            top_k=50,          # Add top-k sampling
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2  # Prevent repetitive text
        )

        generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

        # Extract user_id and get user data
        user_id = prompt.split('\n')[0].replace("User ID: ", "").strip()
        
        # Get user data from the prompt
        user_data = {
            'play_count': int(prompt.split('\n')[1].replace("Play Count: ", "").strip()),
            'active_days': int(prompt.split('\n')[2].replace("Active Days: ", "").strip()),
            'days_since_last_play': int(prompt.split('\n')[3].replace("Days Since Last Play: ", "").strip()),
            'last_7day_activity': int(prompt.split('\n')[1].replace("Play Count: ", "").strip())  # Using play_count as a proxy if not available
        }

        # Use the new prediction analysis
        prediction = analyze_prediction(generated_text, user_data)
        
        predictions[user_id] = prediction
        if (i + 1) % 50 == 0:
            print(f"Processed {i+1} lines...")


    except Exception as e:
        print(f"Error processing prompt {i}: {e}")
        try:
            user_id = prompt.split('\n')[0].replace("User ID: ", "").strip()
            predictions[user_id] = "Error"
        except:
            pass

print("\nPrediction generation complete.")
print(f"Generated predictions for {len(predictions)} users.")

Generating predictions for 5000 users...
Processed 50 lines...
Processed 100 lines...
Processed 150 lines...
Processed 200 lines...
Processed 250 lines...
Processed 300 lines...
Processed 350 lines...
Processed 400 lines...
Processed 450 lines...
Processed 500 lines...

Prediction generation complete.
Generated predictions for 500 users.


In [33]:
print(predictions)

{'0007D808995A3A3F9A061A185D0652DC854D7883': 'Not Churn', '001027BFD879422D2F6274979EF168C89BD5D079': 'Churn', '0013E33812926190BBD0FFA29917C797CA20721C': 'Not Churn', '00158208B67E7A227CC59E9FD3585C517C56E2CC': 'Churn', '0024610E1D72A3990D1F6FAC362C2DF11DF6164E': 'Not Churn', '002711381E686D2B35513BF959AF844FECB15B6F': 'Not Churn', '003B0A44E393B8C83D047AB2B4C908929862F2AF': 'Not Churn', '00448EB3FA9DC0ACD75842246BC6E2DFC9A21B99': 'Not Churn', '006A50EEC9BAEA0ACF020A72492207658DB8EB0D': 'Not Churn', '00A235963BB1F98F59CD4766B732EA874A824F19': 'Not Churn', '00E9D46AD6158CB1309ABC8B474AEACAD4E54E41': 'Not Churn', '015935F89271D4667822DE51BED57363A66AE2E0': 'Not Churn', '0172E50731BC759EE6BCBE4C3FEC16FCC7E845E9': 'Not Churn', '0190E5BA8BC63BDDF98873B7CEA208FCF994331C': 'Not Churn', '0199CF16165723FC6C27B7FF919AC2EEB70C5A54': 'Not Churn', '01A2DE223289C285F9A4224D997D7689DF208D70': 'Not Churn', '01A5A215AAC1141E03ACB26BE8E6B7F528D43B0B': 'Not Churn', '01B9CD994711FE21BB766533572C177FB9BA0

 Evaluate LLM churn predictions

In [34]:


from sklearn.metrics import classification_report
import pandas as pd

processed_user_ids = list(predictions.keys())

actual_labels_df = ds2[ds2['uid'].astype(str).isin(processed_user_ids)].copy()
actual_labels_df['actual_churn_label'] = actual_labels_df['churn'].apply(lambda x: 'Churn' if x else 'Not Churn')

actual_labels = []
predicted_labels = []

for index, row in actual_labels_df.iterrows():
    user_id = str(row['uid'])
    actual_label = row['actual_churn_label']

    predicted_label = predictions.get(user_id, "Unknown")

    actual_labels.append(actual_label)
    predicted_labels.append(predicted_label)

if not actual_labels:
    print("No users processed for evaluation.")
else:
    print("\nEvaluating LLM Churn Predictions:")

    all_possible_labels = ['Churn', 'Not Churn'] + list(set(predicted_labels) - {'Churn', 'Not Churn'})

    print(classification_report(actual_labels, predicted_labels, labels=['Churn', 'Not Churn'], zero_division=0))


Evaluating LLM Churn Predictions:
              precision    recall  f1-score   support

       Churn       0.80      0.15      0.25       386
   Not Churn       0.23      0.88      0.37       114

    accuracy                           0.31       500
   macro avg       0.52      0.51      0.31       500
weighted avg       0.67      0.31      0.27       500

