In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

In [15]:
# Both inputs are JSON Lines files (one JSON object per line).
# A/B-test log: one record per recommendation.
recommendations_df = pd.read_json(
    './log_files/ab_test_20250117.log',
    lines=True,
)
# Listening sessions that the recommendations are evaluated against.
listening_data = pd.read_json(
    './data_files/test_sessions.jsonl',
    lines=True,
)

In [16]:
# Accumulator for the per-recommendation summary rows (one dict per row).
results = list()

In [17]:
# Build one summary row per logged recommendation.
# `results` is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every row.
results = []

for index, recommendation in recommendations_df.iterrows():
    rec_time = pd.to_datetime(recommendation['timestamp'])

    group_users = recommendation['user_ids']
    recommended_tracks = recommendation['recommended_tracks']
    group_size = len(group_users)

    # Every listening event that belongs to a member of this group.
    # NOTE(review): events are not restricted to those after `rec_time`, so plays
    # that happened before the recommendation may be counted as well — confirm
    # whether `listening_data` carries a timestamp that should be filtered on.
    group_listening = listening_data[listening_data['user_id'].isin(group_users)]

    # Number of distinct group members per track, computed in a single pass
    # instead of re-filtering the frame once per recommended track.
    listeners_by_track = (
        group_listening.groupby('track_id')['user_id'].nunique().to_dict()
    )
    track_stats = {
        track: listeners_by_track.get(track, 0) for track in recommended_tracks
    }
    listener_counts = list(track_stats.values())

    result = {
        'recommendation_id': index,
        'timestamp': rec_time,
        'model_type': recommendation['model_type'],
        'group_size': group_size,

        # Mean number of distinct listeners per recommended track. Stays NaN when
        # nothing was recommended, but without NumPy's empty-slice warning.
        'avg_listeners': np.mean(listener_counts) if listener_counts else np.nan,

        'tracks_with_listeners': sum(1 for count in listener_counts if count > 0),

        # An empty group cannot have "half" or "all" of its members listen; without
        # the group_size guard every track would satisfy 0 >= 0 and 0 == 0.
        'tracks_listened_by_half': sum(
            1 for count in listener_counts
            if group_size and count >= group_size / 2
        ),

        'tracks_listened_by_all': sum(
            1 for count in listener_counts
            if group_size and count == group_size
        ),
    }

    # Set of listened tracks per user, so each membership test below is O(1).
    tracks_by_user = {
        user: set(tracks)
        for user, tracks in group_listening.groupby('user_id')['track_id']
    }
    for user in group_users:
        listened_tracks = tracks_by_user.get(user, set())
        # How many of the recommended tracks this user listened to.
        result[f'user_{user}_listened'] = sum(
            1 for track in recommended_tracks if track in listened_tracks
        )

    results.append(result)

In [18]:
# One row per recommendation. The per-user `user_<id>_listened` keys differ between
# rows, so those columns are NaN for recommendations that did not include the user.
analysis_df = pd.DataFrame(data=results)

In [19]:
# Mean of each group-level metric, broken down by recommendation model.
summary_columns = [
    'avg_listeners',
    'tracks_with_listeners',
    'tracks_listened_by_half',
    'tracks_listened_by_all',
]
# The heading previously started with "\A", which is not a valid escape sequence
# and printed a literal backslash; a leading newline ("\n") was intended.
print("\nAverage statistics model wise:")
display(analysis_df.groupby('model_type')[summary_columns].mean())

\Average statistics model wise:


Unnamed: 0_level_0,avg_listeners,tracks_with_listeners,tracks_listened_by_half,tracks_listened_by_all
model_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
advanced,0.770068,10.326531,0.183673,0.0
basic,0.660135,9.341463,0.02439,0.0


With the data available to the script above, the proposed success criterion for the A/B experiment (number of generated playlists listened to for at least 60 minutes / total number of generated playlists > 0.6) cannot be calculated properly.

In the current data, we only have information about whether a user listened to a track (through track_id), but we lack crucial information about:
- How long each track was played
- Whether the track was listened to completely or partially
- The total listening time for each playlist

To properly measure the proposed success criterion, we would need to:

### Data Enrichment
The `listening_data` should be enhanced with:
- Playback start timestamp
- Playback end timestamp or session duration
- Length of each track

### Script Modifications
The current script serves only as a brief demonstration of how to read data from log files and extract basic information. To measure the success criterion properly, the script would need additional features:
- Calculation of total listening time for each playlist
- Identification of playlists that reach the 60-minute threshold
- Computation of the final ratio (number of playlists meeting the criterion / total number of playlists)

Without this additional data, we can only estimate recommendation effectiveness from the metrics that are available now; we cannot definitively determine whether the 60-minute listening criterion is met.