In [1]:
import pandas as pd

# Each file is JSON Lines (one JSON record per line), hence lines=True.
sessions_train_df, sessions_val_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        'data_files/train_sessions.jsonl',
        'data_files/val_sessions.jsonl',
    )
)

# NOTE(review): judging by the file names only, the *_not_agg files are
# presumably the raw, non-aggregated event logs -- confirm against the data.
sessions_train_not_agg_df, sessions_val_not_agg_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        'data_files/train_sessions_not_agg.jsonl',
        'data_files/val_sessions_not_agg.jsonl',
    )
)

In [96]:
def _count_play_like_events(events_df):
    """Count 'play' and 'like' events per (user_id, track_id) pair.

    Returns a DataFrame with the columns user_id, track_id and event_count.
    """
    is_play_or_like = events_df['event_type'].isin(['play', 'like'])
    pair_sizes = events_df[is_play_or_like].groupby(['user_id', 'track_id']).size()
    return pair_sizes.reset_index(name='event_count')


val_interactions = _count_play_like_events(sessions_val_not_agg_df)

# NOTE(review): event_count holds group sizes (integers), so the > 0.9
# threshold appears to keep every row -- confirm what it was meant to filter.
keep_val_rows = (val_interactions['event_count'] > 0.9) & (val_interactions['user_id'] < 500)
val_filtered = val_interactions[keep_val_rows]

train_interactions = _count_play_like_events(sessions_train_not_agg_df)
train_interactions = train_interactions[train_interactions['user_id'] < 500]

# Right join: every train pair is kept; event_count_val is NaN for pairs that
# have no matching row in val_filtered.
merged_df = pd.merge(
    val_filtered,
    train_interactions,
    how='right',
    on=['user_id', 'track_id'],
    suffixes=('_val', '_train'),
)

# Spot-check a single user.
print(merged_df.loc[merged_df['user_id'] == 109])

       user_id  track_id  event_count_val  event_count_train
35719      109        12              NaN                  2
35720      109        31              NaN                  2
35721      109        34              NaN                  2
35722      109        45              NaN                  1
35723      109        52              NaN                  1
...        ...       ...              ...                ...
36576      109     12233              NaN                  2
36577      109     12266              NaN                  2
36578      109     12270              NaN                  1
36579      109     12281              NaN                  1
36580      109     12289              NaN                  1

[862 rows x 4 columns]


In [17]:
# Distinct user and track ids seen in each split's event log.
train_user_ids, train_track_ids = (
    set(sessions_train_not_agg_df[id_column]) for id_column in ('user_id', 'track_id')
)
val_user_ids, val_track_ids = (
    set(sessions_val_not_agg_df[id_column]) for id_column in ('user_id', 'track_id')
)

In [None]:
def _user_track_pairs(events_df):
    """Return the set of distinct (user_id, track_id) tuples found in events_df.

    Rows with a missing user_id or track_id are dropped first, because NaN
    values never compare equal and would otherwise inflate the set.
    """
    pairs = events_df[['user_id', 'track_id']].dropna().drop_duplicates()
    return set(pairs.itertuples(index=False, name=None))


# train_combinations / val_combinations were not defined by any earlier cell,
# so this cell raised NameError after Restart & Run All.
# NOTE(review): "combination" is assumed to mean a (user_id, track_id) pair
# taken from the non-aggregated logs -- confirm this matches the original intent.
train_combinations = _user_track_pairs(sessions_train_not_agg_df)
val_combinations = _user_track_pairs(sessions_val_not_agg_df)

unique_to_val = val_combinations - train_combinations

print(f"Training set combinations: {len(train_combinations)}")
print(f"Validation set combinations: {len(val_combinations)}")
print(f"Combinations unique to validation set: {len(unique_to_val)}")

# Share of validation combinations never seen in training. This was previously
# the hard-coded print(269511/357311), presumably pasted from an earlier run;
# it is now computed so it cannot go stale. Guarded against an empty set.
unique_share = len(unique_to_val) / len(val_combinations) if val_combinations else float('nan')
print(unique_share)

In [None]:
# Rows per user_id in the training log, largest counts first.
(
    sessions_train_not_agg_df
    .groupby('user_id')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Raise the row display limit so more of the table is shown.
pd.options.display.max_rows = 100

# Rows per user_id in the training log, smallest counts first
# (sort_values sorts ascending by default).
(
    sessions_train_not_agg_df
    .groupby('user_id')
    .size()
    .reset_index(name='count')
    .sort_values(by='count')
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Number of rows per user_id in the training log.
counts = sessions_train_not_agg_df.groupby('user_id').size()

# Histogram of per-user row counts. The axis labels are Polish:
# x = "number of user interactions", y = "number of users".
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(counts.values, bins=50)
ax.set_xlabel('Liczba interakcji użytkownika')
ax.set_ylabel('Liczba użytkowników')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# User x track matrix of mean 'score' values from the training sessions;
# user/track pairs without any row are filled with 0.
heatmap_data = sessions_train_df.pivot_table(
    values='score',
    index='user_id',
    columns='track_id',
    aggfunc='mean',
    fill_value=0
)

plt.figure(figsize=(15, 10))
plt.pcolormesh(heatmap_data, cmap='viridis')
plt.colorbar(label='Score')
plt.title('Heatmap of User-Track Scores')
plt.xlabel('Track ID')
plt.ylabel('User ID')

# Label only every tick_step-th column/row to keep the axes readable.
# The tick positions use the built-in range instead of np.arange: numpy is not
# imported by any earlier cell, so np.arange raised NameError on a fresh kernel.
tick_step = 100
plt.xticks(list(range(0, len(heatmap_data.columns), tick_step)),
           heatmap_data.columns[::tick_step],
           rotation=45)
plt.yticks(list(range(0, len(heatmap_data.index), tick_step)),
           heatmap_data.index[::tick_step])

plt.tight_layout()

plt.show()

In [None]:
# User x track matrix of mean 'score' values from the validation sessions;
# user/track pairs without any row are filled with 0.
heatmap_data = sessions_val_df.pivot_table(
    values='score',
    index='user_id',
    columns='track_id',
    aggfunc='mean',
    fill_value=0
)

plt.figure(figsize=(15, 10))
plt.pcolormesh(heatmap_data, cmap='viridis')
plt.colorbar(label='Score')
plt.title('Heatmap of User-Track Scores')
plt.xlabel('Track ID')
plt.ylabel('User ID')

# Label only every tick_step-th column/row to keep the axes readable.
# The tick positions use the built-in range instead of np.arange: numpy is not
# imported by any earlier cell, so np.arange raised NameError on a fresh kernel.
tick_step = 100
plt.xticks(list(range(0, len(heatmap_data.columns), tick_step)),
           heatmap_data.columns[::tick_step],
           rotation=45)
plt.yticks(list(range(0, len(heatmap_data.index), tick_step)),
           heatmap_data.index[::tick_step])

plt.tight_layout()

plt.show()