In [17]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
from io import StringIO
import ast
import gc
from tqdm import tqdm

In [2]:
# SageMaker session, its default bucket, and the execution role for this notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Build one boto3 session and reuse it for the region lookup and both clients,
# instead of constructing a new default session for each call.
boto_session = boto3.Session()
region = boto_session.region_name

sm = boto_session.client(service_name="sagemaker", region_name=region)
s3 = boto_session.client(service_name="s3", region_name=region)

In [3]:
# S3 location of the transformed dataset CSV read by the next cell.
# NOTE: `bucket_name` is a hard-coded bucket, held separately from `bucket`
# (the SageMaker session default) defined above.
bucket_name = 'ads-508-final'
file_key = "transformeddata/full_ps_data.csv"

In [4]:
# Fetch the CSV object and let pandas parse it straight from the streaming body.
# This avoids holding the raw bytes, a decoded str and a StringIO copy in memory
# at once; read_csv decodes as UTF-8 by default, matching the previous explicit decode.
response = s3.get_object(Bucket=bucket_name, Key=file_key)
df = pd.read_csv(response['Body'])

In [12]:
# Preview the first rows of the loaded frame.
# NOTE(review): the saved output includes `purchased`, which is only assigned in a
# later cell, so this cell was last run out of order. Re-run top to bottom to refresh it.
df.head()

Unnamed: 0,playerid,gameid,platform,developers,genres,supported_languages,unique_achievements,days_played,purchased
0,268071,14972.0,PS4,['arkane studios'],['Action'],"['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0,1
1,268071,417905.0,PS5,['Sumo Digital'],['Platformer'],"['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0,1
2,268071,14693.0,PS4,['square enix montreal'],['RPG'],"['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0,1
3,268071,20162.0,PS4,['Ubisoft Montreal Studio'],['RPG'],"['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0,1
4,268071,461983.0,PS5,['Ubisoft Annecy'],"['Sports', 'Cycling', 'Skiing', 'Snowboarding']","['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0,1


In [9]:
# Inspect column dtypes (note that `gameid` is parsed as float64).
# NOTE(review): the saved output has no `purchased` column, so it predates the cell that adds it.
df.dtypes

playerid                 int64
gameid                 float64
platform                object
developers              object
genres                  object
supported_languages     object
unique_achievements    float64
days_played            float64
dtype: object

In [10]:
# Treat both identifier columns as categorical rather than numeric.
for id_column in ("playerid", "gameid"):
    df[id_column] = df[id_column].astype("category")

In [11]:
# Flag every existing row with purchased = 1; rows with 0 are only introduced
# later, by negative sampling.
df['purchased'] = 1

In [13]:
# Keep only the two id columns and the target flag for the model input.
df_fm = df[['playerid', 'gameid', 'purchased']]

In [14]:
# Drop the wide frame and run a garbage-collection pass to release its memory.
# NOTE: the name `df` is undefined after this cell until it is assigned again.
del df
gc.collect()

11343

### Generate negative samples
Factorization machines (FMs) need negative samples in the training data to learn properly. The initial preprocessing steps kept only observed interactions, so negative samples are generated here.

In [15]:
# Distinct game ids present in the interaction data: the candidate pool for negative sampling.
all_games = df_fm["gameid"].unique()

def generate_negative_samples(df, num_negatives=5, all_games=None, seed=None):
    """Sample, for each player, games that player has no recorded interaction with.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction rows; must contain ``playerid`` and ``gameid`` columns.
    num_negatives : int, default 5
        Maximum number of negatives to draw per player. A player with fewer
        unplayed games than this gets as many as are available.
    all_games : iterable, optional
        Candidate pool of game ids. Defaults to the distinct ``gameid``
        values found in ``df``.
    seed : int or numpy.random.Generator, optional
        Seed for the sampler; pass a fixed value for reproducible output.

    Returns
    -------
    pandas.DataFrame
        Columns ``playerid``, ``gameid`` and ``label``; ``label`` is always 0.
    """
    rng = np.random.default_rng(seed)

    if all_games is None:
        all_games = df["gameid"].unique()
    # Missing ids are not real games, so keep them out of the candidate pool.
    candidate_games = list(pd.Series(list(all_games)).dropna().unique())

    # One pass over the rows collects each player's games, instead of
    # re-filtering the whole frame once per player. Dict insertion order keeps
    # players in order of first appearance.
    played_by_player = {}
    for player, game in zip(df["playerid"], df["gameid"]):
        played_by_player.setdefault(player, set()).add(game)

    negative_samples = []
    for player, played_games in tqdm(played_by_player.items(), desc="Generating negative samples"):
        unplayed_games = [game for game in candidate_games if game not in played_games]
        sample_size = min(num_negatives, len(unplayed_games))
        if sample_size <= 0:
            continue  # nothing left to sample for this player

        # Randomly select games the player has NOT played
        sampled_games = rng.choice(unplayed_games, size=sample_size, replace=False)
        negative_samples.extend([player, game, 0] for game in sampled_games)  # 0 = Not played

    return pd.DataFrame(negative_samples, columns=["playerid", "gameid", "label"])

In [None]:
# Generate negative samples
neg_samples = generate_negative_samples(df_fm)

# Add positive samples with label = 1
df_final = pd.concat(df_fm, neg_samples)

df_final.head()

In [None]:
# Create binary target: played/purchased = 1, else = 0
df["LABEL"] = (df["PLAYTIME"] > 0) | (df["PURCHASED"] == 1)
df["LABEL"] = df["LABEL"].astype(int)

# Encode USER_ID and GAME_ID as numerical indices
df["USER_IDX"] = df["USER_ID"].astype("category").cat.codes
df["GAME_IDX"] = df["GAME_ID"].astype("category").cat.codes

# Selecting only necessary columns
df = df[["LABEL", "USER_IDX", "GAME_IDX"]]
df.head()