In [22]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
from io import StringIO
import ast
import gc
from tqdm import tqdm
from sklearn.model_selection import train_test_split


In [2]:
# SageMaker session, its default bucket, and the notebook's execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Build one boto3 session and reuse it for the region lookup and both clients,
# instead of constructing a separate session for each of the three calls.
boto_session = boto3.Session()
region = boto_session.region_name

sm = boto_session.client(service_name="sagemaker", region_name=region)
s3 = boto_session.client(service_name="s3", region_name=region)

In [3]:
# S3 bucket and object key of the source interactions CSV read in the next cell.
bucket_name = 'ads-508-final'
file_key = "transformeddata/full_ps_data.csv"

In [4]:
# Fetch the object from S3, decode its whole body as UTF-8 text and parse that
# text from an in-memory buffer. The raw text and the resulting DataFrame are
# both held in memory at the same time.
response = s3.get_object(Bucket=bucket_name, Key=file_key)
csv_content = response['Body'].read().decode('utf-8')
csv_file = StringIO(csv_content)
df = pd.read_csv(csv_file)

In [5]:
df.head()

Unnamed: 0,playerid,gameid,platform,developers,genres,supported_languages,unique_achievements,days_played
0,268071,14972.0,PS4,['arkane studios'],['Action'],"['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0
1,268071,417905.0,PS5,['Sumo Digital'],['Platformer'],"['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0
2,268071,14693.0,PS4,['square enix montreal'],['RPG'],"['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0
3,268071,20162.0,PS4,['Ubisoft Montreal Studio'],['RPG'],"['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0
4,268071,461983.0,PS5,['Ubisoft Annecy'],"['Sports', 'Cycling', 'Skiing', 'Snowboarding']","['Japanese', 'French', 'Spanish', 'German', 'I...",0.0,0.0


In [6]:
df.dtypes

playerid                 int64
gameid                 float64
platform                object
developers              object
genres                  object
supported_languages     object
unique_achievements    float64
days_played            float64
dtype: object

In [7]:
# Treat both identifiers as categorical labels rather than numeric quantities.
df["playerid"] = df["playerid"].astype("category")
# gameid is read as float64 (see the df.dtypes output above); cast to int first
# so the category labels are whole numbers. astype("int") raises if any gameid
# is missing.
df["gameid"] = df["gameid"].astype("int")
df["gameid"] = df["gameid"].astype("category")

In [8]:
# Mark every row loaded so far as a positive example (purchased = 1); rows with
# a 0 target are produced later by generate_negative_samples.
df['purchased'] = 1

In [9]:
# Keep only the player id, game id and target columns for the interaction dataset.
df_fm = df[['playerid', 'gameid', 'purchased']]

In [10]:
# Drop the wide frame and force a garbage-collection pass; df_fm already holds
# the columns selected from it.
del df
gc.collect()

46203

### Generate negative samples
FMs require negative samples to be included in the dataset for proper training. These interactions were not included in the initial preprocessing steps, so they are added here.

In [11]:
# Distinct game ids across all interactions. generate_negative_samples reads
# this as a module-level global, so it must be defined before that function runs.
all_games = df_fm["gameid"].unique()

def generate_negative_samples(df, num_negatives=5, random_state=None):
    """Sample, for every player, games that do not appear in that player's rows.

    Args:
        df: DataFrame with ``playerid`` and ``gameid`` columns, one row per
            observed player/game interaction.
        num_negatives: Maximum number of unplayed games drawn per player.
            Fewer are drawn when the player has fewer unplayed games.
        random_state: Optional integer seed. When given, sampling uses a
            dedicated ``np.random.RandomState`` so the result is reproducible.
            When ``None`` (the default) the global NumPy random state is used,
            exactly as before.

    Returns:
        DataFrame with columns ``playerid``, ``gameid`` and ``label``; ``label``
        is 0 on every row.

    Note:
        The candidate pool is the module-level ``all_games``, which must be
        defined before this function is called.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # The catalogue is the same for every player, so build the set once instead
    # of once per loop iteration.
    catalogue = set(all_games)

    # Collect each player's games in a single pass over the interactions. The
    # previous version applied a boolean mask to the whole frame for every
    # player, which scales with players x rows. Dict insertion order preserves
    # first-appearance order, i.e. the order of ``df["playerid"].unique()``.
    played_by_player = {}
    for player, game in zip(df["playerid"], df["gameid"]):
        played_by_player.setdefault(player, set()).add(game)

    negative_samples = []

    for player, played_games in tqdm(played_by_player.items(), desc="Generating negative samples"):
        unplayed_games = list(catalogue - played_games)

        # Randomly select games the player has NOT played
        sampled_games = rng.choice(unplayed_games, min(num_negatives, len(unplayed_games)), replace=False)

        for game in sampled_games:
            negative_samples.append([player, game, 0])  # 0 = Not played

    return pd.DataFrame(negative_samples, columns=["playerid", "gameid", "label"])

In [12]:
# Generate negative samples for every player in df_fm, using the function's
# default of at most 5 unplayed games per player.
neg_samples = generate_negative_samples(df_fm)

Generating negative samples: 100%|██████████| 46569/46569 [10:51<00:00, 71.52it/s]


In [20]:
# generate_negative_samples names its target column "label"; rename it to match
# the "purchased" column of df_fm so the two frames line up when concatenated.
neg_samples.rename(columns={"label": "purchased"}, inplace = True)

In [22]:
# Stack the positives (df_fm) on top of the sampled negatives. ignore_index=True
# gives the combined frame a fresh 0..n-1 index; without it the negatives keep
# their own 0-based labels, which duplicate index labels of the positives.
df_final = pd.concat([df_fm, neg_samples], ignore_index=True)

Unnamed: 0,playerid,gameid,purchased
0,268071,14972.0,1
1,268071,417905.0,1
2,268071,14693.0,1
3,268071,20162.0,1
4,268071,461983.0,1


In [24]:
# Normalise gameid to a plain integer column; the preview above shows it
# rendered as a float (e.g. 14972.0) after the concat.
df_final["gameid"] = df_final["gameid"].astype("int")

In [26]:
df_final.sample(5)

Unnamed: 0,playerid,gameid,purchased
8623594,3499445,5648,1
5628943,2968440,465487,1
5663913,4482757,369738,1
1141011,327101,5505,1
6916913,328895,12317,1


#### Save progress...

In [27]:
# Serialise df_final to CSV in memory, without the index column, and upload it
# as a checkpoint that the "Load in saved csv" section reads back.
csv_buffer = StringIO()
df_final.to_csv(csv_buffer, index=False)

# Use the bucket_name constant defined earlier in the notebook rather than
# repeating the bucket literal.
s3.put_object(Bucket=bucket_name, Key='transformeddata/sm_fm.csv', Body=csv_buffer.getvalue())

print("Object saved!")

Object saved!


### Load in saved csv

In [3]:
# S3 bucket and object key of the checkpoint CSV (positives plus sampled
# negatives) written by the "Save progress" cell.
bucket_name = 'ads-508-final'
file_key = "transformeddata/sm_fm.csv"

In [4]:
# Fetch the checkpoint object from S3, decode its body as UTF-8 text and parse
# it from an in-memory buffer. This rebinds df, which was deleted earlier.
response = s3.get_object(Bucket=bucket_name, Key=file_key)
csv_content = response['Body'].read().decode('utf-8')
csv_file = StringIO(csv_content)
df = pd.read_csv(csv_file)

In [8]:
df.head()

Unnamed: 0,playerid,gameid,purchased
0,268071,14972,1
1,268071,417905,1
2,268071,14693,1
3,268071,20162,1
4,268071,461983,1


#### Prepare data for further processing
Change the data types, then convert to the LibSVM format needed for Factorization Machines on SageMaker, such that features are represented as {feature_index}:{value} pairs.

In [9]:
# Make sure the target is stored as an integer column.
df["purchased"] = df["purchased"].astype(int)

# Replace raw ids with integer category codes. The codes are derived from the
# ids present in this frame, so they are only meaningful relative to it.
df["user_idx"] = df["playerid"].astype("category").cat.codes
df["game_idx"] = df["gameid"].astype("category").cat.codes

In [14]:
# Narrow to the encoded indices plus the target. Note this rebinds df_fm, which
# earlier in the notebook held the raw playerid/gameid columns.
df_fm = df[["user_idx", "game_idx", "purchased"]]

#### Function to convert data to LibSVM

In [15]:
def convert_to_libsvm(df, feature_columns, label_column):
    """Serialise the rows of ``df`` as LIBSVM-style text lines.

    Each row becomes ``<label> 1:<v1> 2:<v2> ...`` where ``<label>`` is the
    integer value of ``label_column`` and ``<vi>`` is the raw value of the i-th
    entry of ``feature_columns``. Feature indices are the 1-based positions in
    ``feature_columns``; they are not derived from the data.

    NOTE(review): with id-code columns such as ``user_idx``/``game_idx`` this
    writes the code itself as a numeric feature value, not a one-hot indicator
    at an id-specific feature index. Confirm this is what the downstream model
    expects, and confirm the input formats the SageMaker Factorization Machines
    algorithm accepts.

    Args:
        df: DataFrame holding the label and feature columns.
        feature_columns: Column labels to emit, in output order.
        label_column: Column label of the target; each value is passed to
            ``int()``.

    Returns:
        A single string with one line per row, joined by newlines and without
        a trailing newline. An empty frame yields an empty string.
    """
    print(f"Converting {len(df)} rows to LIBSVM format...")

    # Read columns by label instead of ``itertuples`` + ``getattr``: namedtuple
    # fields are renamed when a column label is not a valid identifier (spaces,
    # leading underscore, keywords), which made ``getattr`` raise AttributeError.
    labels = df[label_column]
    feature_series = [df[col] for col in feature_columns]

    libsvm_lines = []
    for idx, (label, *values) in enumerate(zip(labels, *feature_series)):
        features = " ".join(f"{i}:{value}" for i, value in enumerate(values, 1))
        libsvm_line = f"{int(label)} {features}"

        # Periodic progress line with a sample of the output.
        if idx % 100000 == 0:
            print(f"Processed {idx} rows... Sample: {libsvm_line}")

        libsvm_lines.append(libsvm_line)

    return "\n".join(libsvm_lines)

In [23]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable across runs.
train_df, test_df = train_test_split(df_fm, test_size=0.2, random_state=42)

In [None]:
# Features are emitted in list order, so user_idx becomes feature 1 and
# game_idx becomes feature 2 in every output line.
feature_cols = ["user_idx", "game_idx"]

# Serialise each split to one in-memory string.
train_libsvm = convert_to_libsvm(train_df, feature_cols, "purchased")
test_libsvm = convert_to_libsvm(test_df, feature_cols, "purchased")

In [27]:
# Upload both serialised splits. Use the bucket_name constant defined earlier in
# the notebook rather than repeating the bucket literal.
s3.put_object(Bucket=bucket_name, Key="transformeddata/sm_fm_train.libsvm", Body=train_libsvm)
s3.put_object(Bucket=bucket_name, Key="transformeddata/sm_fm_test.libsvm", Body=test_libsvm)

{'ResponseMetadata': {'RequestId': 'S1AHJF157NPGEQ7C',
  'HostId': 'sIwg8n0vOsK99Qlaa1/TMVkgxXx/H2l8JcitucQVXJva1IpSEG9nVcJ9qamyRuqx3sWeyzWRnUA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'sIwg8n0vOsK99Qlaa1/TMVkgxXx/H2l8JcitucQVXJva1IpSEG9nVcJ9qamyRuqx3sWeyzWRnUA=',
   'x-amz-request-id': 'S1AHJF157NPGEQ7C',
   'date': 'Tue, 01 Apr 2025 02:38:46 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"9ed2f11894c3c000a36044c33a6d2161"',
   'x-amz-checksum-crc32': '7GJ8sw==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"9ed2f11894c3c000a36044c33a6d2161"',
 'ChecksumCRC32': '7GJ8sw==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}