In [1]:
import sagemaker
import boto3
import pandas as pd
from io import StringIO
import ast

In [2]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)
s3 = boto3.Session().client(service_name="s3", region_name=region)

### Get library for every user

Set the user_id and gameid for all users

In [3]:
def load_data(file_key):    
    bucket_name = 'ads-508-final'
    file_key = f"playstation/{file_key}/{file_key}.csv"
    
    response = s3.get_object(Bucket=bucket_name, Key=file_key)
    
    csv_content = response['Body'].read().decode('utf-8')
    
    csv_file = StringIO(csv_content)
    
    df = pd.read_csv(csv_file)

    return df

In [4]:
df_purch = load_data("purchased_games")

In [5]:
def conv_str_exp(df, col):
    """Convert columns type if needed then explode"""
    
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df_exploded = df.explode(col, ignore_index=True)
    return df_exploded

In [6]:
df_player = conv_str_exp(df_purch, 'library')

### Add game meta data
Add additional metadata about the game for training

In [7]:
df_games = load_data("games")

In [8]:
df_games = df_games.drop(columns = ['release_date', 'title', 'publishers'])

In [9]:
df_games_meta = pd.merge(df_player, df_games, left_on='library', right_on='gameid', how='left')

In [10]:
# df_games_meta['gameid'] = df_games_meta['gameid'].astype('Int64')

In [11]:
df_games_meta['player_game_id'] = df_games_meta['playerid'].astype('str') + "_" + df_games_meta['library'].astype('str')

#### Keep only records with game metadata available

In [12]:
df_games_meta[df_games_meta['gameid'].isna()].sample(5)

Unnamed: 0,playerid,library,gameid,platform,developers,genres,supported_languages,player_game_id
6128458,3014524,577777,,,,,,3014524_577777
6296009,301510,442072,,,,,,301510_442072
7781040,2698601,3906,,,,,,2698601_3906
4521547,1307766,717,,,,,,1307766_717
6644088,369475,4994,,,,,,369475_4994


In [13]:
df_games_meta.dropna(subset=['gameid'], inplace=True)

### Derive interaction characteristics from achievements data

### Acheivement features to extract
1. Time between first and last trophy pergame per user and use an an approximate for game engagement
2. Number of trophies per user per game
3. Define user_game to join back to main dataset

We will ignore trophy metadata for now and focus

In [14]:
df_hist = load_data("history")

In [15]:
df_hist['game'] = df_hist['achievementid'].str.split('_').str[0]

In [16]:
df_hist['date_acquired'] = pd.to_datetime(df_hist['date_acquired'])

In [17]:
df_hist_group = df_hist.groupby(['playerid', 'game']).agg({
    'date_acquired': ['min', 'max'],
    'achievementid': 'nunique'
}).reset_index()

In [18]:
df_hist_group.columns = ['playerid', 'game', 'date_acquired_min', 'date_acquired_max', 'unique_achievements']

In [19]:
df_hist_group['days_played'] = (df_hist_group['date_acquired_max'] - df_hist_group['date_acquired_min']).dt.days + 1

In [20]:
df_hist_group.head()

Unnamed: 0,playerid,game,date_acquired_min,date_acquired_max,unique_achievements,days_played
0,77,1,2011-08-31 10:15:59,2011-09-06 09:26:33,25,6
1,77,10020,2015-09-02 12:00:08,2017-05-10 05:33:15,26,616
2,77,10246,2016-04-05 02:32:19,2016-04-05 02:32:19,1,1
3,77,104,2010-03-05 03:56:03,2010-06-26 01:05:25,47,113
4,77,10454,2016-06-26 13:59:52,2016-06-26 13:59:52,1,1


In [21]:
df_hist_group['player_game_id'] = df_hist_group['playerid'].astype('str') + "_" + df_hist_group['game'].astype('str')

#### Subset features we will use for modeling and add to primary dataframe

In [22]:
player_game_int = df_hist_group[['player_game_id', 'unique_achievements', 'days_played']]

In [23]:
player_game_int.head()

Unnamed: 0,player_game_id,unique_achievements,days_played
0,77_1,25,6
1,77_10020,26,616
2,77_10246,1,1
3,77_104,47,113
4,77_10454,1,1


In [24]:
df_games_meta_int = pd.merge(df_games_meta, player_game_int, left_on='player_game_id', right_on='player_game_id', how='left')

In [25]:
df_games_meta_int_f = df_games_meta_int[['playerid', 'gameid', 'platform', 'developers', 'genres', 'supported_languages', 'unique_achievements', 'days_played']]

In [26]:
df_games_meta_int_f['unique_achievements'].fillna(0, inplace=True)
df_games_meta_int_f['days_played'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_games_meta_int_f['unique_achievements'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_games_meta_int_f['unique_achievements'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=Tr

In [4]:
df_games_meta_int_f.head()

### Get complete dataset to s3, then split as needed

In [5]:
from sklearn.model_selection import train_test_split

# Splitting the data
train_df, test_df = train_test_split(df_games_meta_int_f, test_size=0.2, random_state=42)

# Upload S3 Func
def upload_to_s3(df, bucket_name, file_key):
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)
    s3.put_object(Bucket=bucket_name, Key=file_key, Body=csv_buffer.getvalue())

#Keys and Uploading to S3
train_file_key = 'playstation/train_data.csv'
test_file_key = 'playstation/test_data.csv'

upload_to_s3(train_df, bucket, train_file_key)
upload_to_s3(test_df, bucket, test_file_key)

# Paths
print(f"Train dataset uploaded to: s3://{bucket}/{train_file_key}")
print(f"Test dataset uploaded to: s3://{bucket}/{test_file_key}")