In [1]:
# import standard stuff

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gzip         # reading compressed files.
import ast          # evaluating literal expressions

# Adjusting some visuals

# Notebook-wide pandas display settings, applied in a single pass.
display_options = {
    'display.max_colwidth': 200,   # maximum width of a column to display
    'display.max_rows': 6,         # maximum number of rows to display
    'display.max_columns': None,   # None means unlimited columns
}

for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [2]:
# First tried to open the gz file with the preceding procedure. It was impossible due to failed encoding.
# Introducing gzip and ast solved the problem.

# Both user_reviews and users_items are in the same situation, so I put the open procedure inside a function:

def open_jsongz(path):
    """Read a gzip-compressed text file of Python literals into a DataFrame.

    Each non-blank line of the file is parsed with ``ast.literal_eval`` (which
    only accepts literals, so no code in the file is executed) and becomes one
    row of the result.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file; it is decoded as UTF-8 text.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, built with ``pd.DataFrame(rows)``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank line is not a
        valid Python literal.
    """
    rows = []

    with gzip.open(path, 'rt', encoding='utf-8') as file:
        # Iterate the handle directly: readlines() would hold the whole
        # decompressed file in memory before the first line is parsed.
        for line in file:
            record = line.strip()
            if not record:
                # A blank line (e.g. a trailing newline) is not a literal and
                # would make literal_eval raise SyntaxError.
                continue
            rows.append(ast.literal_eval(record))

    return pd.DataFrame(rows)

# Load the users/items file into a DataFrame (one row per parsed line).
users_items = open_jsongz(path='../data/users_items.json.gz')

In [4]:
# Preview only: the result is displayed but not assigned, so `users_items`
# itself keeps every column.
users_items.drop(columns=['items_count', 'steam_id', 'user_url'])

Unnamed: 0,user_id,items
0,76561197970982479,"[{'item_id': '10', 'item_name': 'Counter-Strike', 'playtime_forever': 6, 'playtime_2weeks': 0}, {'item_id': '20', 'item_name': 'Team Fortress Classic', 'playtime_forever': 0, 'playtime_2weeks': 0}..."
1,js41637,"[{'item_id': '10', 'item_name': 'Counter-Strike', 'playtime_forever': 0, 'playtime_2weeks': 0}, {'item_id': '80', 'item_name': 'Counter-Strike: Condition Zero', 'playtime_forever': 0, 'playtime_2w..."
2,evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchestra: Ostfront 41-45', 'playtime_forever': 923, 'playtime_2weeks': 0}, {'item_id': '1230', 'item_name': 'Mare Nostrum', 'playtime_forever': 0, 'playtime..."
...,...,...
88307,XxLaughingJackClown77xX,[]
88308,76561198329548331,"[{'item_id': '304930', 'item_name': 'Unturned', 'playtime_forever': 677, 'playtime_2weeks': 677}, {'item_id': '227940', 'item_name': 'Heroes & Generals', 'playtime_forever': 43, 'playtime_2weeks':..."
88309,edward_tremethick,[]


In [4]:
# Accumulator filled by get_data(); reset here so re-running the cell does not
# append the same rows twice.
unpacked_list = []

# The first 4 names are columns of `users_items`; the remaining 4 are keys of
# each dictionary stored in its `items` column.
columns_names = ['user_id', 'items_count', 'steam_id', 'user_url',
                 'item_id', 'item_name', 'playtime_forever', 'playtime_2weeks']


def get_data(row):
    """Append one flat record per entry of ``row['items']`` to ``unpacked_list``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one row of ``users_items``)
        Must provide the first four names of ``columns_names`` and an
        ``'items'`` iterable of dictionaries holding the last four.

    Returns
    -------
    None
        The function works by side effect on the module-level list. A row
        whose ``items`` is empty contributes no records.
    """
    user_fields = [row[name] for name in columns_names[:4]]
    for item in row['items']:
        unpacked_list.append(user_fields + [item[name] for name in columns_names[4:]])


# Explicit loop instead of DataFrame.apply: get_data works purely by side
# effect, and older pandas releases documented that apply may call the function
# twice on the first row, which would duplicate that user's records.
for _, user_row in users_items.iterrows():
    get_data(user_row)

unpacked = pd.DataFrame(unpacked_list, columns=columns_names)


In [5]:
# NOTE(review): this cell rebinds `unpacked_list`, `columns_names`, `get_data`
# and `unpacked`, all of which the previous cell also defines; whichever cell
# ran last decides what `unpacked` holds.
unpacked_list = []

# The first name is a column of `users_items`; the remaining 3 are keys of each
# dictionary stored in its `items` column.
columns_names = ['user_id', 'item_id', 'item_name', 'playtime_forever']


def get_data(row):
    """Append one flat record per entry of ``row['items']`` to ``unpacked_list``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one row of ``users_items``)
        Must provide ``columns_names[0]`` and an ``'items'`` iterable of
        dictionaries holding the other three names.

    Returns
    -------
    None
        The function works by side effect on the module-level list. A row
        whose ``items`` is empty contributes no records.
    """
    user_fields = [row[columns_names[0]]]
    for item in row['items']:
        unpacked_list.append(user_fields + [item[name] for name in columns_names[1:]])


# Explicit loop instead of DataFrame.apply: get_data works purely by side
# effect, and older pandas releases documented that apply may call the function
# twice on the first row, which would duplicate that user's records.
for _, user_row in users_items.iterrows():
    get_data(user_row)

unpacked = pd.DataFrame(unpacked_list, columns=columns_names)

In [6]:
# Display the unpacked frame; the display.max_rows option set at the top of the
# notebook limits how many rows are rendered.
unpacked

Unnamed: 0,user_id,item_id,item_name,playtime_forever
0,76561197970982479,10,Counter-Strike,6
1,76561197970982479,20,Team Fortress Classic,0
2,76561197970982479,30,Day of Defeat,7
...,...,...,...,...
5153206,76561198329548331,388490,One Way To Die: Steam Edition,3
5153207,76561198329548331,521570,You Have 10 Seconds 2,4
5153208,76561198329548331,519140,Minds Eyes,3


In [8]:
# Narrow `unpacked` to three columns; the selection is a new frame.
light_unpacked = unpacked.loc[:, ['user_id', 'item_id', 'playtime_forever']]

In [23]:
# Write the three-column frame to CSV without the index column.
light_unpacked.to_csv(path_or_buf='../light_data/function2.csv', index=False)

In [9]:
# Print the index range, column dtypes and memory usage of the three-column frame.
light_unpacked.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5153209 entries, 0 to 5153208
Data columns (total 3 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_id           object
 1   item_id           object
 2   playtime_forever  int64 
dtypes: int64(1), object(2)
memory usage: 117.9+ MB


In [6]:
# Count missing values per column of `unpacked`.
# NOTE(review): the recorded output below reports 8 columns, so it was produced
# while `unpacked` was the 8-column frame, not the 4-column frame that the later
# unpacking cell rebinds it to. Restart the kernel and run all cells to confirm
# which version the rest of the notebook actually uses.
unpacked.isna().sum()

user_id             0
items_count         0
steam_id            0
                   ..
item_name           0
playtime_forever    0
playtime_2weeks     0
Length: 8, dtype: int64

In [7]:
# Total 'playtime_forever' per 'item_id'; as_index=False keeps 'item_id' as a
# regular column, so the result is a two-column frame with a fresh integer index.
item_id_playtime = unpacked.groupby('item_id', as_index=False)['playtime_forever'].sum()

In [8]:
# Display the summed playtime_forever per item_id.
item_id_playtime

Unnamed: 0,item_id,playtime_forever
0,10,17386015
1,100,311999
2,10000,63046
...,...,...
10975,99900,17259605
10976,99910,426680
10977,99920,2344


In [9]:
# Accumulator filled by convert_minutes_to_hours_and_minutes(); reset here so
# re-running the cell does not append the same rows twice.
data = []


def convert_minutes_to_hours_and_minutes(row):
    """Append ``[item_id, hours, minutes]`` for ``row`` to ``data``.

    ``row['playtime_forever']`` is split with integer division by 60 into a
    quotient (``hours``) and a remainder (``minutes``).

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one row of ``item_id_playtime``)
        Must provide ``'item_id'`` and ``'playtime_forever'``.

    Returns
    -------
    None
        The function works by side effect on the module-level list.
    """
    hours, remaining_minutes = divmod(row['playtime_forever'], 60)
    data.append([row['item_id'], hours, remaining_minutes])


# Explicit loop instead of DataFrame.apply: the function works purely by side
# effect, and older pandas releases documented that apply may call it twice on
# the first row, which would duplicate that item's record.
for _, playtime_row in item_id_playtime.iterrows():
    convert_minutes_to_hours_and_minutes(playtime_row)

hors_and_minutes_by_Item_id = pd.DataFrame(data, columns=['item_id', 'hours', 'minutes'])
hors_and_minutes_by_Item_id

Unnamed: 0,item_id,hours,minutes
0,10,289766,55
1,100,5199,59
2,10000,1050,46
...,...,...,...
10975,99900,287660,5
10976,99910,7111,20
10977,99920,39,4


In [10]:
# Write the per-item hours/minutes table to CSV without the index column.
hors_and_minutes_by_Item_id.to_csv(path_or_buf='../light_data/item_id&playtime.csv', index=False)