# Dataset Preliminary Transformations

## ... from Raw to Complete and Definitive

### Import Raw Dataset

In [31]:
import json

# Carica il dataset JSON raw
with open("./../dataset/raw_games_dataset.json", "r") as file:
    json_data_raw = json.load(file)

### API request with wrapper

In [2]:
import os
from dotenv import load_dotenv
from igdb.wrapper import IGDBWrapper

# Carica le variabili d'ambiente
load_dotenv("./.env")

wrapper = IGDBWrapper(
    os.getenv("LSD_CLIENT_ID").strip(), os.getenv("LSD_ACCESS_TOKEN").strip()
)

### Raw to Definitive dataset conversion

In [32]:
# Conversione dataset RAW in dataset definitivo
import json


def transform_json_structure(input_file):
    try:
        # Trasformare il dizionario in una lista di oggetti
        transformed_data = [value for key, value in input_file.items()]
        return transformed_data

    except Exception as e:
        print(f"Si è verificato un errore: {e}")

# Chiamare la funzione per trasformare la struttura del JSON
json_data = transform_json_structure(json_data_raw)

# Salvare il nuovo JSON
#with open("./../dataset/games_dataset_definitive.json", 'w', encoding='utf-8') as file:
#    json.dump(json_data, file, indent=4)

### Companies retieval

1. Get all of the companies (with the list of the games they developed) from the IGDB API.
2. For each company, for each game it developed, access the dataset and add company name and company id as nested object under the key "companies" (a game can have more companies)

In [5]:
def get_company_for_games(offset):
    """
    Get companies from IGDB API with a certain offset
    """
    try:
        result = wrapper.api_request(
            "companies",
            f"fields id, name, developed; limit 500; sort id asc; offset {offset};",
        ).decode("utf-8")
    except Exception as e:
        print("Errore nella richiesta API:", e)

    return json.loads(result)


group_size = 500
game_ids = [elem["id"] for elem in json_data]
companies = {}
offset = 0
while True:
    companies_buff = get_company_for_games(offset)
    for elem in companies_buff:
        companies[elem["id"]] = elem
    if len(companies_buff) < group_size:
        break
    offset += 500

# salvataggio companies su file
# with open("../dataset/companies.json", "w") as f:
#     f.write(json.dumps(companies, indent=4))

In [33]:
# eventually retrieve companies from file
with open("../dataset/companies.json", "r") as f:
    companies = json.loads(f.read())

In [34]:
"""
For each company, we iterate through the "developed" array, which represent the games developed by that company.
We then add a new key "company" to the original dataset's game objects and assign it a list of {company_id, company_name} dictionaries.
"""
for company in companies.values():
    company_obj = {"company_id": company["id"], "company_name": company["name"]}
    if "developed" not in company:
        continue
    for developed_game in company["developed"]:
        stringified_game_id = str(developed_game)
        if stringified_game_id not in json_data_raw:
            continue
        if "companies" in json_data_raw[stringified_game_id]:
            json_data_raw[stringified_game_id]["companies"].append(company_obj)
        else:
            json_data_raw[stringified_game_id]["companies"] = [company_obj]

# salvataggio dataset con companies su file
#with open("./../dataset/raw_dataset_with_companies.json", "w") as f:
#    f.write(json.dumps(json_data_raw, indent=4))

### Languages rertieval

In [7]:
def get_language_support(offset):
    """
    Get language supports from IGDB API with a certain offset
    """
    try:
        result = wrapper.api_request(
            'language_supports', 
            f'fields id, language; limit 500; sort id asc; offset {offset};'
        ).decode('utf-8')
    except Exception as e:
        print("Errore nella richiesta API:", e)

    return json.loads(result)

def get_languages(offset):
    """
    Get languages from IGDB API with a certain offset
    """
    try:
        result = wrapper.api_request(
            'languages', 
            f'fields id, name; limit 500; sort id asc; offset {offset};'
        ).decode('utf-8')
    except Exception as e:
        print("Errore nella richiesta API:", e)

    return json.loads(result)

In [9]:
group_size = 500
languages_mapping = {}
offset = 0
while True:
    buffer = get_languages(offset)
    for elem in buffer: languages_mapping[elem["id"]] = elem["name"]
    if len(buffer) < group_size: break
    offset += 500

print(f"Fetched {len(languages_mapping)} languages")

NameError: name 'get_languages' is not defined

In [9]:
import threading
import time
def fetch_language_data(start, group_size, lock):
    while True:
        t = time.time()
        buffer = get_language_support(start)
        elapsed = time.time() - t
        to_sleep = 0.650 - elapsed
        with lock:
            for elem in buffer:
                language_supports_mapping[elem["id"]] = languages_mapping[elem["language"]]
        
        if len(buffer) < group_size:
            break
        if to_sleep > 0: time.sleep(to_sleep)
        start += group_size * 3
        print(start)

group_size = 500
language_supports_mapping = {}
lock = threading.Lock()
threads = []
for i in range(3):
    thread = threading.Thread(target=fetch_language_data, args=(i * group_size, group_size, lock))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

# salvataggio language_supports_mapping su file
#with open("../dataset/language_supports_mapping.json", "w") as f:
#    f.write(json.dumps(language_supports_mapping, indent = 4))

2000
1500
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
1050011000

10000
12000
11500
12500
13500
14000
13000
14500
15000
15500
16000
17000
16500
17500
18500
18000
20000
19000
19500
21500
20500
21000
23000
22500
22000
24500
24000
23500
26000
25500
25000
27500
27000
26500
29000
28500
28000
30500
30000
29500
32000
31500
31000
33500
33000
32500
35000
34500
34000
36500
36000
35500
38000
37500
37000
39500
39000
38500
41000
40500
40000
42500
42000
41500
44000
43500
43000
45500
45000
47000
44500
46500
48500
46000
48000
47500
50000
49500
51500
49000
51000
50500
53000
52500
54500
52000
54000
53500
56000
55500
55000
57500
57000
56500
59000
58500
58000
60500
60000
59500
61500
62000
61000
63000
63500
62500
64500
65000
64000
66000
66500
65500
67500
68000
67000
69000
69500
68500
70500
71000
70000
72000
72500
71500
73500
74000
73000
75000
75500
74500
76500
76000
77000
78000
77500
78500
79500
80000
79000
81000
81500
80500
82500
83000
82000
84000
83500
84500
85500
85000
860

In [35]:
# eventually retrieve language_supports_mapping from file
with open("../dataset/language_supports_mapping.json", "r") as f:
    language_supports_mapping = json.loads(f.read())

In [36]:
print(f"Fetched {len(language_supports_mapping)} language mappings.")

Fetched 824025 language mappings.


In [37]:
i = 0
for game in json_data_raw:
    if "language_supports" in json_data_raw[game]:
        json_data_raw[game]["language_supports"] = list({language_supports_mapping[x] if x in language_supports_mapping else "English" for x in json_data_raw[game]["language_supports"]})
        #print(i)
        #i+=1

#with open("./../dataset/raw_dataset_with_companies_lang.json","w") as f:
#    f.write(json.dumps(json_data_raw, indent = 4))

### Age Ratings Retrieval

In [None]:
def get_age_rating(offset):
    """
    Get age ratings from IGDB API with a certain offset
    """
    try:
        result = wrapper.api_request(
            'age_ratings', 
            f'fields id, category, rating; limit 500; sort id asc; offset {offset};'
        ).decode('utf-8')
    except Exception as e:
        print("Errore nella richiesta API:", e)

    return json.loads(result)

In [None]:
group_size = 500
age_ratings_mapping = {}
offset = 0
while True:
    buffer = get_age_rating(offset)
    for elem in buffer: age_ratings_mapping[elem["id"]] = {"age_rating_category": elem["category"], "age_rating_rating": elem["rating"]}
    if len(buffer) < group_size: break
    offset += 500
    print(offset)

print(f"Fetched {len(age_ratings_mapping)} age_ratings")

# salvataggio age_ratings_mapping su file
#with open("../dataset/age_ratings_mapping.json", "w") as f:
#    f.write(json.dumps(age_ratings_mapping, indent = 4))

In [38]:
# eventually retrieve age_ratings_mapping from file
with open("../dataset/age_ratings_mapping.json", "r") as f:
    age_ratings_mapping = json.loads(f.read())

In [39]:
for game_id, game in json_data_raw.items():
    if "age_ratings" in game:
        json_data_raw[game_id]["age_ratings"] = [age_ratings_mapping[str(age_rating_id)] for age_rating_id in game["age_ratings"] if str(age_rating_id) in age_ratings_mapping]

#with open("./../dataset/raw_dataset_with_companies_lang_age_rating.json","w") as f:
#    f.write(json.dumps(json_data_raw, indent = 4))

### Legend Mapping 

Map categorical fields into the relative values parsed manually from IGDB.

In [41]:
with open("../dataset/legend.json", "r") as f:
    legend = json.load(f)
for game_id, game in json_data_raw.items():
    for legend_key, legend_mapping in legend.items():
        if legend_key in game:
            if isinstance(game[legend_key], list):
                json_data_raw[game_id][legend_key] = [
                    {
                        legend_key + "_id": str(id),
                        legend_key + "_value": legend_mapping[str(id)],
                    }
                    for id in game[legend_key]
                    if str(id) in legend_mapping
                ]
            else:
                json_data_raw[game_id][legend_key] = {
                    legend_key + "_id": str(game[legend_key]),
                    legend_key + "_value": legend_mapping[str(game[legend_key])],
                }
    if "age_ratings" in game:
        json_data_raw[game_id]["age_ratings"] = [
            {
                "age_rating_category": {
                    "age_rating_category_id": age_rating["age_rating_category"],
                    "age_rating_category_value": legend["age_rating_category"][str(age_rating["age_rating_category"])],
                },
                "age_rating_rating": {
                    "age_rating_rating_id": age_rating["age_rating_rating"],
                    "age_rating_rating_value": legend["age_rating_rating"][str(age_rating["age_rating_rating"])],
                }
            }
            for age_rating in game['age_ratings']
        ]

In [42]:
print(json.dumps(list(json_data_raw.values())[:25] , indent = 4))

[
    {
        "id": 231577,
        "age_ratings": [
            {
                "age_rating_category": {
                    "age_rating_category_id": 2,
                    "age_rating_category_value": "PEGI"
                },
                "age_rating_rating": {
                    "age_rating_rating_id": 4,
                    "age_rating_rating_value": "Sixteen"
                }
            },
            {
                "age_rating_category": {
                    "age_rating_category_id": 1,
                    "age_rating_category_value": "ESRB"
                },
                "age_rating_rating": {
                    "age_rating_rating_id": 10,
                    "age_rating_rating_value": "T"
                }
            },
            {
                "age_rating_category": {
                    "age_rating_category_id": 7,
                    "age_rating_category_value": "ACB"
                },
                "age_rating_rating": {
                    "ag

In [43]:
# salvataggio dataset finale su file
with open("./../dataset/raw_dataset_post_age_ratings.json","w") as f:
    f.write(json.dumps(json_data_raw, indent = 4))

### Posted Date Formatting

In [44]:
# Chiamare la funzione per trasformare la struttura del JSON
json_data = transform_json_structure(json_data_raw)

In [45]:
import re
from datetime import datetime

# Funzione per pulire e riformattare la data
def clean_and_format_date(date_str):
    # Rimuovi caratteri speciali e parole non necessarie
    cleaned_date = re.sub(r'[\t\n\r]|Steam Key', '', date_str).strip()

    # Tenta di convertire la data nel formato desiderato
    try:
        # Se la data ha solo mese e giorno, aggiungi l'anno corrente (2023)
        if len(cleaned_date.split(',')) == 1:
            cleaned_date += ', 2023'
        # Riformatta la data
        formatted_date = datetime.strptime(cleaned_date, '%d %B, %Y').strftime('%Y-%m-%d')
        return formatted_date
    except ValueError:
        # Se c'è un errore nella conversione, restituisci la stringa originale
        return date_str

# Elabora ogni elemento del dataset
for item in json_data:
    if 'reviews' in item and isinstance(item['reviews'], list):
        for review in item['reviews']:
            if 'postedDate' in review:
                review['postedDate'] = clean_and_format_date(review['postedDate'])

In [46]:
# Salvare il nuovo JSON
with open("./../dataset/games_dataset_definitive.json", 'w', encoding='utf-8') as file:
    json.dump(json_data, file, indent=4)

In [47]:
import json

# Carica il dataset JSON definitivo
with open('./../dataset/games_dataset_definitive.json', 'r') as file:
    json_data = json.load(file)

### Extract subset of fields from dataset

In [48]:
import json

# List of keys to keep in the filtered dataset
keys_to_keep = [
    "age_ratings",
    "category",
    "first_release_date",
    "game_modes",
    "genres",
    "id",
    "name",
    "companies",
    "language_supports",
    "platforms",
    "reviews",
    "status",
    "summary",
    "total_rating",
    "total_rating_count",
]


#def filter_dataset(dataset, keys):
#    # Filter the dataset
#    filtered_dataset = [{key: item.get(key) for key in keys} for item in dataset]
#    return filtered_dataset

def filter_dataset(dataset, keys):
    # Check for non-null values before filtering
    non_null_counts = {key: 0 for key in keys_to_keep}
    for item in dataset:
        for key in keys_to_keep:
            if key in item and item[key] is not None:
                non_null_counts[key] += 1

    # Print the non-null counts
    for key, count in non_null_counts.items():
        print(f'Key "{key}" has {count} non-null values.')

    # Filter the dataset
    filtered_dataset = [{key: item.get(key) for key in keys if key in item} for item in dataset]
    return filtered_dataset

# Invoke the filter function
json_data_filtered = filter_dataset(json_data, keys_to_keep)

Key "age_ratings" has 58661 non-null values.
Key "category" has 259290 non-null values.
Key "first_release_date" has 164967 non-null values.
Key "game_modes" has 139982 non-null values.
Key "genres" has 209590 non-null values.
Key "id" has 259290 non-null values.
Key "name" has 259290 non-null values.
Key "companies" has 99957 non-null values.
Key "language_supports" has 122872 non-null values.
Key "platforms" has 174749 non-null values.
Key "reviews" has 88016 non-null values.
Key "status" has 16300 non-null values.
Key "summary" has 220787 non-null values.
Key "total_rating" has 34899 non-null values.
Key "total_rating_count" has 34899 non-null values.


In [49]:

import json
# eventually save the filtered dataset to a file
with open("../dataset/games_dataset_filtered.json", 'w') as file:
    json.dump(json_data_filtered, file, indent=4)

### Mapping the clustered authors

In [50]:
import json
import pandas as pd
import random

# Load the mappings from the CSV
mapping_df = pd.read_csv('../dataset/Mapping-Authors.csv')
# Create a dictionary from the mappings DataFrame
author_mapping = dict(zip(mapping_df['original'], mapping_df['clustered']))

# Function to update the author names in the reviews
def update_author_names(dataset, author_mapping):
    for item in dataset:
        # Check if 'reviews' key exists and is a list
        if 'reviews' in item and isinstance(item['reviews'], list):
            for review in item['reviews']:
                # If the author of a review is in the mapping, update the name
                if 'author' in review and (review['author'] in author_mapping or review['author'] == ''):
                    review['author'] = random.choice(list(author_mapping.values())) if review["author"] == "" else str(author_mapping[review['author']])
    return dataset

# Update the author names
updated_dataset = update_author_names(json_data_filtered, author_mapping)

In [51]:
# eventually save the mapped dataset to a file
with open("../dataset/games_dataset_filtered_mapped.json", 'w', encoding='utf-8') as file:
    json.dump(updated_dataset, file, ensure_ascii=False, indent=4)

In [52]:
import json

# Carica il dataset JSON definitivo
with open('../dataset/games_dataset_filtered_mapped.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

## Preliminary analysis

### List of the fields

Before diving into the code, let's give a look to the fields and the type of data that each of them contains (The following code performs a kind of SELECT DISTINCT of the fields):

**NOTE!** 🟢
The field "*Example Content*" contains 5 random samples of the field content. These samples are not aggregated.

| **FIELD** 	| **TYPE** 	| **TO PARSE** 	| **Are we going to use it?** 	| **Example Content** 	|
|---	|:---:	|:---:	|:---:	|---	|
| age_ratings 	|  	| 🟠 	| ✅ 	| [[114607, 114609, 126972, 126973, 126974, 127011], [39395, 54394], [65532], [67799], [57121, 58326, 91135, 93494, 110841, 126135]] 	|
| aggregated_rating 	| float 	|  	| ❌ 	| [65.0, 73.25, 82.5, 69.0, 70.93333333333334] 	|
| aggregated_rating_count 	|  	|  	| ❌ 	| [3, 4, 3, 1, 17] 	|
| alternative_names 	|  	|  	| ❌ 	| [[36941, 36942, 127468], [65648, 105850, 117666], [120224], [97398], [106570, 116583]] 	|
| artworks 	|  	|  	| ❌ 	| [[96775], [14195], [9728], [115992], [11731, 68424]] 	|
| bundles 	|  	|  	| ❌ 	| [[263897], [230749, 230823, 230826], [251322], [29208], [136262, 164674, 164778, 219003, 230498, 230702]] 	|
| category 	|  	| 🟢 	| ✅ 	| [0, 0, 0, 0, 0] 	|
| checksum 	|  	|  	| ❌ 	| ['4e87d634-9146-2025-b09b-9e4ea1e04764', '00e38d21-91cd-3082-4737-7978c44e33e2', '40916ead-d6e4-99f3-ab54-d653605a984b', '489ad4d0-e2f2-09e0-3bae-a76091559c44', 'bc043ffb-ad60-6bd0-801b-55a6141e773c'] 	|
| collection 	|  	| 🟠 	|  	| [602, 2449, 7103, 7856, 559] 	|
| collections 	|  	| 🟠 	|  	| [[602], [18], [18], [39, 7197], [39]] 	|
| cover 	|  	|  	| ❌ 	| [280467, 247255, 272120, 192106, 326638] 	|
| created_at 	|  	|  	| ❌ 	| [1673878007, 1621112209, 1499434349, 1517405808, 1521818623] 	|
| dlcs 	| game_id 	|  	|  	| [[171610], [8219, 222679, 239352], [172092], [212363, 212364, 212365], [140056, 140057, 140058]] 	|
| expanded_games (?) 	| game_id 	|  	|  	| [[135286], [30251], [78807], [57658], [124492]] 	|
| expansions (?) 	| game_id 	|  	|  	| [[171428], [19540, 19551, 26852], [44596], [171147], [18442]] 	|
| external_games 	|  	| game_id of other platforms 	| ❌ 	| [[2639959, 2677882, 2677897, 2677919, 2678595], [2011978], [51807, 154169, 1867507], [221231, 1748471, 1991656], [1989881]] 	|
| first_release_date 	|  	|  	| ✅ 	| [1677110400, 1610928000, 1019001600, 1559952000, 1644796800] 	|
| follows 	|  	| followers of the game 	|  	| [1, 1, 1, 63, 244] 	|
| forks (?) 	|  	|  	|  	| [[79074, 109134, 133635, 147176, 151170, 176876, 197828, 246736, 255413], [235313], [141686], [67248], [142289]] 	|
| franchise 	|  	| 🟠 	|  	| [361, 9, 102, 299, 105] 	|
| franchises 	|  	| 🟠 	|  	| [[1651, 3134], [872, 874], [3390], [509, 977], [1351]] 	|
| game_engines 	|  	| 🟠 	| ❌ 	| [[1543], [427], [439], [72], [10]] 	|
| game_localizations 	|  	| 🟠 	| ❌ 	| [[2026, 13450], [4006, 19440], [14535], [1863, 3984], [11783, 20873]] 	|
| game_modes 	|  	| 🟢 	| ✅ 	| [[1, 2], [1, 2], [1], [1], [1]] 	|
| genres 	|  	| 🟢 	| ✅ 	| [[15], [4, 31, 33], [7, 13, 33], [9, 33], [31, 32]] 	|
| hypes 	|  	| number of followers before release date 	| ❌ 	| [1, 1, 26, 2, 2] 	|
| id 	| game_id 	| id_of_IGDB 	| ✅ 	| [231577, 147666, 44711, 85450, 95080] 	|
| companies 	|  	| 🟢 	| ✅ 	|   	|
| keywords 	|  	|  	| ❌ 	| [[1699, 2152, 4137, 4438, 5029, 5188, 5189, 10098, 13115], [101], [78, 394, 960, 1033, 2458], [30750, 33125], [1308, 3257]] 	|
| language_supports 	|  	| 🟠 	| ✅ 	| [[504844], [784416, 784417, 784418, 784419], [108775, 108776], [430843, 430844, 430845], [46076, 46078, 46080, 46082, 46084, 46086, 46088, 46090, 46092, 46094, 46096, 46098, 46100, 46102, 46104, 46106, 46108, 46110, 46112, 46114, 148503, 148505, 148507, 148513, 148515]] 	|
| multiplayer_modes 	|  	|  	| ❌ 	| [[11174, 11175], [18654], [20006], [7037, 7982, 12750, 21945], [22267]] 	|
| name 	|  	|  	| ✅ 	| ['Blood Bowl 3: Black Orcs Edition', 'Shinobi Blade', 'DDRMax2: Dance Dance Revolution', 'Transformers Prime: The Game', 'Dotra'] 	|
| parent_game 	| game_id 	|  	|  	| [110980, 46584, 198295, 231152, 107843] 	|
| platforms 	|  	| 🟢 	| ✅ 	| [[49, 169], [130], [8, 52], [34, 39], [6]] 	|
| player_perspectives 	|  	|  	| ❌ 	| [[6], [2, 4], [1, 7], [1], [2]] 	|
| ports 	|  	|  	| ❌ 	| [[73949, 144894], [10], [251353], [122871], [8809]] 	|
| rating 	|  	|  	| ❌ 	| [70.0, 78.83709512164174, 60.0, 40.0, 50.0] 	|
| rating_count 	|  	|  	| ❌ 	| [0, 102, 0, 0, 0] 	|
| release_dates 	|  	|  	| ❌ 	| [[455316, 455317], [249798, 340666], [201923, 201924, 201925], [511087], [345917, 387770, 387772, 387773, 389364, 406085]] 	|
| remakes 	|  	|  	| ❌ 	| [[18189], [222995], [93192], [1181], [245057]] 	|
| remasters 	|  	|  	| ❌ 	| [[265060], [145817, 158981], [11218], [203435], [267307]] 	|
| reviews.author 	|  	|  	| ✅ 	| ['maskedgingerjock', 'MisterEcho', 'Ziggy The Adventurer', 'Obey the Fist!', 'Sixth'] 	|
| reviews.postedDate 	|  	|  	| ✅ 	| ['18 September', '4 September', '3 September', '31 March, 2022\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tSteam Key', '21 September, 2022'] 	|
| reviews.recommended 	|  	|  	| ✅ 	| [False, True, None, False, True] 	|
| reviews.review 	|  	|  	| ✅ 	| ['This game is horrible. This game is boring.You can get both endings in under 10 minutes.There is no ...', 'https://youtu.be/dbbmoI0O5UgPretty nice game if you like indie walking simulator games with some jum...', "Plays fine, looks okay, very limited options, no body, can't jump or crouch, can be finished in abou...", 'Warning: Asset Flip!Pet Puzzle is a Unity Asset flip, what Valve calls a "fake game". 	|
| screenshots 	|  	|  	| ❌ 	| [[1002202, 1002203, 1002204, 1002205, 1002206, 1002207], [447897, 447898, 447899, 447900, 447901], [338676, 338677, 338678, 390639, 390640], [1128097, 1128099, 1128104, 1128107, 1128111], [311510, 311511, 311512, 311513, 311514]] 	|
| similar_games 	| game_id 	|  	|  	| [[17613, 24620, 36269, 36346, 65827, 76340, 77038, 77597, 111043, 112754], [10605, 56033, 87622, 96217, 101608, 103292, 103369, 111130, 114145, 117533], [3679, 18981, 27378, 38967, 43218, 43488, 43680, 63933, 73042, 76901], [18115, 19222, 25905, 41349, 85804, 87170, 87507, 90788, 90965, 95776], [13189, 25222, 27266, 55282, 75948, 87975, 96217, 106992, 111130, 121217]] 	|
| slug 	|  	|  	| ❌ 	| ['blood-bowl-3-black-orcs-edition', 'shinobi-blade', 'ddrmax2-dance-dance-revolution', 'transformers-prime-the-game', 'dotra'] 	|
| standalone_expansions 	|  	|  	| ❌ 	| [[146969], [3735], [27898], [203628], [128415]] 	|
| status 	|  	| 🟢 	| ✅ 	| [8, 4, 4, 4, 4] 	|
| steam 	|  	|  	| ❌ 	| ['2557720', '1092080', '1667954', '1851020', '2009730'] 	|
| storyline 	|  	|  	| ❌ 	| ['In the ancient, mysterious world of the Jade Empire, you train under your master’s watchful eye and ...', 'From the frigid climes of Norsca to the arid wastes of the southern Badlands, The Old World echoes t...', 'You live in a peaceful world that has triumphed over a giant alien invasion. To pay off your debts, ...', "You who are unqualified to drink Meng Po's soup, you have been chosen for the game of life. Three wo...", 'Following the end of the Galactic Civil War, the bright lights of The Arena burst into life and offe...'] 	|
| summary 	|  	|  	| ✅ 	| ["Fashion is not exactly a priority for Black Orcs… but intimidation, on the other hand, now we're tal...", 'Shinobi Blade is an action-packed game, lets you play the role of a teenage ninja who sneaked out of...', 'The dance floor kicks into overdrive with DDRMAX2 Dance Dance Revolution. It is the latest installme...', 'Shoot bubbles and match colors to pop your way up to victory in this bubble shooting adventure, win ...', 'A missing father, a mysterious murder, paintings depicting missing children. Horror game in which th...'] 	|
| tags 	|  	|  	| ❌ 	| [[268435471], [1, 268435460, 268435487, 268435489], [40, 268435463, 268435469, 268435489, 536872611, 536873064, 536875049, 536875350, 536875941, 536876100, 536876101, 536881010, 536884027], [268435465, 268435489], [268435487, 268435488]] 	|
| themes 	|  	|  	| ❌ 	| [[1], [40], [17, 22, 27], [1], [1]] 	|
| total_rating 	|  	|  	| ✅ 	| [65.0, 70.0, 73.25, 80.66854756082087, 60.0] 	|
| total_rating_count 	|  	|  	| ✅ 	| [3, 0, 4, 105, 0] 	|
| updated_at 	|  	|  	| ❌ 	| [1699266775, 1698219001, 1696215459, 1670993154, 1670992528] 	|
| url 	|  	|  	| ❌ 	| ['https://www.igdb.com/games/blood-bowl-3-black-orcs-edition', 'https://www.igdb.com/games/shinobi-blade', 'https://www.igdb.com/games/ddrmax2-dance-dance-revolution', 'https://www.igdb.com/games/transformers-prime-the-game', 'https://www.igdb.com/games/dotra'] 	|
| version_parent 	|  	|  	| ❌ 	| [138137, 24115, 139736, 138137, 28856] 	|
| version_title 	|  	|  	| ❌ 	| ['Black Orcs Edition', 'Platinum Edition', "Collector's Edition", 'Imperial Nobility Edition', 'Digital Deluxe Edition'] 	|
| videos 	|  	|  	| ❌ 	| [[37984], [64610], [28099], [35464], [28553]] 	|
| websites 	|  	|  	| ❌ 	| [[455726], [145653, 145654, 145655, 354472], [445624], [429116], [407422]] 	|

In [None]:
def explore_keys_with_examples(data, path='', examples=None):
    """
    Funzione ricorsiva per esplorare le chiavi in un dataset JSON e raccogliere esempi.
    
    :param data: Il dataset o una parte di esso.
    :param path: Il percorso corrente delle chiavi.
    :param examples: Un dizionario che tiene traccia degli esempi per ogni chiave.
    :return: None. Modifica il dizionario `examples` direttamente.
    """
    if examples is None:
        examples = {}

    if isinstance(data, dict):
        for k, v in data.items():
            new_path = f"{path}.{k}" if path else k
            if new_path not in examples:
                examples[new_path] = []
            if len(examples[new_path]) < 5:
                # Tronca il testo se è troppo lungo
                if isinstance(v, str) and len(v) > 100:
                    v = v[:100] + "..."
                examples[new_path].append(v)
            explore_keys_with_examples(v, new_path, examples)

    elif isinstance(data, list):
        for item in data:
            explore_keys_with_examples(item, path, examples)

    return examples

# Esplora le chiavi e raccogli esempi
examples = explore_keys_with_examples(json_data)

# Stampa i risultati
for key in sorted(examples):
    print(f"{key}: {examples[key]}")
    #print(f"{examples[key]}")


### Count objects with particular fields

In [17]:
count_games = sum(1 for item in json_data)
count_without_companies = sum(1 for item in json_data if 'companies' not in item)
count_without_involved_companies = sum(1 for item in json_data if 'involved_companies' not in item)

x = sum(1 for item in json_data if 'involved_companies' in item and 'companies' not in item)

y = sum(len(item['language_supports']) for item in json_data if 'language_supports' in item)
j = sum(len(item['language_supports']) for item in json_data_raw.values() if 'language_supports' in item)
z = sum(len(item['age_ratings']) for item in json_data_raw.values() if 'age_ratings' in item)

reviews = sum(len(item['reviews']) for item in json_data if 'reviews' in item)

print(f"Numero di giochi: {count_games}")
print(f"Numero di oggetti senza la chiave 'companies': {count_without_companies}")
print(f"Numero di oggetti senza la chiave 'involved_companies': {count_without_involved_companies}")
print(f"Numero di oggetti con la chiave 'involved_companies' ma senza 'companies': {x}")
print(f"Language supports tot: {y}")
print(f"Language supports raw tot: {j}")
print(f"Age ratings raw tot: {z}")

print(f"Numero di reviews: {reviews}")

Numero di giochi: 259290
Numero di oggetti senza la chiave 'companies': 159333
Numero di oggetti senza la chiave 'involved_companies': 259290
Numero di oggetti con la chiave 'involved_companies' ma senza 'companies': 0
Language supports tot: 122872
Language supports raw tot: 811759
Age ratings raw tot: 127784
Numero di reviews: 510061


### Distinct list of elements of a particular field

In [None]:
def extract_unique_elements_from_field(json_data, field_name):
    """
    Estrae elementi unici da un campo specificato in un dataset JSON.

    :param json_data: Il dataset JSON (lista di dizionari).
    :param field_name: Il nome del campo da cui estrarre gli elementi.
    :return: Un set di elementi unici.
    """
    unique_elements = set()

    for item in json_data:
        if field_name in item and isinstance(item[field_name], list):
            unique_elements.update(item[field_name])

    return unique_elements


# Scegli il campo da cui estrarre gli elementi
field_to_extract = "involved_companies"

# Estrai gli elementi unici
unique_elements = extract_unique_elements_from_field(json_data, field_to_extract)

# Stampa i risultati
print(f"{unique_elements}")

### Reviews Extraction

In [20]:
# estrazione delle reviews
import json

def extract_reviews_from_json(json_data):
    """
    Estrae tutte le recensioni univoche da un dataset JSON.

    :param json_data: Il dataset JSON (lista di dizionari).
    :return: Una lista di oggetti recensione univoci.
    """
    reviews = []

    for item in json_data:
        if 'reviews' in item and isinstance(item['reviews'], list):
            for review in item['reviews']:
                # Assumi che ogni recensione sia un dizionario e aggiungi l'ID del gioco
                review_with_id = review  # Fai una copia se non vuoi modificare l'originale
                review_with_id['game_id'] = item['id']  # Aggiungi l'ID del gioco
                reviews.append(review_with_id)

    return reviews

# Estrai le recensioni
extracted_reviews = extract_reviews_from_json(json_data)

# Scrivi le recensioni estratte in un nuovo file JSON
with open("../dataset/reviews.json", 'w', encoding='utf-8') as file:
    json.dump(extracted_reviews, file, ensure_ascii=False, indent=4)


## Content analysis

In [None]:
import pandas as pd

df = pd.json_normalize(json_data)

# Visualizza le prime righe del dataset
print(df.head())

# Informazioni sul dataset
print(df.info())

# Statistiche descrittive
print(df.describe())

# Conteggio valori nulli
print(df.isnull().sum())

### Genres analysis

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

genre_counts = df['genres'].explode().value_counts()
plt.figure(figsize=(10, 6))
sns.barplot(x=genre_counts.index, y=genre_counts.values)
plt.title('Distribuzione dei Generi di Giochi')
plt.xlabel('Genere')
plt.ylabel('Conteggio')
plt.show()

# Analisi delle piattaforme
# Codice simile può essere usato per analizzare le piattaforme