In [1]:
# Standard Libraries
import shutil
import urllib.request as urlrequest
from collections import Counter
from pathlib import Path
from pprint import pprint
from zipfile import ZipFile
import zipfile
from datetime import datetime
from dateutil import parser
import datetime
import ast
import json
# 3rd-party Libraries
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.offline as pyo
import seaborn as sns
#from OGDUtils.general.fileio import FileAPI, FileTypes
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Retrieve Data Files
## Please change the settings below to configure which game/server location you want to use.


*   `game_id`: The game whose data you want to access.
*   `mode`: The kind of data you want to retrieve. It should be one of FileTypes.PLAYER, FileTypes.POPULATION, FileTypes.SESSION, FileTypes.EVENTS, or FileTypes.ALL_EVENTS.



In [None]:
# Dataset selection: which game's data to use and which feature file type to fetch.
# NOTE(review): FileTypes is not imported by any active line in this notebook (the
# OGDUtils import in the first cell is commented out), so this cell raises NameError
# on a fresh kernel unless that import is restored.
game_id = 'JOURNALISM'
mode    = FileTypes.PLAYER

In [None]:
# Ask the file API which months have published datasets for the selected game,
# then pretty-print the result in compact form.
# NOTE(review): FileAPI comes from the OGDUtils import that is commented out in the
# first cell; restore it before running this cell.
months_list = FileAPI.GetAvailableMonths(game_id)
print("The available months are:")
pprint(months_list, compact=True)

In [None]:
# Month and year of the dataset to download through FileAPI.
# NOTE(review): the local archive loaded in the next cell is named
# 20230401_to_20230425, which does not match month = 2 — confirm which period is intended.
month = 2
year = 2023

# download the file if it doesn't exist.
# zip_file, dataset_name = FileAPI.DownloadZippedDataset(game_id, month, year, mode)
# zip_name = zip_file.filename

In [None]:
# Load the player-features table (first MAX_ROWS rows) and the dataset readme from a
# zipped dataset into `raw_data` and `data_readme`.
raw_data = pd.DataFrame()

###
#  To use file(s) from website, uncomment the lines in previous cell and delete these.
file_path = 'JOURNALISM_20230401_to_20230425_7e4c492_player-features.zip'
dataset_name = 'JOURNALISM_20230401_to_20230425'
zip_file = ZipFile(file_path)
zip_name = zip_file.filename
###

MAX_ROWS = 10000 # for the sake of not overwhelming RAM, only read first 10,000 rows of a file. Feel free to increase

# The TSV inside the archive is expected to share the archive's base name.
# Path(...).stem drops any directory part and only a real extension, whereas slicing
# at rfind('.') would cut off the last character when the name has no '.' at all.
tsv_name = f"{Path(zip_name).stem}.tsv"

# `with zip_file:` guarantees the archive is closed even if reading the TSV or the
# readme raises; previously close() was only reached on success.
with zip_file:
    with zip_file.open(f"{dataset_name}/{tsv_name}") as tsv_file:
        raw_data = pd.read_csv(tsv_file, sep='\t', nrows=MAX_ROWS)
    data_readme = zip_file.read(f"{dataset_name}/readme.md")

In [2]:
# def read_file(file_path):
#     if file_path.endswith('.zip'):
#         with zipfile.ZipFile(file_path, 'r') as zfile:
#             tsv_files = [f for f in zfile.namelist() if f.endswith('.tsv')]
#             if len(tsv_files) == 0:
#                 raise ValueError("No TSV files found in the zip file.")
#             elif len(tsv_files) > 1:
#                 raise ValueError("Zip file must contain exactly one TSV file.")

#             with zfile.open(tsv_files[0]) as file:
#                 df = pd.read_csv(file, sep='\t')
#     elif file_path.endswith('.tsv'):
#         df = pd.read_csv(file_path, sep='\t')
#     else:
#         raise ValueError("Unsupported file format. Only .zip and .tsv files are supported.")

#     return df


# file_path = 'JOURNALISM_20230401_to_20230425_7e4c492_player-features.zip'  # Replace with your file path
# raw_data = read_file(file_path)


FileNotFoundError: ignored

In [None]:
# Working table: every row of raw_data that has no missing values.
# The explicit .copy() makes df_session an independent frame, so the column
# assignments below (and in later cells) do not trigger pandas'
# SettingWithCopyWarning or depend on whether dropna() returned a view.
df_session = raw_data.dropna().copy()
for col in df_session.columns:
    dtype = df_session[col].dtype
    if dtype == bool:
        # convert the boolean column to integer (1 for True and 0 for False)
        df_session[col] = df_session[col].astype(int)

def convert_to_seconds(time_str):
    """Convert a clock-style duration string into a number of seconds.

    Two layouts are accepted: 'H:MM:SS.ffffff' and 'H:MM:SS' (no fractional part).

    Args:
        time_str: The text to parse.

    Returns:
        The duration in seconds as a float, or None when `time_str` is not a string
        or matches neither layout (strptime's %H also rejects hour values of 24 or more).
    """
    if not isinstance(time_str, str):
        # Non-string cells (None, NaN, numbers) make strptime raise TypeError,
        # which a ValueError handler alone would not catch.
        return None

    # Try the fractional layout first, then fall back to whole seconds.
    for fmt in ('%H:%M:%S.%f', '%H:%M:%S'):
        try:
            time_obj = datetime.datetime.strptime(time_str, fmt).time()
        except ValueError:
            continue
        timedelta_obj = datetime.timedelta(
            hours=time_obj.hour,
            minutes=time_obj.minute,
            seconds=time_obj.second,
            microseconds=time_obj.microsecond,
        )
        return timedelta_obj.total_seconds()
    return None

# Replace both play-time columns with the value convert_to_seconds returns for each cell.
for time_column in ('UserPlayTime', 'UserPlayTime-Total Time'):
    df_session[time_column] = df_session[time_column].apply(convert_to_seconds)


In [None]:
# One bar chart per level (1-5): how players who failed that level at least once
# are distributed over the quit types.
quit_types = sorted(df_session['QuitType'].unique())
fig, axs = plt.subplots(5, 1, figsize=(6, 20))

bins = 3  # not referenced by the plotting code in this cell
x_ticks = np.arange(len(quit_types))
x_tick_labels = quit_types

for fail_level, ax in enumerate(axs, start=1):
    # Rows with at least one recorded failure on this level.
    failed_mask = df_session[f'lvl{fail_level}_FailureCount'] > 0
    # Count per quit type; reindex so every quit type appears (0 when absent)
    # and all subplots share the same bar order.
    quit_type_counts = (
        df_session.loc[failed_mask, 'QuitType']
        .value_counts()
        .reindex(quit_types, fill_value=0)
    )
    print(quit_type_counts)

    ax.bar(x_ticks, quit_type_counts, align='center')
    ax.set_title(f"Lvl {fail_level} Quit Type")
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_tick_labels)

fig.tight_layout()
plt.show()

In [None]:
# Distribution of play time (seconds) over all players in df_session,
# with the mean marked by a dotted red line.
play_time = df_session['UserPlayTime']
mean_play_time = play_time.mean()

plt.hist(play_time, bins=10)
plt.axvline(x=mean_play_time, color='red', linestyle='dotted', linewidth=2)
plt.xlabel('Average Player Time')
plt.ylabel('Frequency')
plt.title('Histogram of Average Player Time (seconds)')
plt.show()
print(f'Average time: {mean_play_time / 60} minutes')

In [None]:
# Same play-time histogram, restricted to rows whose GameComplete flag equals 1.
df_completed = df_session[df_session['GameComplete'] == 1]
completed_play_time = df_completed['UserPlayTime']
completed_mean = completed_play_time.mean()

plt.hist(completed_play_time, bins=10)
plt.axvline(x=completed_mean, color='red', linestyle='dotted', linewidth=2)
plt.xlabel('Average Player Time (Completed)')
plt.ylabel('Frequency')
plt.title('Histogram of Average Player Time (seconds)')
plt.show()
print(f'Average time: {completed_mean / 60} minutes')

In [None]:
# Radar chart of the average per-player skill totals for players with fewer than
# 50 total failures.
df_played_well = df_session[df_session['TotalFails'] < 50 ]
json_str = df_played_well['SkillSequenceCount-Event Sequence']
# One parsed event list per player. This name is reused by a later cell.
parsed_json_object_1 = [json.loads(string) for string in json_str]

total_skill = {
    "Research": 0,
    "Resourceful": 0,
    "Endurance": 0,
    "Tech": 0,
    "Social": 0,
    "Trust": 0
}

# Sum every skill change over all events of all players.
for sublist in parsed_json_object_1:
    for obj in sublist:
        # The event's "skill" field is itself a JSON-encoded string.
        skill_json = json.loads(obj["skill"])
        for key, value in skill_json.items():
            total_skill[key] += value

num_items = len(parsed_json_object_1)

# Average per player. An empty cohort yields 0.0 for every skill instead of
# raising ZeroDivisionError.
average_scores = {
    key: (value / num_items if num_items else 0.0)
    for key, value in total_skill.items()
}

labels = list(average_scores.keys())
values = list(average_scores.values())
# Repeat the first point at the end so the plotted outline returns to its start.
categories = [*labels, labels[0]]
values = [*values, values[0]]

fig = go.Figure(
    data=[
        go.Scatterpolar(r=values, theta=categories, name='Average Score'),
      ],
    layout=go.Layout(
        title=go.layout.Title(text='Attribute Comparison for Well Played Games'),
        polar={'radialaxis': {'visible': True}},
        showlegend=True
    )
)

print(average_scores)
fig

In [None]:
# Radar chart of the average per-player skill totals for players in df_completed.
json_str = df_completed['SkillSequenceCount-Event Sequence']
# One parsed event list per player.
parsed_json_objects = [json.loads(string) for string in json_str]

total_skill = {
    "Research": 0,
    "Resourceful": 0,
    "Endurance": 0,
    "Tech": 0,
    "Social": 0,
    "Trust": 0
}

# Sum every skill change over all events of all players.
for sublist in parsed_json_objects:
    for obj in sublist:
        # The event's "skill" field is itself a JSON-encoded string.
        skill_json = json.loads(obj["skill"])
        for key, value in skill_json.items():
            total_skill[key] += value

num_items = len(parsed_json_objects)

# Average per player. An empty cohort yields 0.0 for every skill instead of
# raising ZeroDivisionError.
average_scores = {
    key: (value / num_items if num_items else 0.0)
    for key, value in total_skill.items()
}

labels = list(average_scores.keys())
values = list(average_scores.values())
# Repeat the first point at the end so the plotted outline returns to its start.
categories = [*labels, labels[0]]
values = [*values, values[0]]

fig = go.Figure(
    data=[
        go.Scatterpolar(r=values, theta=categories, name='Average Score'),
      ],
    layout=go.Layout(
        title=go.layout.Title(text='Attribute Comparison for Completed Games'),
        polar={'radialaxis': {'visible': True}},
        showlegend=True
    )
)

print(average_scores)
fig

In [None]:
# Radar chart of the average per-player skill totals for rows whose GameComplete
# flag equals 0.
df_no_finish = df_session[df_session['GameComplete'] == 0]
json_str = df_no_finish['SkillSequenceCount-Event Sequence']
# One parsed event list per player.
parsed_json_objects = [json.loads(string) for string in json_str]

total_skill = {
    "Research": 0,
    "Resourceful": 0,
    "Endurance": 0,
    "Tech": 0,
    "Social": 0,
    "Trust": 0
}

# Sum every skill change over all events of all players.
for sublist in parsed_json_objects:
    for obj in sublist:
        # The event's "skill" field is itself a JSON-encoded string.
        skill_json = json.loads(obj["skill"])
        for key, value in skill_json.items():
            total_skill[key] += value

num_items = len(parsed_json_objects)

# Average per player. An empty cohort yields 0.0 for every skill instead of
# raising ZeroDivisionError.
average_scores = {
    key: (value / num_items if num_items else 0.0)
    for key, value in total_skill.items()
}

labels = list(average_scores.keys())
values = list(average_scores.values())
# Repeat the first point at the end so the plotted outline returns to its start.
categories = [*labels, labels[0]]
values = [*values, values[0]]

fig = go.Figure(
    data=[
        go.Scatterpolar(r=values, theta=categories, name='Average Score'),
      ],
    layout=go.Layout(
        title=go.layout.Title(text='Attribute Comparison for Games that were not Completed'),
        polar={'radialaxis': {'visible': True}},
        showlegend=True
    )
)

print(average_scores)
fig


In [None]:
# Number of rows in df_completed.
df_completed.shape[0]

In [None]:
# Per-attribute histograms of each player's summed skill changes, for the players
# whose event lists are held in parsed_json_object_1.
attributes = ["Research", "Resourceful", "Endurance", "Tech", "Social", "Trust"]
final_values = []

for sublist in parsed_json_object_1:
    # Fresh per-player totals, in the same order as `attributes`.
    total_skill = dict.fromkeys(attributes, 0)
    for obj in sublist:
        # The event's "skill" field is itself a JSON-encoded string.
        skill_json = json.loads(obj["skill"])
        for key, value in skill_json.items():
            total_skill[key] += value

    final_values.append(list(total_skill.values()))

for i, attribute in enumerate(attributes):
    plt.figure(i)
    plt.hist([player_totals[i] for player_totals in final_values])
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    # Name the attribute in the title. Previously a second plt.title call replaced
    # the attribute name, leaving all six figures with the same title.
    plt.title(f'Frequency of {attribute} Scores (players that played well)')


plt.show()

In [None]:
# Players with fewer than 10 total failures.
df_failure = df_session.loc[df_session['TotalFails'] < 10]
def bar_graph(df_1, df_2, level):
    """Draw a two-bar chart comparing summed failure counts for one level.

    Args:
        df_1: Values whose sum is shown as the 'Low Attribute' bar.
        df_2: Values whose sum is shown as the 'Out of Time' bar.
        level: Level number, used only in the chart title.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.bar('Low Attribute', df_1.sum(), label='Low Attribute')
    ax.bar('Out of Time', df_2.sum(), label='Out of Time')
    ax.set_xlabel('Failure Type')
    ax.set_ylabel('Count')
    ax.set_title(f'Comparison of Failure Counts on Level {level}')
    ax.legend()

    plt.show()

# One failure-type comparison chart for each of levels 1 through 5.
for level in range(1, 6):
    bar_graph(
        df_failure[f'lvl{level}_FailureCount-LowAttribute'],
        df_failure[f'lvl{level}_FailureCount-OutOfTime'],
        level,
    )

In [None]:
# Show the rows whose skill event sequence is not the literal string '[]'.
df_session.loc[df_session['SkillSequenceCount-Event Sequence'] != '[]']

In [None]:
# choose which user you want to look at, change index at the end
# choose which user you want to look at, change index at the end
# NOTE(review): `[2]` here is a lookup on the Series index, and df_session was built
# with dropna(), so some index values may be missing. Confirm whether a positional
# lookup (.iloc) is what is intended.
json_string = df_session['SkillSequenceCount-Event Sequence'][2]

In [None]:
# Plot how one player's cumulative skill levels change over time, one line per skill.
data = json.loads(json_string)
skill_names = ["Research", "Resourceful", "Endurance", "Tech", "Social", "Trust"]

# Per skill: the timestamps and cumulative levels to plot, plus the running total.
# Each running total starts at 0.1 * (position in skill_names).
# NOTE(review): that starting offset is also included in the "Final Scores" printed
# at the end — confirm this is intended.
skill_data = {}
cumulative_levels = {}
for position, skill_name in enumerate(skill_names):
    skill_data[skill_name] = {"time": [], "level": []}
    cumulative_levels[skill_name] = 0.1 * position

for item in data:
    timestamp = parser.parse(item["time"])
    # The event's "skill" field is itself a JSON-encoded string.
    skill_changes = json.loads(item["skill"])

    for skill_name, level in skill_changes.items():
        cumulative_levels[skill_name] += level
        series = skill_data[skill_name]
        series["time"].append(timestamp)
        series["level"].append(cumulative_levels[skill_name])

plt.figure(figsize=(15, 6))
plt.title("Skill Progression Over Time")
plt.xlabel("Time")
plt.ylabel("Cumulative Skill Level")

for skill_name, series in skill_data.items():
    plt.plot(series["time"], series["level"], label=skill_name)

# Legend anchored to the upper-right corner of the axes, outside the plot area.
plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



print(f'Final Scores: {cumulative_levels}')

In [None]:
# Load the June 2023 player-features table, derive the story-score and
# story-alignment sub-tables (all players and completed-only), then plot a 2x3 grid
# of story-score histograms for all players.
file_path = 'JOURNALISM_20230601_to_20230630_30c2225_player-features.tsv'
df = pd.read_csv(file_path, sep='\t')
df_completed_2 = df[df['GameComplete'] == True]

# Column names for levels 1 through 6.
story_levels = range(1, 7)
score_columns = [f'lvl{n}_StoryScore' for n in story_levels]
alignment_columns = [f'lvl{n}_StoryAlignment' for n in story_levels]

df_storyscore = df[score_columns]
df_storyscore_complete = df_completed_2[score_columns]

df_storyalignment = df[alignment_columns]
df_storyalignment_complete = df_completed_2[alignment_columns]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

for ax, column in zip(axes, df_storyscore.columns):
    ax.hist(df[column], bins=20, edgecolor='black')
    ax.set_title(column)

plt.tight_layout()
plt.show()

In [None]:
# 2x3 grid of story-score histograms, one per column of df_storyscore_complete.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

for ax, column in zip(axes, df_storyscore_complete.columns):
    ax.hist(df_storyscore_complete[column], bins=20, edgecolor='black')
    ax.set_title(f'{column} Completed')

plt.tight_layout()
plt.show()

In [None]:
# 2x3 grid of story-alignment histograms over all rows of df, one per column of
# df_storyalignment.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

for ax, column in zip(axes, df_storyalignment.columns):
    ax.hist(df[column], bins=20, edgecolor='black')
    ax.set_title(column)

plt.tight_layout()
plt.show()

In [None]:
# 2x3 grid of story-alignment histograms, one per column of df_storyalignment_complete.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

for ax, column in zip(axes, df_storyalignment_complete.columns):
    ax.hist(df_storyalignment_complete[column], bins=20, edgecolor='black')
    ax.set_title(f'{column} Completed')

plt.tight_layout()
plt.show()

In [None]:
# Parse the lvl1..lvl6 SnippetsSubmitted columns from their text form into Python
# objects, in place on df.
for i in range(1, 7):
    column_name = f'lvl{i}_SnippetsSubmitted'
    # Only strings are parsed. Values that are already Python objects are left as
    # they are, so re-running this cell does not fail on already-converted values.
    df[column_name] = df[column_name].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

In [None]:
# For each of levels 1-6, print the ten most frequently submitted snippets.
for level in range(1, 7):
    column_name = f'lvl{level}_SnippetsSubmitted'
    # Flatten every row's snippet collection into a single stream and count it.
    all_snippets = (
        snippet
        for snippet_list in df[column_name]
        for snippet in snippet_list
    )
    counter = Counter(all_snippets)
    most_common = counter.most_common(10)

    print(f"Most common snippets in {column_name}:")
    for snippet, count in most_common:
        print(f"{snippet}: {count}")
    print()

In [None]:
# Cast each lvlN_LevelCompleted column (N = 1..6) to int in place on df and print
# how often each value occurs.
for level in range(1, 7):
    column_name = f'lvl{level}_LevelCompleted'
    level_completed = df[column_name].astype(int)
    df[column_name] = level_completed
    print(level_completed.value_counts())

    print()

In [None]:
# amount of people that complete the game
# amount of people that complete the game
df_storyalignment_complete.shape[0]

In [None]:
# Grouped bar chart: for levels 1-5, how many rows hold each LevelCompleted value.
target_cols = [f'lvl{n}_LevelCompleted' for n in range(1, 6)]

# One column of value counts per level column.
counts = df[target_cols].apply(pd.Series.value_counts)
print(counts)

fig, ax = plt.subplots(figsize=(6, 6))
# Transpose so each level is a group on the x axis and each value is a bar series.
counts.T.plot.bar(ax=ax)

# NOTE(review): the legend text assumes the first series is the "not completed"
# value and the second the "completed" value — confirm against the printed counts.
legend_labels = ['no', 'yes']
ax.legend(labels=legend_labels)

ax.set(
    xlabel='Completed Levels',
    ylabel='Count',
    title='Number of Completed Levels',
)

plt.show()

In [None]:
# Report how many players had exactly one session, then plot the session-count
# distribution for players with more than one.
session_counts = df['SessionCount']
len(session_counts.unique())  # result is not used or displayed (not the cell's last expression)
session_df = df[session_counts > 1]
equal_one = df[session_counts == 1]
print(f'{len(equal_one)} out of {len(df)} players had only one session')

plt.hist(session_df['SessionCount'], bins=12, edgecolor='black')
plt.xlabel('Number of Sessions')
plt.ylabel('Frequency')
plt.title('Histogram of Player Sessions (Sessions > 1)')
plt.show()

In [None]:
# Same session analysis, restricted to the rows in df_completed_2: report how many
# had exactly one session, then plot the distribution for those with more than one.
session_df = df_completed_2[df_completed_2['SessionCount'] > 1]
equal_one = df_completed_2[df_completed_2['SessionCount'] == 1]
print(f'{len(equal_one)} out of {len(df_completed_2)} players had only one session')

# Plot the filtered frame so the data matches the "Sessions > 1" title. Previously
# the unfiltered column was plotted and session_df was computed but never used.
plt.hist(session_df['SessionCount'], bins=12, edgecolor='black')
plt.xlabel('Number of Sessions')
plt.ylabel('Frequency')
plt.title('Histogram of Player Sessions (Sessions > 1)')
plt.show()


In [None]:
# One sub-table per level 1-5, holding the rows whose lvlN_LevelCompleted equals 1.
df_lvl1, df_lvl2, df_lvl3, df_lvl4, df_lvl5 = (
    df[df[f'lvl{level}_LevelCompleted'] == 1] for level in range(1, 6)
)

In [None]:
def calculate_counts(data_frame, attribute_column):
    """Count how often each value occurs across the parsed entries of one column.

    Every entry of `data_frame[attribute_column]` is a string holding a Python
    literal collection; it is parsed with ast.literal_eval and each of its elements
    is counted once per occurrence.

    Args:
        data_frame: Object indexable by `attribute_column` that yields the entries.
        attribute_column: Key of the column to read.

    Returns:
        A dict mapping each value to its number of occurrences, in first-seen order.
    """
    tally = Counter(
        value
        for group in data_frame[attribute_column]
        for value in ast.literal_eval(group)
    )
    return dict(tally)

# For each cohort, draw a stacked bar chart of how often each attribute was listed
# as a player's top attribute and as a player's worst attribute.
# NOTE(review): "Finished the Game" uses df_completed, which is derived from
# df_session, while the level cohorts are derived from df — confirm whether
# df_completed_2 was intended here.
data_frames = {
    "Completed Level 1": df_lvl1,
    "Completed Level 2": df_lvl2,
    "Completed Level 3": df_lvl3,
    "Completed Level 4": df_lvl4,
    "Completed Level 5": df_lvl5,
    "Finished the Game": df_completed
}

for name, data_frame in data_frames.items():
    top_counts = calculate_counts(data_frame, "TopAttribute-Names")
    worst_counts = calculate_counts(data_frame, "WorstAttribute-Names")

    # Use every attribute seen in either tally and default a missing count to 0.
    # Previously an attribute present only in top_counts raised KeyError and one
    # present only in worst_counts was silently left out.
    sorted_keys = sorted(set(top_counts) | set(worst_counts))
    x_label = list(sorted_keys)
    top_values = [top_counts.get(key, 0) for key in sorted_keys]
    worst_values = [worst_counts.get(key, 0) for key in sorted_keys]

    plt.figure()
    # Each series is drawn under its own label (the labels were previously swapped
    # relative to the data). Worst counts form the lower segment, top counts are
    # stacked on them.
    plt.bar(x_label, worst_values, color='r', label='Worst Values')
    plt.bar(x_label, top_values, bottom=worst_values, color='b', label='Top Values')

    plt.xlabel('Attributes')
    plt.title('Stacked Attributes - ' + name)
    plt.legend()

In [None]:
# Radar chart of the average per-player skill totals for players in df_completed_2.
# NOTE(review): df_completed_2 comes from a table that is not passed through
# dropna() in this notebook; json.loads will raise on any non-string (missing)
# entry — confirm the column has no gaps.
json_str = df_completed_2['SkillSequenceCount-Event Sequence']
# One parsed event list per player.
parsed_json_objects = [json.loads(string) for string in json_str]

total_skill = {
    "Research": 0,
    "Resourceful": 0,
    "Endurance": 0,
    "Tech": 0,
    "Social": 0,
    "Trust": 0
}

# Sum every skill change over all events of all players.
for sublist in parsed_json_objects:
    for obj in sublist:
        # The event's "skill" field is itself a JSON-encoded string.
        skill_json = json.loads(obj["skill"])
        for key, value in skill_json.items():
            total_skill[key] += value

num_items = len(parsed_json_objects)

# Average per player. An empty cohort yields 0.0 for every skill instead of
# raising ZeroDivisionError.
average_scores = {
    key: (value / num_items if num_items else 0.0)
    for key, value in total_skill.items()
}

labels = list(average_scores.keys())
values = list(average_scores.values())
# Repeat the first point at the end so the plotted outline returns to its start.
categories = [*labels, labels[0]]
values = [*values, values[0]]

fig = go.Figure(
    data=[
        go.Scatterpolar(r=values, theta=categories, name='Average Score'),
      ],
    layout=go.Layout(
        title=go.layout.Title(text='Attribute Comparison for Completed Games'),
        polar={'radialaxis': {'visible': True}},
        showlegend=True
    )
)

print(average_scores)
fig