In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats

%load_ext autoreload
%autoreload 2

import utils as plot

## Load Raw Data

In [2]:
# Yes/No survey questions that are recoded to booleans below.
yes_no_cols = (
    'have_ordered', 'have_playdate', 'dev_playdate', 'use_pulp', 'use_sdk',
    'aware_of_wiki', 'read_wiki', 'updated_wiki', 'know_other_owners',
    'arrival_issues', 'developed_issues',
)

data = pd.read_csv('raw_data/census.csv')

# Literal 'nan' strings become real missing values, then season_feedback is cast to str.
# NOTE(review): because the cast runs after the replace, a missing season_feedback
# ends up as the string 'nan' again — confirm that is intended.
gd = data.replace('nan', np.nan).astype({'season_feedback': str})

for col in yes_no_cols:
    answered = gd[col].notna()
    # Only answered rows are recoded to True/False; unanswered rows stay NaN.
    gd.loc[answered, col] = gd[col] == 'Yes'


## Load and Plot Normalized Game Responses

In [None]:
def _show_game_bar(responses, title, other_threshold, answered_col, width):
    """Draw one vertical bar chart of normalized game answers, style it and show it.

    The percentage base is the number of non-missing answers in gd[answered_col].
    Returns the pair produced by plot.horizontal_bar unchanged.
    """
    fig, counted = plot.horizontal_bar(
        responses,
        title=title,
        other_threshold=other_threshold,
        total_responses=gd[answered_col].notna().sum(),
        vertical=True,
    )
    fig.update_layout(width=width, bargap=0.1)
    fig.update_traces(textfont_size=13)
    fig.show()
    return fig, counted


exciting_s1 = pd.read_csv('normalized_short_responses/exciting_s1.csv')
# Two spelling variants in the community-game answers are folded into their canonical titles.
exciting_non_s1 = (
    pd.read_csv('normalized_short_responses/exciting_non_s1.csv')
    .replace('legend etad', 'legend of etad')
    .replace('botanist', 'the botanist')
)

# NOTE(review): season_game_col and indie_game_col are not assigned anywhere in the
# visible notebook; on a fresh kernel these calls raise NameError until they are
# defined (they should name the gd columns holding the raw answers).
fig, _ = _show_game_bar(exciting_s1, 'Most exciting season one game', 0, season_game_col, 1400)
fig, new_pd = _show_game_bar(exciting_non_s1, 'Most exciting community game', 3, indie_game_col, 1200)

## Histogram Satisfaction Scores

In [None]:
# Owners who reported an issue on arrival or later, followed by the total number of owners.
had_issue = gd.arrival_issues | gd.developed_issues
print((gd.have_playdate & had_issue).sum())
print(gd.have_playdate.sum())

# Explicit == comparisons are kept from the original (instead of ~ / truthiness).
without_device = gd[gd.have_playdate == False]
with_device = gd[gd.have_playdate == True]

# (respondents, score column, chart title), drawn in this order.
# NOTE(review): 'satistfied_with_support' looks misspelled; it is left untouched
# because it must match the CSV header — confirm against raw_data/census.csv.
score_charts = [
    (without_device, 'overall_satisfaction', 'Overall Satisfaction - No Device Yet'),
    (with_device, 'overall_satisfaction', 'Overall Satisfaction - Have Device'),
    (gd, 'satisfied_with_season', 'Season Satisfaction Scores'),
    (with_device, 'satisfied_with_games', 'Season Game Satisfaction Scores'),
    (with_device, 'build_quality', 'Satisfaction with Build Quality'),
    (gd, 'satistfied_with_support', 'Satisfaction with Panic Support'),
]
for respondents, score_col, chart_title in score_charts:
    plot.score_hist(respondents, score_col, chart_title).show()


## Various Pie Charts

In [None]:
# Shorten the long "not avoiding" answer so the three slices read Yes / Mostly / No.
not_avoiding = gd.spoiler_free == "I'm not avoiding spoilers"
gd.loc[not_avoiding, 'spoiler_free'] = 'No'

spoiler_order = ['Yes', 'Mostly', 'No']
# plot.pie(gd[gd.have_ordered == True], 'spoiler_free', 'Have Their Device', trace_order=['Yes', 'Mostly','No']).show()
plot.pie(gd, 'spoiler_free', 'Avoiding spoilers', trace_order=spoiler_order).show()

# Questions charted over every respondent.
for column in ('purchase_season_2', 'known_developer'):
    plot.pie(gd, column).show()

# Accessory questions are charted only over respondents who have ordered.
ordered_respondents = gd[gd.have_ordered]
for column in ('will_order_dock', 'have_ordered_cover'):
    plot.pie(ordered_respondents, column).show()

In [None]:
# Respondents whose dev_playdate answer is True; .copy() so the 'tool' column
# added further down in this cell is written to devs and not back into gd.
devs = gd[gd.dev_playdate].copy()
def _answered_yes(value):
    """Return True only for a real boolean True.

    Unanswered questions are stored as NaN, and NaN is truthy in Python, so a
    plain `if value:` would count a missing answer as a "Yes".
    """
    return isinstance(value, (bool, np.bool_)) and bool(value)


def split_by_tool(row):
    """Classify one respondent by the Playdate development tools they use.

    Parameters
    ----------
    row : object with `use_pulp` and `use_sdk` attributes (e.g. a DataFrame row)
        Each flag is True, False, or NaN when the question was not answered.

    Returns
    -------
    str
        'Both', 'Pulp', 'SDK' or 'Neither'. A missing answer counts as "no".
    """
    uses_pulp = _answered_yes(row.use_pulp)
    uses_sdk = _answered_yes(row.use_sdk)

    if uses_pulp and uses_sdk:
        return 'Both'
    if uses_pulp:
        return 'Pulp'
    if uses_sdk:
        return 'SDK'

    return 'Neither'

# Label every developer row with its tool combination, then chart the split.
devs['tool'] = devs.apply(split_by_tool, axis=1)
plot.pie(devs, 'tool', 'How do you develop for the Playdate?').show()
# No .show() here: as the cell's last expression the notebook displays whatever plot.pie returns.
plot.pie(devs[devs.use_sdk], 'dev_language', 'SDK Users: C or Lua?')

In [None]:
# Large-format pie of order_group among respondents who have ordered.
# NOTE(review): the mask ANDs the have_ordered flag with the order_group column
# itself. If order_group holds group labels rather than booleans this is suspect —
# `gd.order_group.notna()` may have been intended. Confirm before relying on it.
fig = plot.pie(gd[gd.have_ordered & gd.order_group], 'order_group', 'group')
# Bigger canvas and fonts than the other pies in this notebook.
fig.update_layout(width=820, height=820, legend=dict(font_size=20))
fig.update_traces(textfont_size=30 )
fig.show()

## Season 2 Pricing Histograms

In [None]:
# Season 2 price answers together with the respondent flags that travel with them.
# The frame is deliberately not named `pd`: that would rebind the pandas alias and
# break every later pd.read_csv / pd.DataFrame call on Restart & Run All.
pricing = gd[['timestamp','have_playdate','have_ordered','pay_for_12_games','pay_for_24_games', 'have_purchased_game']].copy()
number_cols = ['pay_for_12_games','pay_for_24_games']

# remove outliers
# 1) hard cap: answers of 200 or more become NaN
pricing[number_cols] = pricing[number_cols].where(pricing[number_cols] < 200)
# 2) statistical cut: answers 3 or more standard deviations from their column mean
#    become NaN (NaNs are ignored when computing the z-scores)
z_scores = stats.zscore(pricing[number_cols], axis=0, nan_policy='omit')
pricing[number_cols] = pricing[number_cols].where(np.abs(z_scores) < 3)


plot.histogram(pricing, 'pay_for_12_games', 'Price (USD) for 12 games').show()
# Last expression of the cell: displayed by the notebook without .show().
plot.histogram(pricing, 'pay_for_24_games', 'Price (USD) for 24 games')

## Load Tagged Free Response Data

In [3]:
target_dir = 'tagged_open_responses'


def import_open_responses(name, ncols=None):
    """Load one hand-tagged free-response sheet and save a cleaned copy.

    Reads `{target_dir}/raw/{name}.csv`, skipping the line right after the
    header. The first column is renamed to 'comment', rows without a comment
    are dropped, and every remaining missing cell is filled with 0.

    Parameters
    ----------
    name : str
        File stem of the sheet (no directory, no extension).
    ncols : int, optional
        Read only the first `ncols` columns. A falsy value reads every column.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, which is also written to `{target_dir}/{name}.csv`
        without the index.
    """
    usecols = range(ncols) if ncols else None
    tagged = pd.read_csv(f'{target_dir}/raw/{name}.csv', skiprows=[1], usecols=usecols)

    first_column = tagged.columns[0]
    tagged = (
        tagged
        .rename(columns={first_column: 'comment'})
        .dropna(subset='comment')
        .fillna(0)
    )

    tagged.to_csv(f'{target_dir}/{name}.csv', index=False)
    return tagged
# Load every tagged free-response sheet into its own frame (each call also rewrites
# the cleaned CSV). ncols restricts the read to the first N columns of the raw sheet;
# the calls without ncols read every column.
sdk_requests = import_open_responses('sdk_requests')
sdk_feedback = import_open_responses('sdk_feedback', ncols=8)
pulp_requests = import_open_responses('pulp_requests')
pulp_feedback = import_open_responses('pulp_feedback', ncols=6)
aspect_of_playdate = import_open_responses('aspect_of_playdate', ncols=13)
support_feedback = import_open_responses('support_feedback', ncols=9)
wiki_feedback = import_open_responses('wiki_feedback', ncols=8)
wiki_feedback_pie = import_open_responses('wiki_feedback_pie', ncols=8)
season_delivery = import_open_responses('season_delivery', ncols=10)
season_games = import_open_responses('season_games', ncols=8)
panic_feedback = import_open_responses('panic_feedback', ncols=15)
next_12_months = import_open_responses('next_12_months', ncols=14)


### Test for tags that are too similar

In [None]:
# Inspect the next_12_months tag columns for overlap; swap in another tagged frame to
# check a different question. Displayed as the cell's last expression.
plot.tag_similarity_matrix(next_12_months)

### Wiki Feedback gets a pie since it is fully disjoint

In [None]:
# Tag columns only; the free-text comment is not needed for counting.
tag_flags = wiki_feedback_pie.drop(columns='comment')
# Keep responses that carry at least one tag (rows that are all zero are untagged).
has_any_tag = (tag_flags != 0).any(axis=1)

# One row per tag with the number of responses carrying it.
data = tag_flags.loc[has_any_tag].sum().astype(int).to_frame().reset_index()
data.columns = ['tag', 'num']

fig = plot.pie(data, 'num', horizontal=False, counted=True)
fig.update_layout(width = 450, height=450)
# Last expression of the cell: the updated figure is what the notebook displays.
fig.update_traces(textinfo='value+percent')

## Tagged Responses Bubble Chart

In [None]:
# Bubble chart of the tagged panic_feedback responses. The empty string is the second
# positional argument (presumably the chart title — confirm in utils).
fig, dtp, n = plot.bubble_chart(panic_feedback, '')
# n is the third value returned by bubble_chart, reported here as the response count.
print(n, 'responses')
fig.show()

## Tagged Responses Horizontal Bar

In [None]:
# Only one tagged data set is charted at a time; uncomment another pair below to switch.
# fig, _ = plot.horizontal_bar_tags(sdk_requests, 'SDK Feature Requests')
# fig.show()
fig, _ = plot.horizontal_bar_tags(pulp_requests, 'Pulp Feature Requests')
fig.show()
# fig, _ = plot.horizontal_bar_tags(pulp_feedback, 'Feedback for Pulp team')
# fig.show()

## Process text for word cloud

In [None]:
# Build the word-cloud input for the next_12_months free-text column.
# NOTE(review): the positional arguments 0.25, 2, True are not explained in this
# notebook — see plot.word_cloud_pipeline in utils for their meaning.
plot.word_cloud_pipeline(gd, 'next_12_months', 0.25, 2, True)
# plot.word_cloud_pipeline(gd, 'season_game_feedback', 0.05, 2, True)


## Marimekko Charts

In [None]:
# Price question column -> row label shown for it, in display order.
price_questions = {
    'quick_hit_price': 'quick hit or gimmick',
    'short_price': 'short / lower quality',
    'standard_price': 'standard length / quality',
    'long_price': 'longer / high quality',
    'highest_price': 'highest quality',
}
cols = list(price_questions)
side_labels = list(price_questions.values())
top_labels = ['Free', '$1 - $3', '$4 - $6', '$7 - $10', '$11 - 19', '$20 - 29', '$30 & above']

plot.horizontal_marimekko(gd, cols, top_labels, side_labels)


In [None]:
# Session-length question column -> row label shown for it, in display order.
session_questions = {
    'session_10min': '1 - 10 mins',
    'session_30min': '11-30 mins',
    'session_60min': '31-60 mins',
    'session_hour_plus': 'Over an hour',
}
cols = list(session_questions)
side_labels = list(session_questions.values())
top_labels = ['Never','Rarely', 'Sometimes', 'Often', 'Always']

plot.horizontal_marimekko(gd, cols, top_labels, side_labels)

In [None]:
# Engagement question column -> platform label shown for it, in display order.
engagement_sources = {
    'twitter_engagement': 'Twitter',
    'discord_engagement': 'Discord',
    'reddit_engagement': 'Reddit',
    'itch_engagement': 'Itch',
    'forum_engagement': 'Devforum',
    'youtube_engagement': 'Youtube',
    'wiki_engagement': 'Wiki',
    'twitch_engagement': 'Twitch',
}
cols = list(engagement_sources)
side_labels = list(engagement_sources.values())
top_labels = ['Never','Once or twice', 'Weekly', 'Couple times a week', 'Daily', 'Many times a day']

plot.horizontal_marimekko(gd, cols, top_labels, side_labels)

## Random math

In [None]:
has_ordered = gd.have_ordered
knows_owners = gd.know_other_owners

# Share (in %) of respondents who ordered and also know other owners.
ordered_with_friends = len(gd[has_ordered & knows_owners]) / has_ordered.sum() * 100
print('ordered with friends', ordered_with_friends)

# The same share among respondents who have not ordered.
has_not_ordered = ~has_ordered
friends_without_order = len(gd[has_not_ordered & knows_owners]) / has_not_ordered.sum() * 100
print('no order but have friends', friends_without_order)

# Share (in %) of wiki editors who also develop for the Playdate.
editors_who_develop = gd[gd.updated_wiki & gd.dev_playdate].timestamp.count()
print(
    editors_who_develop / gd.updated_wiki.sum() * 100
)

## Color Suggestions

In [None]:
colors = pd.read_csv('open_responses/color_suggestions.csv')

# Every answer other than the two stock opinions is grouped under 'Other'.
stock_opinions = ["I don't mind", "I love it"]
is_stock_opinion = colors.yellow_color.isin(stock_opinions)
colors.loc[~is_stock_opinion, 'yellow_color'] = 'Other'

plot.pie(colors, 'yellow_color', 'Color preferences')

In [None]:
# The 'color' column holds several suggestions per response separated by '|';
# explode_multiple_choice is given that delimiter to split them before counting.
color_suggestions = plot.explode_multiple_choice(colors, 'color', delim='|')
fig, _ = plot.horizontal_bar(color_suggestions, 
        title='Color Suggestions', 
        col='color', 
        other_threshold=2
)
fig.update_layout(width=500)
fig.show()

## Demographics Charts

In [None]:
age_order = ['Under 18', '18-25', '26-35', '36-45', '46+']
plot.pie(gd, 'age', trace_order=age_order, horizontal=True).show()

# Any gender answer outside the listed options (including a missing one) becomes 'Other'.
# This rewrites gd.gender in place, so later cells see the grouped values.
listed_genders = ['Male', 'Female', 'Prefer not to say', 'Non-binary']
unlisted_gender = ~gd.gender.isin(listed_genders)
gd.loc[unlisted_gender, 'gender'] = 'Other'
plot.pie(gd, 'gender', horizontal=True).show()

# Last expression of the cell: displayed by the notebook without .show().
plot.pie(gd, 'continent', horizontal=True)


In [None]:
# For each OS question: answers outside the named options (including missing ones)
# are grouped under 'Other' in gd itself, then the column is charted.
named_os_options = (
    ('desktop_os', ['Windows', 'Mac', 'Linux']),
    ('mobile_os', ['iOS', 'Android', 'Other']),
)
for os_column, named_options in named_os_options:
    gd.loc[~gd[os_column].isin(named_options), os_column] = 'Other'
    plot.pie(gd, os_column).show()


In [None]:
def _share_of_all_rows(mask):
    """Percentage of all gd rows that match mask and have a timestamp."""
    return gd[mask].timestamp.count() / len(gd.index) * 100


# Respondents who mix ecosystems between desktop and phone.
print('windows / iphone', _share_of_all_rows((gd.desktop_os == 'Windows') & (gd.mobile_os == 'iOS')))
print('macos / android', _share_of_all_rows((gd.desktop_os == 'Mac') & (gd.mobile_os == 'Android')))

In [None]:
# Normalized answers, with the Playdate itself dropped from the list.
indie_consoles = (
    pd.read_csv('normalized_short_responses/other_indie_consoles_cleaned.csv')
    .loc[lambda answers: answers.other_indie_consoles != 'Playdate']
)

# Percentage base: respondents who answered the question in the raw survey.
indie_answered = gd.other_indie_consoles.notna().sum()
fig, dtp = plot.horizontal_bar(
    indie_consoles,
    title='indie consoles',
    col='other_indie_consoles',
    total_responses=indie_answered,
    other_threshold=4,
)
fig.show()
print(indie_answered)

In [None]:
# other_consoles holds several consoles per response separated by ','; it is exploded
# before counting. The percentage base is the number of respondents who answered.
consoles = plot.explode_multiple_choice(gd, 'other_consoles', ',')
fig, dtp = plot.horizontal_bar(consoles, title='mainstream consoles', col='other_consoles', total_responses=gd.other_consoles.notna().sum())
fig.show()

In [None]:
content = pd.read_csv('normalized_short_responses/playdate_content_cleaned.csv')

# Percentage base: respondents who answered the question in the raw survey.
content_answered = gd.playdate_content.notna().sum()
fig, dtp = plot.horizontal_bar(
    content,
    title='playdate news and content',
    col='playdate_content',
    total_responses=content_answered,
    other_threshold=4,
    vertical=True,
)
fig.show()
print(content_answered)

## Dev tools

In [None]:
content = pd.read_csv('normalized_short_responses/dev_tools_cleaned.csv')

# Percentage base: respondents who answered the question in the raw survey.
dev_tools_answered = gd.dev_tools.notna().sum()
fig, dtp = plot.horizontal_bar(
    content,
    title='dev tools',
    col='dev_tools',
    total_responses=dev_tools_answered,
    other_threshold=1,
    vertical=False,
)
fig.show()
print(dev_tools_answered)

In [None]:
content = pd.read_csv('normalized_short_responses/dev_playdate_tools_cleaned.csv')

# Percentage base: respondents who answered the question in the raw survey.
playdate_tools_answered = gd.dev_playdate_tools.notna().sum()
fig, dtp = plot.horizontal_bar(
    content,
    title='dev tools',
    col='dev_playdate_tools',
    vertical=True,
    total_responses=playdate_tools_answered,
    other_threshold=1,
)
fig.show()
print(playdate_tools_answered)

## Game Tags Analysis

In [None]:
# Shorten one long tag label everywhere in gd. The pattern is a raw string because
# \( and \) are regex escapes; in a normal string literal they are invalid Python
# escape sequences and trigger a warning.
gd = gd.replace(r'Building \(Base, city ect\)', 'City Building', regex=True)

# One row per (respondent, tag) so that tags can be counted individually.
gt = plot.explode_multiple_choice(gd, 'game_tags', delim=',')
non_male = gt[gt.gender != 'Male']
male = gt[gt.gender == 'Male']

# Top 20 tags as a share of respondents who answered the question.
fig, data = plot.horizontal_bar(gt, 
        title='Game tags', 
        col='game_tags', 
        total_responses=gd.game_tags.notna().sum(),
        bot_n=None, top_n=20
)
fig.update_layout(
        autosize=True,
        # height=2000
)
fig.show()
# plot.create_wordcloud_format(data)

In [None]:
def tag_filter(label, data_slice):
    """Rank the game tags within one segment of respondents.

    Parameters
    ----------
    label : str
        Segment name, used as the prefix of the output columns.
    data_slice : pandas.DataFrame
        Rows of the survey frame belonging to the segment; must have a
        `game_tags` column.

    Returns
    -------
    pandas.DataFrame
        Columns `game_tags`, `{label}_rank` and `{label}_perc`, taken from the
        counted table returned by plot.horizontal_bar. Rank 1 is the tag with
        the highest `num`.
    """
    # Percentage base: respondents in the segment who answered the question.
    answered = data_slice.game_tags.notna().sum()
    exploded = plot.explode_multiple_choice(data_slice, 'game_tags', delim=',')

    # Only the counted table is needed; the figure is discarded.
    _, counted = plot.horizontal_bar(
        exploded,
        title='Game tags',
        col='game_tags',
        total_responses=answered,
        bot_n=None,
        top_n=None,
    )
    counted['rank'] = counted.num.rank(ascending=False)

    segment_names = {'rank': f'{label}_rank', 'percentage': f'{label}_perc'}
    return counted[['game_tags', 'rank', 'percentage']].rename(columns=segment_names)

 
def rank_changes(left, right, unit='rank', table=None):
    """Per-tag difference between two segments, sorted ascending.

    Parameters
    ----------
    left, right : str
        Segment labels; the columns `{left}_{unit}` and `{right}_{unit}` are read.
    unit : str, default 'rank'
        'rank' gives the plain difference `left - right`.
        'perc' gives the relative change of left over right, in percent.
    table : pandas.DataFrame, optional
        Frame holding a `game_tags` column plus the segment columns. Defaults to
        the notebook-level `combined` frame, so existing calls keep working.

    Returns
    -------
    pandas.Series
        Deltas indexed by game tag, smallest first. The variance of the deltas
        is printed as a side effect.
    """
    if table is None:
        # Fall back to the merged segment table built elsewhere in the notebook.
        table = combined

    left_values = table[f'{left}_{unit}']
    right_values = table[f'{right}_{unit}']

    if unit == 'perc':
        deltas = (left_values / right_values - 1) * 100
    else:
        deltas = left_values - right_values

    print(f'Variance: {np.round(deltas.var(),3)}')
    deltas.index = table.game_tags

    return deltas.sort_values()
    
# Number of tags each respondent picked. The .str accessor leaves unanswered (NaN)
# rows as NaN instead of raising, which a plain x.split(',') would do on a float.
gd['game_tags_size'] = gd.game_tags.str.split(',').str.len()

# Age buckets used to build the segments.
young = ('Under 18', '18-25' ) 
middle = ('26-35',)
old = ('36-45', '46+') 

# (label, respondents) pairs; the label becomes the column prefix in `combined`.
segments = [
    ('all', gd),
    ('nonmale', gd[gd.gender != 'Male']),
    ('female', gd[gd.gender == 'Female']),
    ('nonbinary', gd[gd.gender == 'Non-binary']),
    ('male', gd[gd.gender == 'Male']),
    ('young', gd[gd.age.isin(young)]),
    ('middle', gd[gd.age.isin(middle)]),
    ('old', gd[gd.age.isin(old)]),
    ('na', gd[gd.continent=='North America']),
    ('nonna', gd[gd.continent!='North America']),
    # These two segments previously filtered on desktop_os ('Mac' / 'Windows') while
    # being labelled as phone platforms; the filter now matches the label.
    # NOTE(review): if desktop segments were intended, rename the labels to
    # 'mac' / 'windows' and restore the desktop_os filters instead.
    ('iOS', gd[gd.mobile_os=='iOS']),
    ('android', gd[gd.mobile_os=='Android']),
]


# Start from the first segment, then join every other segment's rank and percentage
# columns onto it. The inner join keeps only tags that occur in every segment.
(base_label, base_slice), *other_segments = segments
combined = tag_filter(base_label, base_slice)

for segment_label, segment_slice in other_segments:
    segment_ranks = tag_filter(segment_label, segment_slice)
    combined = combined.merge(segment_ranks, on='game_tags', how='inner')
    print(f'{segment_label} avg set size {segment_slice.game_tags_size.mean()}')

combined


In [None]:
def difference_bar(group1, group2, diffs, exclude_threshold=8):
    """Horizontal bar chart of the tags whose delta between two groups is large.

    Parameters
    ----------
    group1, group2 : str
        Display names of the two groups, used in the legend and the axis title.
    diffs : pandas.Series
        Per-tag deltas indexed by tag (group1 value minus group2 value, per the
        axis title).
    exclude_threshold : number, default 8
        Tags whose absolute delta is below this value are left out.

    Returns
    -------
    The plotly figure; a positive delta is coloured as favoured by group2, any
    other delta as favoured by group1.
    """
    is_large = np.abs(diffs) >= exclude_threshold
    filtered = diffs[is_large]

    def favoured_by(delta):
        if delta > 0:
            return f'More favored by {group2}'
        return f'More favored by {group1}'

    plotdf = pd.DataFrame({'diffs':filtered}, index=filtered.index)
    # The legend column is named with a single space so the legend gets no visible title.
    plotdf[' '] = plotdf.diffs.apply(favoured_by)

    fig = px.bar(plotdf, y=plotdf.index, x=plotdf.diffs, barmode='relative', color=' ', orientation='h')
    fig.update_traces(base=0)
    fig.update_layout(
        height=800,
        font_size=16,
        yaxis_showgrid=True,
        yaxis_gridwidth=3,
        xaxis_title=f'Rank changes ({group1} ranks minus {group2} ranks)',
        legend=dict(orientation="h", xanchor="center",x=0.5, y=1.1),
    )
    return fig

# Each active call charts the rank deltas between two adjacent age segments.
# rank_changes reads the `combined` table built in the previous cell, so that cell
# must run first. The commented-out lines are alternative comparisons.
# difference_bar('male', 'non-male', rank_changes('male', 'nonmale'), exclude_threshold=7).show()
# difference_bar('non-males', 'males', rank_changes('nonmale', 'male')).show()
difference_bar('players 25 and younger', 'players 26-35', rank_changes('young', 'middle', unit='rank')).show()
difference_bar('players 26-35', 'players 36 and older', rank_changes('middle', 'old', unit='rank')).show()
# difference_bar('players in North America', 'players outside North America', rank_changes('na', 'nonna'), exclude_threshold=7).show()
