In [None]:
import pandas as pd

In [None]:
# Load the pickled fig_dict; later cells add figure entries to it
fig_dict = pd.read_pickle(filepath_or_buffer='fig_dict.pkl')

In [None]:
# Read the pickled table of active users and derive the values used by later cells:
# the total number of rows, the per-row cluster ids, and the largest cluster id.
active_users_df = pd.read_pickle(filepath_or_buffer='active_users_df.pkl')
active_users_number = len(active_users_df)
clusters = active_users_df['cluster']
nc = clusters.max()

In [None]:
# Human-readable names for each cluster id.
# NOTE: these labels are assigned by hand and must be revisited whenever the
# clustering is re-run and the clusters are re-analyzed.
#
# Maps cluster id -> (short name, full display name)
cluster_names = {
    1: ('dir_cont_single', 'One Time Contributors'),
    2: ('col_ad_basic', 'Basic Collective Admins'),
    3: ('host_ad_basic', 'Basic Host Admins'),
    4: ('event_cont', 'Event Participants'),
    5: ('col_ad_cont_self', 'Collective Admins Contribute to Own Collectives'),
    6: ('org_contr', 'Organization Contributors'),
    7: ('exp_sub', 'Expense Submitters'),
    8: ('col_ad_cont_via_col', 'Collective Admins with Contributions via Collective'),
    9: ('host_ad_high_acti', 'Highly Active Host Admins'),
    10: ('col_ad_exp_self', 'Collective Admins Expenses to Own Collectives'),
    11: ('host_ad_mod_act', 'Moderately Active Host Admins'),
    12: ('dir_contr_repeat', 'Repeated Direct Contribution Activities'),
    13: ('host_ad_low_act', 'Low Activity Host Admins'),
}

# Tree Map of cluster sizes

In [None]:
import plotly.express as px

# Count the users assigned to each cluster id (1..nc), in id order
cluster_sizes = [
    int((active_users_df['cluster'] == cluster_num).sum())
    for cluster_num in range(1, nc+1)
]

# Full display name of each cluster, in the same id order as the sizes
cluster_labels = [cluster_names[cluster_num][1] for cluster_num in range(1, nc+1)]

# Two-column table (label, size) that feeds the treemap
cluster_data = dict(Clusters=cluster_labels, Sizes=cluster_sizes)
df = pd.DataFrame(cluster_data)

# One tile per cluster, with area proportional to the number of users in it
treemap_title = f'Distribution of Active Users (n={active_users_number:,}) Into Cohorts'
fig = px.treemap(df, path=['Clusters'], values='Sizes', title=treemap_title)

# Print the cluster label and its user count on each tile and in the hover box
fig.update_traces(
    textinfo="label+value",
    hovertemplate='<b>%{label}</b><br>%{value}',
)

fig.show()

# Register the treemap in fig_dict
fig_dict['user_clusters_tree'] = dict(fig=fig)

# Radar charts for average user in each cluster

In [None]:
# Build cluster_mean_scores: for every cluster id, the MEAN of each
# '<activity>_norm_log' column over the users assigned to that cluster.

clustering_cols = ["expense_activities", "expense_to_own_collective_activities", "host_admin_activities", "collective_admin_activities", "direct_contributions", "collective_contributions", "contributions_to_own_collective", "contributions_via_host", "organization_contributions", "event_orders"]
activity_features = [f + '_norm_log' for f in clustering_cols]

# One row per cluster id in 1..max(cluster id), one column per activity feature.
# A single groupby replaces growing the frame one row at a time; reindex keeps a
# (NaN) row for any id in the range that has no users, so every id is present.
cluster_mean_scores = (
    active_users_df
    .groupby('cluster')[activity_features]
    .mean()
    .reindex(range(1, clusters.max() + 1))
    .rename_axis(None)
)

# Put the short and full cluster names in front of the score columns.
# A cluster id missing from cluster_names raises KeyError here.
cluster_mean_scores.insert(0, 'cluster_name', [cluster_names[cluster_id][0] for cluster_id in cluster_mean_scores.index])
cluster_mean_scores.insert(1, 'cluster_full_name', [cluster_names[cluster_id][1] for cluster_id in cluster_mean_scores.index])

In [None]:
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Draw one radar (polar) chart per cluster showing its mean activity scores,
# laid out on a two-column grid, then display it and export it as HTML.

# Subplot titles: full cluster name, a line break, the short name, then trailing
# line breaks that add space between the title and the chart.
subplot_titles = [
    f'{full_name}<br>{short_name} <br><br> '
    for full_name, short_name in zip(cluster_mean_scores['cluster_full_name'], cluster_mean_scores['cluster_name'])
]

# Grid size: two columns (one if there is a single cluster), as many rows as needed
num_clusters = len(cluster_mean_scores)
num_rows = math.ceil(num_clusters / 2)
num_cols = 2 if num_clusters > 1 else 1  # Use 1 column if there's only one cluster

# Every grid cell is a polar subplot
fig = make_subplots(rows=num_rows, cols=num_cols,
                    subplot_titles=subplot_titles,
                    specs=[[{'type': 'polar'} for _ in range(num_cols)] for _ in range(num_rows)],
                    horizontal_spacing=0.05,
                    vertical_spacing=0.08
                    )

# Axis labels, listed in the same order as activity_features
custom_titles = ["Expense<br>to other", "Expense<br>to own", "Host admin", "Collective admin", "Direct<br>contributions", "Collective<br>contributions", "Contribute<br>to own", "Contributions<br>via host", "Organization<br>contributions", "Event Orders"]
assert len(custom_titles) == len(activity_features), "one axis label is required per activity feature"

for position, (_, row) in enumerate(cluster_mean_scores.iterrows()):
    # Fill the grid left to right, top to bottom (plotly grid indices are 1-based)
    grid_row, grid_col = divmod(position, num_cols)

    trace = go.Scatterpolar(
        # Select the scores by column name so the values always line up with
        # custom_titles, independent of the column order of cluster_mean_scores
        r=row[activity_features].tolist(),
        theta=custom_titles,
        fill='toself',
        name=row['cluster_name'],
        showlegend=False
    )
    fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Hide radial tick labels and use one fixed radial range on every polar subplot
# (including an empty trailing cell when the number of clusters is odd) so the
# charts are directly comparable.
fig.update_polars(radialaxis=dict(showticklabels=False, range=[0, 0.75]))

# Scale the figure height with the number of rows; the large top margin leaves
# room for the title annotation added below
fig.update_layout(
    height=600 * num_rows,
    width=1000,
    margin=dict(t=200, b=100, r=50, l=50)
)

# Add the figure title as an annotation positioned above the subplot grid
fig.add_annotation(
        x=0.5,
        y=1.18,
        xref='paper',
        yref='paper',
        text='Average Activity Scores by Cluster',
        showarrow=False,
        font=dict(
                family="Courier New, monospace",
                size=40,
                color="black",
                ),
        align="center",
        opacity=1
        )

# Show plot
fig.show()

# export figure as an interactive html file
fig.write_html("cluster_activity_scores.html")


# t-SNE dimensionality reduction to visualize cluster separation

In [None]:
from sklearn.manifold import TSNE
import pickle

# Set to True to recompute the embedding (and overwrite the cached pickle);
# when False the embedding is read from 'tsne_result.pkl'.
run_tsne = False
# Fixed seed so that recomputing the embedding gives a reproducible layout
tsne_random_state = 42

clustering_features = ["expense_activities_yeojohnson", "expense_to_own_collective_activities_yeojohnson", "host_admin_activities_yeojohnson", "collective_admin_activities_yeojohnson", "direct_contributions_yeojohnson", "collective_contributions_yeojohnson", "contributions_to_own_collective_yeojohnson", "contributions_via_host_yeojohnson", "organization_contributions_yeojohnson", "event_orders_yeojohnson", "host_admin_acivity_modifier_yeojohnson"]

X = active_users_df[clustering_features].values
y = active_users_df['cluster_full_name'].values

# We want to get TSNE embedding with 2 dimensions
n_components = 2

if run_tsne:
    tsne = TSNE(n_components=n_components, random_state=tsne_random_state)
    tsne_result = tsne.fit_transform(X)
    with open('tsne_result.pkl', 'wb') as f:
        pickle.dump(tsne_result, f)
else:
    tsne_result = pd.read_pickle('tsne_result.pkl')

# The embedding is matched to active_users_df purely by row position, so a cache
# computed from a different set of users must not be used.
if len(tsne_result) != len(active_users_df):
    raise ValueError(
        f"tsne_result has {len(tsne_result)} rows but active_users_df has {len(active_users_df)}; "
        "the cached embedding is stale - set run_tsne = True to recompute it"
    )

tsne_result_df = pd.DataFrame(tsne_result, columns=['TSNE 1', 'TSNE 2'])

# Columns shown in the hover box, paired with their display label. The order of
# this list defines the customdata index used in the hover template below.
hover_fields = [
    ('user_slug', 'User'),
    ('expense_activities', 'Expense Activities'),
    ('expense_to_own_collective_activities', 'Expense to Own Collective Activities'),
    ('host_admin_activities', 'Host Admin Activities'),
    ('collective_admin_activities', 'Collective Admin Activities'),
    ('direct_contributions', 'Direct Contributions'),
    ('collective_contributions', 'Collective Contributions'),
    ('contributions_to_own_collective', 'Contributions to Own Collective'),
    ('contributions_via_host', 'Contributions via Host'),
    ('organization_contributions', 'Organization Contributions'),
    ('event_orders', 'Event Orders'),
    ('label', 'Cluster'),
]

# Copy the user slug and raw activity counts next to the embedding (by row
# position, hence .values), then add the full cluster name as the label
for column, _ in hover_fields:
    if column != 'label':
        tsne_result_df[column] = active_users_df[column].values
tsne_result_df['label'] = y

# Create an interactive scatter plot using Plotly, colored by cluster
fig = px.scatter(tsne_result_df,
                x='TSNE 1',
                y='TSNE 2',
                color='label',
                width=900,
                height=900,
                hover_data=[column for column, _ in hover_fields],
                )

# Set legend title to Cluster
fig.update_layout(legend_title_text='Cluster')

# Use the same range on both axes (overall min/max of the embedding, padded by 20)
# to maintain the aspect ratio
lim = (tsne_result.min()-20, tsne_result.max()+20)
fig.update_layout(xaxis=dict(range=lim), yaxis=dict(range=lim))

# Hover box: one "<label>: <value>" line per entry of hover_fields, in list order
hover_template = '<br>'.join(
    f'{label}: %{{customdata[{position}]}}'
    for position, (_, label) in enumerate(hover_fields)
)
fig.update_traces(hovertemplate=hover_template)

fig.update_traces(marker=dict(size=3))

# Horizontal legend anchored just above the plot area, aligned to the right edge
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
    ))

fig.show()

In [None]:
# Register the t-SNE scatter plot in fig_dict together with its title and an
# HTML description; 'info' is left as an empty string.
fig_dict['tsne_result'] = dict(
    fig=fig,
    title='Visualization of User Clusters in 2D Space',
    description="""
        <p>It's hard for humans to visualize or understand anything beyond 3 dimensions. We are trying to understand our users by comparing 10 different metrics. By reducing the data to 2 dimensions, we can see and comprehend patterns in a way our brains can handle, like looking at a map. Think of this as a high-tech magnifying glass. When you look through this magnifying glass at your massive table of users and their 10 activities, it shows you a simpler view: a vast park with people (users) standing around.</p>

        <ul>
            <li>Each user is a person standing in the park.</li>
            <li>Keeping Friends Together: If two users mostly "expense to other" and often "contribute to own", they'd stand close together in the park because they behave similarly.</li>
            <li>Keeping Strangers Apart: If another user mainly does "host admin" tasks and rarely has "direct contributions", they'd be at a different part of the park from the first two.</li>
        </ul>

        <p>Now also imagine that in this park, users wear colored hats based on their dominant activity types:</p>

        <ul>
            <li>Users in green hats mainly do "Expense to other" and "Expense to own".</li>
            <li>Users in yellow hats are heavy on "Direct contributions" and "Collective contributions".</li>
            <li>... (and so on).</li>
        </ul>

        <p>By examining the t-SNE "park view":</p>

        <ul>
            <li>You See Patterns: A cluster of green hats shows you many users with similar expense behaviors.</li>
            <li>Identify Oddities: A green hat standing amongst a sea of yellow hats? That user might be an outlier worth examining.</li>
            <li>Understand User Types: You get a clear visual of the different types of users and their primary activities.</li>
        </ul>

        <p>Note that you will not see every user in this plot because users with exactly the same activity counts will overlap completely. For example, you will not see 60,000+ different users who have only made one contribution as they are all represented by the same point.</p>

    """,
    info='',
)

# Radar charts for users in segments 

In [None]:
# Pick the example users that are profiled individually in the radar charts below:
# the most active users of selected clusters, plus a random sample drawn from
# several other clusters.

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

# Number of users selected per group
users_per_cluster = 8
# Fixed seed so the random picks are the same on every run
sample_seed = 42


def top_users_in_cluster(cluster_name, sort_col, n):
    """Return the (at most) n users of the cluster with the highest sort_col value.

    cluster_name is matched against active_users_df['cluster_name'].
    """
    members = active_users_df[active_users_df['cluster_name'] == cluster_name]
    return members.sort_values(by=sort_col, ascending=False).head(n)


def random_users_in_cluster(cluster_name, n, seed=sample_seed):
    """Return n randomly chosen users of the cluster, or all of them if it has fewer than n.

    cluster_name is matched against active_users_df['cluster_name'].
    """
    members = active_users_df[active_users_df['cluster_name'] == cluster_name]
    # DataFrame.sample raises if asked for more rows than exist, so cap the size
    return members.sample(min(n, len(members)), random_state=seed)


# Host admin clusters: users with the most host admin activities
host_ad_high_acti = top_users_in_cluster('host_ad_high_acti', 'host_admin_activities', users_per_cluster)
host_ad_mod_act = top_users_in_cluster('host_ad_mod_act', 'host_admin_activities', users_per_cluster)

# Collective admin clusters: users with the most collective admin activities
col_ad_basic = top_users_in_cluster('col_ad_basic', 'collective_admin_activities', users_per_cluster)
col_ad_cont_via_col = top_users_in_cluster('col_ad_cont_via_col', 'collective_admin_activities', users_per_cluster)

# Expense submitters: users with the most expense activities
exp_sub = top_users_in_cluster('exp_sub', 'expense_activities', users_per_cluster)

# Random picks from each of: event_cont, col_ad_cont_self, col_ad_exp_self,
# host_ad_low_act, dir_contr_repeat, exp_sub
event_cont_random = random_users_in_cluster('event_cont', users_per_cluster)
col_ad_cont_self_random = random_users_in_cluster('col_ad_cont_self', users_per_cluster)
col_ad_exp_self_random = random_users_in_cluster('col_ad_exp_self', users_per_cluster)
host_ad_low_act_random = random_users_in_cluster('host_ad_low_act', users_per_cluster)
dir_contr_repeat_random = random_users_in_cluster('dir_contr_repeat', users_per_cluster)
exp_sub_random = random_users_in_cluster('exp_sub', users_per_cluster)

# Pool the random picks and draw the final mixed sample from the pool
random_pool = pd.concat([event_cont_random, col_ad_cont_self_random, col_ad_exp_self_random, host_ad_low_act_random, dir_contr_repeat_random, exp_sub_random])
top_users_sample = random_pool.sample(min(users_per_cluster, len(random_pool)), random_state=sample_seed)

In [None]:
# Groups of users to profile, one figure per entry.
# Each entry is (users dataframe, figure title, activity the users were ranked by);
# the final entry is the mixed random sample and carries the marker 'random'.
_host_ranking = 'host admin activities'
_collective_ranking = 'collective admin activities'

df_title_tuples = [
    # Host admin clusters
    (host_ad_high_acti, 'Host admin: High activity', _host_ranking),
    (host_ad_mod_act, 'Host admin: Moderate activity', _host_ranking),
    # Collective admin clusters
    (col_ad_basic, 'Collective admin: Basic', _collective_ranking),
    (col_ad_cont_via_col, 'Collective admin: Contributions via collective', _collective_ranking),
    # Expense submitters
    (exp_sub, 'Expense submitter', 'expense activities'),
    # Random sample across clusters
    (top_users_sample, 'Random sample of users', 'random'),
]

In [None]:
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# For every user group in df_title_tuples, draw a grid of radar charts (one per
# user) of that user's activity scores, show it, and append it to
# fig_dict['user_profiling'] with a title, description and profile links.

fig_dict['user_profiling'] = []

# Axis labels, listed in the same order as activity_features
custom_titles = ["Expense<br>to other", "Expense<br>to own", "Host admin", "Collective admin", "Direct<br>contributions", 
                 "Collective<br>contributions", "Contribute<br>to own", "Contributions<br>via host", "Organization<br>contributions", "Event Orders"]

for top_users_df, title, sortby in df_title_tuples:
    num_users = len(top_users_df)
    if num_users == 0:
        # make_subplots cannot build a grid with zero rows, so skip empty groups
        print(f'Skipping "{title}": no users to plot')
        continue

    num_rows = math.ceil(num_users / 2)
    num_cols = 2 if num_users > 1 else 1

    # Use user_slug for subplot titles
    user_slugs = [str(user_slug) for user_slug in top_users_df['user_slug']]

    fig = make_subplots(rows=num_rows, cols=num_cols,
                        specs=[[{'type': 'polar'} for _ in range(num_cols)] for _ in range(num_rows)],
                        horizontal_spacing=0.05,
                        vertical_spacing=0.1,
                        subplot_titles=[slug + '<br> <br> <br>' for slug in user_slugs])

    # Place users by their row position so that each user gets their own subplot
    # even if a slug appears more than once
    for position, (_, user) in enumerate(top_users_df.iterrows()):
        grid_row, grid_col = divmod(position, num_cols)

        user_scores = [user[feature] for feature in activity_features]

        trace = go.Scatterpolar(
            r=user_scores,
            theta=custom_titles,
            fill='toself',
            name=f"User {position + 1}",
            showlegend=False
        )

        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

    # Hide radial tick labels and give every chart in this figure the same radial
    # range: the highest score of any user in the group plus a small margin.
    # Computed once per figure rather than once per subplot.
    radial_max = float(top_users_df[activity_features].max().max()) + 0.05
    fig.update_polars(radialaxis=dict(showticklabels=False, range=[0, radial_max]))

    # The random sample is not ranked by anything, so it gets its own wording
    if sortby == 'random':
        fig_title = title
        description = f"""
            <p>These are {num_users} users picked at random from several clusters.</p>
        """
    else:
        fig_title = f'{title}, sorted by most {sortby}'
        description = f"""
            <p>These are the top {num_users} users in the {title} cluster, sorted by most {sortby}.</p>
        """

    # Update layout to adjust the size of the figure and reduce margins
    fig.update_layout(
        height=400 * num_rows,
        width=1000,
        margin=dict(t=180, b=80, r=50, l=50),
        title=fig_title,
    )

    fig.show()

    fig_dict['user_profiling'].append({
        'fig': fig,
        'title': title,
        'description': description,
        # Comma-separated links to each user's Open Collective profile page
        'info': ", ".join(f'<a href="https://opencollective.com/{slug}" target="_blank">{slug}</a>' for slug in user_slugs)
    })


In [None]:
# Write the updated fig_dict back to 'fig_dict.pkl', replacing the file's contents
with open('fig_dict.pkl', 'wb') as pickle_file:
    pickle.dump(fig_dict, pickle_file)