In [39]:
import pandas as pd
import plotly.offline as offline
import pickle

In [40]:
# Load the dictionary of pre-built figures and their accompanying text.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that was produced by a trusted step of this project.
fig_dict_path = 'fig_dict.pkl'
fig_dict = pd.read_pickle(fig_dict_path)

In [41]:
# HTML for the "Active Users" section: a fixed explanatory introduction followed
# by the single active-users chart with its title, description and extra info.
active_users = fig_dict['active_users']
active_users_fig = active_users['fig']
active_users_fig.layout.height = 500
active_users_plot_div = offline.plot(
    active_users_fig,
    include_plotlyjs=False,  # plotly.js is loaded once by the <script> tag in the final page
    output_type='div',
    show_link=False,
)
active_users_title = active_users['title']
# Anchor id for the chart heading: spaces become dashes, everything lowercased.
active_users_anchor = active_users_title.replace(' ', '-').lower()
# Kept as a one-element list; it is ''.join()-ed when the page is assembled.
active_users_html_div = [f"""
    <div class='chart-container'>
                         
        <h1 id="active-users">Active Users</h1>
        
        <p>In this section we start by looking at the number of users on the platform in the last 12 months. There are two main definitions of "user" in this report:</p>

        <p><b>All Users:</b> Anyone who's done something on our platform, even just logged in, is included in this group.</p>    

        <p><b>Users with Counted Activities:</b> A subset of users who have performed at least 1 of a number of activities in the last 12 months. These are specific actions users do, grouped into different types:</p>

        <ul>
            <li><b>Expense Activities:</b> This includes creating expenses (collective.expense.created) and using virtual cards (virtualcard.purchase).</li>
            <li><b>Host Admin Activities:</b> These are tasks by host admins. They cover a range from approving a collective to managing virtual cards and paying expenses.</li>
            <li><b>Collective Admin Activities:</b> These are actions by collective admins like applying for collective status or editing members.</li>
            <li><b>Contributor Activities:</b> We don't pull these from the activity log. Instead, we focus on user actions, like making or updating an order. We look at user-updated orders marked as "PAID", "ACTIVE", or "CANCELLED".</li>
        </ul>

        <p>Some activities aren't included because they don't tie to a specific user group, like rejecting an expense or commenting on one.</p>
                         
        <p>In the rest of this report we will only be considering the "Users with Counted Activities" group.</p>

        <h2 id="{active_users_anchor}">{active_users_title}</h2>
        <p>{active_users['description']}</p>
        {active_users_plot_div}
        <details>
            <summary>More info</summary>
            <div class='info-div-container'>{active_users['info']}</div>
        </details>
    </div>
    """]

# HTML for the "User Activity" section: one introductory header followed by a
# chart container per entry in fig_dict['activity_counts'].
activity_count_header = """
<h1 id="user-activity">User Activity</h1>
<p>Here we look at the number of users who have performed these counted activities in the last 12 months. In the pie charts below we show the fraction of users who have performed each activity as a percentage of all users who have performed any "counted" activity. In the bar chart "histograms" we count the number of users who have performed the activity X number of times.</p>
<p>Total Expenses is an aggregate of "Expense Activities", "Expense To Own Collective Activities" and "Virtual Card Purchases". Total Contributions is an aggregate of all contributor activities.</p>
"""
activity_counts_html_divs = [activity_count_header]
activity_count_titles = []  # titles in page order; reused to build the table of contents
for activity in fig_dict['activity_counts']:
    activity_fig = activity['fig']
    activity_fig.layout.height = 500
    activity_plot_div = offline.plot(
        activity_fig,
        include_plotlyjs=False,
        output_type='div',
        show_link=False,
        image_height=400,
    )
    activity_title = activity['title']
    # Anchor id must use the same slug rule as the table of contents links.
    activity_anchor = activity_title.replace(' ', '-').lower()
    activity_counts_html_divs.append(f"""
    <div class='chart-container'>
        <h2 id="{activity_anchor}">{activity_title}</h2>
        <p>{activity['description']}</p>
        {activity_plot_div}
        <details>
            <summary>▸ More info</summary>
            <div class='info-div-container'>{activity['info']}</div>
        </details>
    </div>
    """)
    activity_count_titles.append(activity_title)

# HTML for the "User Clusters" section: introduction, an overview treemap, then
# one chart container per entry in fig_dict['user_clusters'].
user_clusters_header = """
<h1 id="user-clusters">User Clusters</h1>
<p>Next, we group our users based on their typical activities on the platform. By doing this, we gain a clearer understanding of the diverse user categories we cater to on our platform.</p>

<p>For instance, our data indicates that nearly 9% of users regularly submit expenses, while about 2% act on behalf of organizations. But, does a set of users engage in both activities? If so, is this subset significant? Manually exploring every potential activity combination would be cumbersome. This is where clustering proves invaluable. Clustering groups users by similarities without us making excessive assumptions. The aim is to discern patterns and identify distinct user types.</p>

<p>From our analysis, 13 user groups emerged via clustering, each labeled by their defining characteristics. Delve into each cluster to see how users score on various activity metrics. A score of 1 in the "Collective admin" metric means that particular user tops the chart in collective admin activities platform-wide. Notably, these scores are logarithmic, ranging from 0 to 1. This scale enables us to equitably compare highly active users with their less active counterparts. For instance, a collective admin who approves 19 expenses annually might have a score of 0.47, while one that approves 190 expenses could score 0.85. Without such a scoring system, graphs would be less insightful due to the vast activity gap between the most active and the occasional users.</p>
"""
# Unlike the other entries, 'user_clusters_tree' is used directly as the figure
# object here (no nested 'fig' key is read).
cluster_treemap_fig = fig_dict['user_clusters_tree']
cluster_treemap_fig.layout.height = 600
cluster_treemap_div = offline.plot(
    cluster_treemap_fig,
    include_plotlyjs=False,
    output_type='div',
    show_link=False,
    image_height=500,
)
user_clusters_html_divs = [user_clusters_header, cluster_treemap_div]
user_cluster_titles = []  # titles in page order; reused to build the table of contents
for cluster in fig_dict['user_clusters']:
    cluster_fig = cluster['fig']
    cluster_fig.layout.height = 600
    cluster_plot_div = offline.plot(
        cluster_fig,
        include_plotlyjs=False,
        output_type='div',
        show_link=False,
        image_height=400,
    )
    cluster_title = cluster['title']
    # Anchor id must use the same slug rule as the table of contents links.
    cluster_anchor = cluster_title.replace(' ', '-').lower()
    user_clusters_html_divs.append(f"""
    <div class='chart-container'>
        <h2 id="{cluster_anchor}">{cluster_title}</h2>
        <p>{cluster['description']}</p>
        {cluster_plot_div}
    </div>
    """)
    user_cluster_titles.append(cluster_title)

# HTML for the TSNE scatter plot of the user clusters (a single, taller chart).
tsne_plot = fig_dict['tsne_result']
tsne_fig = tsne_plot['fig']
tsne_fig.layout.height = 800
tsne_plot_div = offline.plot(
    tsne_fig,
    include_plotlyjs=False,
    output_type='div',
    show_link=False,
    image_height=800,
)
tsne_description = tsne_plot['description']
tsne_html = f"""
<div class='chart-container'>
    <h2 id="user-clusters-tsne">User Clusters: TSNE Plot</h2>
    <p>Here we show a plot of the user clusters. The plot is interactive, so you can zoom in and hover over the points to see the user cluster labels.</p>
    <p>{tsne_description}</p>
    {tsne_plot_div}
</div>
"""

# HTML for the "User Profiles" section: introduction followed by one chart
# container per entry in fig_dict['user_profiling'], each with a collapsible
# block holding the entry's 'info' markup.
user_profiles_header = """
<h1 id="user-profiles">User Profiles</h1>
<p>Here we look at the user profiles for each of the user clusters. We show the top 8 users in some selected clusters, based on the cluster's defining characteristics. This helps us get a grasp of who the users are.</p>
"""
user_profiles_html_divs = [user_profiles_header]
user_profiles_titles = []  # titles in page order; reused to build the table of contents
for profile in fig_dict['user_profiling']:
    profile_fig = profile['fig']
    profile_fig.layout.height = 1400
    profile_plot_div = offline.plot(
        profile_fig,
        include_plotlyjs=False,
        output_type='div',
        show_link=False,
        image_height=400,
    )
    profile_title = profile['title']
    # Anchor id must use the same slug rule as the table of contents links.
    profile_anchor = profile_title.replace(' ', '-').lower()
    user_profiles_html_divs.append(f"""
    <div class='chart-container'>
        <h2 id="{profile_anchor}">{profile_title}</h2>
        <p>{profile['description']}</p>
        {profile_plot_div}
        <details>
            <summary>▸ Links to user profiles on platform</summary>
            <div class='info-div-container'>{profile['info']}</div>
        </details>
    </div>
    """)
    user_profiles_titles.append(profile_title)

# HTML for the "Collective Analysis" section: the preprocessing explanation plus
# the two treemaps of expense amount and expense count by host and collective.
expense_amount_by_host = fig_dict['treemaps']['expense_amount_by_host_and_collective']
expense_count_by_host = fig_dict['treemaps']['expense_count_by_host_and_collective']
fig1 = expense_amount_by_host['fig']
fig1.layout.height = 500
div1 = offline.plot(fig1, include_plotlyjs=False, output_type='div', show_link=False, image_height=400)
fig2 = expense_count_by_host['fig']
fig2.layout.height = 500
div2 = offline.plot(fig2, include_plotlyjs=False, output_type='div', show_link=False, image_height=400)
# The slug pattern in the fourth bullet is written with &lt; / &gt; entities:
# raw "<string>" / "<number>" would be parsed by the browser as unknown HTML
# tags and the pattern would not be shown as text.
collective_analysis_header = """
<h1 id="collective-analysis">Collective Analysis</h1>
<p>Next we zoom in on collectives and their expenses and income. We will pay particular attention to relationships between money flow, host fee and platform tips.</p>

<p>To create the graphs below we needed to preprocess the data. In lay terms, think of this preprocessing as cleaning and organizing a messy room. Unnecessary items (like zero amounts or spammers) were thrown out, similar items (like transactions of the same collective) were grouped together, everything was labeled in a consistent manner, and some items (like internal transactions) were kept aside as they weren't relevant to the main analysis. The end goal is to have a clean and organized dataset that's easier to work with and understand.</p>

<p>We explain in short here what steps were taken:</p>

<ul>
    <li>For both expenses and income data, if the transaction wasn't already in USD, it was converted using a fixed exchange rate. This standardizes the currency across all transactions, making comparisons and aggregations straightforward.</li>
    <li>Any collective in the expenses data that had fewer than 5 transactions and was its own host was removed. Additionally, any collectives in the income data matching these dropped collectives were also removed. This was done to remove noise of people 'trying out' the platform and adding contributions and expenses to test hosts and collectives.</li>
    <li>In the income data, if the total number of transactions for a collective was less than 5, and if all payment methods for that collective were of type 'host', then these rows were removed. This is done likely to remove any noise or irrelevant data.</li>
    <li>Rows in both expenses and income datasets having a specific pattern in their slug (ending with &lt;string&gt;-&lt;number&gt;.&lt;number&gt;) were removed. This is because this pattern was identified as belonging to spammers.</li>
    <li>Any transactions in both datasets where the host collective was 'opencollective' were removed. This filters out internal payments to the OC Inc team.</li>
</ul>

<p>This resulted in a dataset of all expenses and contributions to all real collectives on the platform for the last 12 months.</p>
"""
expense_amount_and_count_by_host_html = f"""
<div class='chart-container'>
    <h2 id="collective-expenses">Collective Expenses</h2>

    <p>In these first two graphs we look at the total amount and count of expenses by host and collective. We look at all hosts and collectives in a chart where the size of the square is proportional to its value.</p>
    {div1}
    
    {div2}
</div>
"""
# Header first, then the chart container; joined when the page is assembled.
collective_analysis_html_divs = [collective_analysis_header, expense_amount_and_count_by_host_html]

# HTML for the two treemaps of expense amount and expense count grouped by host fee.
expense_amount_by_host_fee = fig_dict['treemaps']['expense_amount_by_host_fee']
expense_count_by_host_fee = fig_dict['treemaps']['expense_count_by_host_fee']
# Render both treemaps the same way, amount first, then count.
host_fee_plot_divs = []
for host_fee_treemap in (expense_amount_by_host_fee, expense_count_by_host_fee):
    host_fee_fig = host_fee_treemap['fig']
    host_fee_fig.layout.height = 500
    host_fee_plot_divs.append(
        offline.plot(
            host_fee_fig,
            include_plotlyjs=False,
            output_type='div',
            show_link=False,
            image_height=400,
        )
    )
host_fee_amount_div, host_fee_count_div = host_fee_plot_divs
expense_amount_and_count_by_host_fee_html = f"""
<div class='chart-container'>
    <h2 id="collective-expenses-by-hostfee">Expenses by Host Fee</h2>
    <p>Here we look at the total amount and count of expenses by host fee. We look at all hosts and collective in a chart where the size of the square is proportional to its value.</p>
    <p>Notice that in both cases, the zero-host-fee category is second largest by both counts and amount.</p>
    {host_fee_amount_div}

    {host_fee_count_div}
</div>
"""

# HTML for the "Tip Potential" treemap: income amount grouped by host and
# payment method. (Fixes the misspelling "investiging" in the published text.)
income_by_host_and_method = fig_dict['treemaps']['income_amount_by_host_percentage_and_method']
fig = income_by_host_and_method['fig']
fig.layout.height = 500
div = offline.plot(fig, include_plotlyjs=False, output_type='div', show_link=False, image_height=400)
income_by_host_and_method_html = f"""
<div class='chart-container'>
    <h2 id="collective-income-by-host-and-method">Tip Potential: Income by Host and Method</h2>
    <p>How large is the tip potential for each host? Here we look at the total amount of income by host and method. Tips are only possible for contributions, and are unlikely when organizations make larger donations via bank transfer.</p>
    <p>In this chart we categorize the income by host and method. We then color the methods according to their potential for tipping. By doing so we are investigating, particularly for the zero-host-fee hosts, what the potential for tipping is when all contributions are considered.</p>

    {div}
</div>
"""

# HTML for the final treemap: tip income by host and collective.
tip_amount_amount_by_host_fee_and_host_and_collective = fig_dict['treemaps']['tip_amount_amount_by_host_fee_and_host_and_collective']
tips_fig = tip_amount_amount_by_host_fee_and_host_and_collective['fig']
tips_fig.layout.height = 500
tips_plot_div = offline.plot(
    tips_fig,
    include_plotlyjs=False,
    output_type='div',
    show_link=False,
    image_height=400,
)
tip_amount_amount_by_host_fee_and_host_and_collective_html = f"""
<div class='chart-container'>
    <h2 id="collective-tips">Tip Income: Tip Income by Host and Collective</h2>
    <p>Finally, we look at the total amount of tips by host and collective. We color the squares by if the collective is on a host with a host fee or not.</p>

    {tips_plot_div}
</div>
"""

# Table of contents HTML
def _toc_list_items(titles):
    """Return one ``<li>`` anchor link per title, concatenated into one string.

    The href uses the same slug rule as the section headings (spaces replaced
    by dashes, then lowercased) so each link targets the matching ``<h2>`` id.
    """
    return ''.join(
        f'<li><a href="#{title.replace(" ", "-").lower()}">{title}</a></li>'
        for title in titles
    )


# Build each group of links up front, in the order the sections appear on the page.
active_users_toc_items = _toc_list_items([fig_dict["active_users"]["title"]])
activity_count_toc_items = _toc_list_items(activity_count_titles)
user_cluster_toc_items = _toc_list_items(user_cluster_titles)
user_profiles_toc_items = _toc_list_items(user_profiles_titles)

# Fixed sidebar: dynamic entries come from the collected titles, the
# "Collective Analysis" entries are hard-coded to match the fixed heading ids.
toc_html = f"""
<div class="side-toc">
    <h2>Contents</h2>

    <h3><a href="#active-users">Active Users</a></h3>
    <ul>
        {active_users_toc_items}
    </ul>

    <h3><a href="#user-activity">User Activity</a></h3>
    <ul>
        {activity_count_toc_items}
    </ul>

    <h3><a href="#user-clusters">User Clusters</a></h3>
    <ul>
        {user_cluster_toc_items}
    </ul>

    <h3><a href="#user-clusters-tsne">User Clusters: TSNE Plot</a></h3>

    <h3><a href="#user-profiles">User Profiles</a></h3>
    <ul>
        {user_profiles_toc_items}
    </ul>

    <h3><a href="#collective-analysis">Collective Analysis</a></h3>
    <ul>
        <li><a href="#collective-expenses">Collective Expenses</a></li>
        <li><a href="#collective-expenses-by-hostfee">Expenses by Host Fee</a></li>
        <li><a href="#collective-income-by-host-and-method">Tip Potential: Income by Host and Method</a></li>
        <li><a href="#collective-tips">Tip Income: Tip Income by Host and Collective</a></li>
    </ul>

</div>
"""

# Update the CSS styling for TOC and font
# Inline <style> block that is interpolated into the <head> of the final page.
# This is a plain (non-f) string, so the CSS braces need no escaping.
# NOTE(review): the `.toc-container` rules do not match any class emitted by the
# report code visible here (the table of contents uses `.side-toc`) — possibly
# leftover styles; confirm before removing.
# NOTE(review): `body > p` only matches paragraphs that are direct children of
# <body>; the page template visible here wraps the introduction paragraph in a
# <div>, so this rule may never apply — confirm the intended selector.
# NOTE(review): the `/* ... [rest of your body styles] */` CSS comment looks like
# a leftover placeholder — confirm whether more body styles were intended.
css_styles = """
<style>
    @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;500&display=swap');
    
    body {
        font-family: 'Roboto', sans-serif;
        /* ... [rest of your body styles] */
    }

    .toc-container {
        background-color: #fff;
        border-radius: 8px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        margin: 20px auto;
        padding: 20px;
        max-width: 90%;
    }

    .toc-container ul {
        padding-left: 20px;
    }

    .toc-container li {
        margin-bottom: 10px;
    }

    .toc-container a {
        text-decoration: none;
        color: #000;
    }

    .toc-container a:hover {
        text-decoration: underline;
    }

    /* Side Table of Contents */
    .side-toc {
        position: fixed;
        top: 20px;
        left: 20px;
        width: 250px;
        background-color: #fff;
        border-radius: 8px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        padding: 20px;
        max-height: 80vh;
        overflow-y: auto;
    }

    /* Main content styling */
    h1 {
        padding: 40px 0 10px 0;
        margin: 0;
    }

    h2 {
        margin: 70px 0 0 0;
    }

    body > p {  /* Introduction paragraph */
        text-align: justify;
        padding: 0 15% 40px 15%; 
        max-width: 70%;
        margin: 0 auto;
    }

    details {
        border-top: 1px solid #ddd;
        margin-top: 10px;
    }

    summary {
        cursor: pointer;
        color: #000;
    }

    summary:hover {
        text-decoration: underline;
    }

    .info-div-container {
        margin-top: 10px;
        padding-top: 10px;
    }
</style>
"""

# Assemble the complete page: <head> with the stylesheet and the plotly.js
# loader, the introduction, the fixed sidebar table of contents, and then every
# report section in reading order.
#
# The page text contains non-ASCII characters (e.g. the "▸" markers in the
# <summary> labels), so the document declares its encoding with <meta charset>
# and is written explicitly as UTF-8 below; relying on the platform default
# encoding can fail or produce mojibake on non-UTF-8 locales.
#
# NOTE(review): per the plotly.js documentation the "plotly-latest" CDN bundle is
# no longer updated and stays at v1.58.5. The chart <div>s are generated with
# include_plotlyjs=False and depend on this <script>; consider pinning the CDN
# URL to the plotly.js version bundled with the installed plotly.py.
final_html = f"""
<html>
    <head>
        <meta charset="utf-8">
        <title>Retreat Report: Users, Hosts and Collectives</title>
        <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
        {css_styles}
    </head>
    <body>
        <div style="margin-left: 350px;">
            <h1>Retreat Report: Users, Hosts and Collectives</h1>
            <p>This report was written in September 2023 ahead of the Open Collective team retreat in Bath. Its author is Hugi Asgeirsson. <a href="https://github.com/opencollective/profile-report" target="_blank">Source code to generate this report is available here.</a></p>
        </div>
        {toc_html}
        <div style="margin: 0 70px 0 350px;">
            {''.join(active_users_html_div)}
            {''.join(activity_counts_html_divs)}
            {''.join(user_clusters_html_divs)}
            {tsne_html}
            {''.join(user_profiles_html_divs)}
            {''.join(collective_analysis_html_divs)}
            {expense_amount_and_count_by_host_fee_html}
            {income_by_host_and_method_html}
            {tip_amount_amount_by_host_fee_and_host_and_collective_html}
        </div>
    </body>
</html>
"""

# Write the final HTML to a file, encoded to match the declared charset.
with open("retreat_report.html", "w", encoding="utf-8") as f:
    f.write(final_html)