In [None]:
import pandas as pd
import plotly.offline as offline
import pickle

In [None]:
# Load the pre-built figure dictionary from the local file 'fig_dict.pkl'.
# Later cells index it by keys such as 'active_users', 'activity_counts' and
# 'treemaps' when assembling the report sections.
# NOTE(review): unpickling can execute arbitrary code - only load a pickle that
# you (or a trusted pipeline) produced.
fig_dict = pd.read_pickle('fig_dict.pkl')

In [None]:
def set_fig_height(fig, height):
    """Assign ``height`` to ``fig.layout.height`` and hand the same figure back.

    The figure is modified in place; returning it lets the call be used
    inline inside a larger expression.
    """
    setattr(fig.layout, 'height', height)
    return fig

def get_html_section(data, include_figs=False, header_level='h1', fig_height=500):
    """Render one report section as an HTML ``<div class='chart-container'>`` card.

    Args:
        data: Section dict. ``'id'``, ``'title'`` and ``'body'`` are required.
            ``'figs'`` is optional and holds one figure entry or a list of
            entries; each entry is a dict whose ``'fig'`` value is the figure
            passed to ``plotly.offline.plot``.
        include_figs: When true and the section has figures, each figure is
            rendered to a ``<div>`` and placed after the body paragraph.
        header_level: Tag name used for the section heading (e.g. ``'h1'``).
        fig_height: Height assigned to the layout of every embedded figure.

    Returns:
        The HTML fragment for the section, as a string.

    Side effects:
        A non-list ``data['figs']`` is replaced in place by a one-element
        list, and the layout height of each rendered figure is overwritten.
    """
    # Use .get() so a text-only section (no 'figs' key) requested with
    # include_figs=True falls back to the plain card instead of raising KeyError.
    if include_figs and data.get('figs'):
        if not isinstance(data['figs'], list):
            # Kept as an in-place normalisation: other code inspecting the
            # same section dict may rely on 'figs' being a list afterwards.
            data['figs'] = [data['figs']]
        # include_plotlyjs=False: the divs do not embed plotly.js themselves,
        # so the page that hosts them has to load the library.
        figs = ''.join(
            offline.plot(
                set_fig_height(entry['fig'], fig_height),
                include_plotlyjs=False,
                output_type='div',
                show_link=False,
            )
            for entry in data['figs']
        )
        return f"""
        <div class='chart-container'>
            <{header_level} id="{data['id']}">{data['title']}</{header_level}>
            <p>{data['body']}</p>
            {figs}
        </div>
        """

    return f"""
        <div class='chart-container'>
            <{header_level} id="{data['id']}">{data['title']}</{header_level}>
            <p>{data['body']}</p>
        </div>
        """

def _anchor_id(title):
    """Derive an in-page anchor id from a heading title.

    Spaces and colons become hyphens and the result is lower-cased.
    """
    return title.replace(' ', '-').replace(':', '-').lower()


def get_html_figs_section(figs, fig_info, header_level='h1', fig_height=500):
    """Render a section made of several titled figures as one HTML card.

    Args:
        figs: One figure entry or a list of entries. Each entry is a dict with
            ``'fig'`` (the figure passed to ``plotly.offline.plot``),
            ``'title'`` and ``'description'``.
        fig_info: Section-level dict. All keys are optional:
            ``'title'`` (heading text), ``'id'`` (heading anchor; derived from
            the title when absent), ``'description'`` or ``'body'`` (intro
            text, ``'description'`` wins), ``'info'`` (content of a
            collapsible "More info" block).
        header_level: Tag name for the section heading. Individual figure
            headings are always ``<h2>``.
        fig_height: Height assigned to the layout of every figure.

    Returns:
        The HTML fragment for the whole section, as a string.

    Side effects:
        The layout height of each figure is overwritten in place.
    """
    if not isinstance(figs, list):
        figs = [figs]

    fig_divs = []
    for entry in figs:
        fig = entry['fig']
        fig.layout.height = fig_height
        # include_plotlyjs=False: the hosting page must load plotly.js itself.
        div = offline.plot(fig, include_plotlyjs=False, output_type='div', show_link=False)
        fig_divs.append(f"""
        <div class='chart-container'>
            <h2 id="{_anchor_id(entry['title'])}">{entry['title']}</h2>
            <p>{entry['description']}</p>
            {div}
        </div>
        """)

    all_divs = ''.join(fig_divs)

    # Section heading. An explicit 'id' is preferred because that is what a
    # table of contents built from the same dict links to; the title-derived
    # id is only a fallback.
    title = fig_info.get('title')
    section_id = fig_info.get('id') or (_anchor_id(title) if title else '')
    title_section = f"""
    <{header_level} id="{section_id}">{title}</{header_level}>
    """ if title else ""

    # Intro text. Section dicts in this notebook store it under 'body', so
    # fall back to that key; otherwise the text would never be rendered.
    # NOTE(review): confirm that showing 'body' here is the intended layout.
    description = fig_info.get('description') or fig_info.get('body')
    description_section = f"""
    <p>{description}</p>
    """ if description else ""

    # Collapsible details block, only when 'info' exists and is not blank.
    info = fig_info.get('info')
    details_section = f"""
    <details>
        <summary>▸ More info</summary>
        <div class='info-div-container'>{info}</div>
    </details>
    """ if info else ""

    return f"""
    <div class='chart-container'>
        {title_section}
        {description_section}
        {all_divs}
        {details_section}
    </div>
    """

def generate_toc(header_sections):
    """Build the side table of contents as an HTML fragment.

    Args:
        header_sections: Mapping of section name to section dict. Each section
            needs ``'id'`` and ``'title'``. If ``'figs'`` is a non-empty list
            whose first entry has a ``'title'``, every titled entry is listed
            as a sub-item linking to the anchor derived from its title.

    Returns:
        A ``<div class="side-toc">`` string with one link per section, in the
        mapping's iteration order.
    """
    toc_items = []

    for section_data in header_sections.values():
        # Top-level entry: links to the section's own anchor id.
        toc_items.append(f'<li><a href="#{section_data["id"]}">{section_data["title"]}</a></li>')

        # Sub-entries for titled figures. The truthiness check guards against
        # an empty list, where indexing [0] would raise IndexError.
        figs = section_data.get('figs')
        if isinstance(figs, list) and figs and 'title' in figs[0]:
            toc_items.append('<ul>')
            for fig in figs:
                if 'title' in fig:
                    # Spaces and colons become hyphens, then lower-case.
                    anchor = fig['title'].replace(' ', '-').replace(':', '-').lower()
                    toc_items.append(f'<li><a href="#{anchor}">{fig["title"]}</a></li>')
            toc_items.append('</ul>')

    return f"""
    <div class="side-toc">
        <h3>Table of Contents</h3>
        <ul>{''.join(toc_items)}</ul>
    </div>
    """


In [None]:
# Page-level stylesheet, kept as a single <style> block that the final-HTML cell
# interpolates into <head>. It imports the Roboto web font, sets base typography
# for body/headings/lists, gives .toc-container, .side-toc and .chart-container
# a white rounded "card" look, pins .side-toc to the top-left of the viewport,
# and styles the <details>/<summary> "More info" toggle.
css_styles = """
<style>
    @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;500&display=swap');

    body {
        font-family: 'Roboto', sans-serif;
        font-size: 16px; /* A baseline font size */
        line-height: 1.6;
        color: #333; /* A slightly softer black for the text */
        background-color: #f5f5f5; /* A light gray background to contrast with white cards */
        margin: 0;
        padding: 0;
    }

    h1 {
        font-size: 2.5em;
        margin-bottom: 0.5em;
    }

    h2 {
        font-size: 2em;
        margin-bottom: 0.5em;
    }

    h3 {
        font-size: 1.5em;
        margin-bottom: 0.5em;
    }

    p {
        margin-bottom: 1em;
    }

    ul, ol {
        margin-bottom: 1em;
        padding-left: 2em;
    }

    .toc-container, .side-toc, .chart-container {
        background-color: #fff;
        border-radius: 8px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        padding: 20px;
        margin-bottom: 2em;
    }

    .side-toc {
        position: fixed;
        top: 20px;
        left: 20px;
        width: 250px;
        max-height: 80vh;
        overflow-y: auto;
    }

    .toc-container a, .side-toc a {
        color: #0077cc; /* A neutral link color */
        text-decoration: none;
    }

    .toc-container ul, .side-toc ul {
        margin-bottom: 0;
    }

    .toc-container ul li, .side-toc ul li {
        margin-top: 1em;
    }

    .toc-container ul ul li, .side-toc ul ul li {
        margin-top: 0;
    }

    .toc-container a:hover, .side-toc a:hover {
        text-decoration: underline;
    }

    details {
        margin-top: 20px;
        cursor: pointer;
    }

    summary {
        color: #0077cc;
    }

    summary:hover {
        text-decoration: underline;
    }

    .info-div-container {
        margin-top: 10px;
    }
</style>

"""

In [None]:
# Report content, keyed by section name. The table of contents is generated by
# iterating this dict, so insertion order is the order shown in the TOC.
# Each entry provides:
#   'id'    - anchor id of the section heading (the TOC links to it)
#   'title' - heading text
#   'body'  - HTML shown under the heading
#   'figs'  - optional: a figure entry, or a list of entries, taken from fig_dict
sections = {
    'intro': {
        'id': 'intro',
        'title': 'Retreat Report: Users, Hosts and Collectives',
        'body': """
            <p>This report was written in September 2023 ahead of the Open Collective team retreat in Bath. Its author is Hugi Asgeirsson. <a href="https://github.com/opencollective/profile-report" target="_blank">Source code to generate this report is available here.</a></p>
        """
    },
    'active_users': {
        'id': 'active-users',
        'title': 'Active Users',
        'figs': fig_dict['active_users'],
        'body': """
            <p>In this section we start by looking at the number of users on the platform in the last 12 months. There are two main definitions of "user" in this report:</p>

            <p><b>All Users:</b> Anyone who's done something on our platform, even just logged in, is included in this group.</p>    

            <p><b>Users with Counted Activities:</b> A subset of users who have performed at least 1 of a number of activities in the last 12 months. These are specific actions users do, grouped into different types:</p>

            <ul>
                <li><b>Expense Activities:</b> This includes creating expenses (collective.expense.created) and using virtual cards (virtualcard.purchase).</li>
                <li><b>Host Admin Activities:</b> These are tasks by host admins. They cover a range from approving a collective to managing virtual cards and paying expenses.</li>
                <li><b>Collective Admin Activities:</b> These are actions by collective admins like applying for collective status or editing members.</li>
                <li><b>Contributor Activities:</b> We don't pull these from the activity log. Instead, we focus on user actions, like making or updating an order. We look at user-updated orders marked as "PAID", "ACTIVE", or "CANCELLED".</li>
            </ul>

            <p>Some activities aren't included because they don't tie to a specific user group, like rejecting an expense or commenting on one.</p>
                            
            <p>In the rest of this report we will only be considering the "Users with Counted Activities" group.</p>

        """
    },
    'activity_counts': {
        'id': 'user-activity',
        'title': 'User Activity',
        'figs': fig_dict['activity_counts'],
        'body': """
            <p>Here we look at the number of users who have performed these counted activities in the last 12 months. In the pie charts below we show the fraction of users who have performed each activity as a percentage of all users who have performed any "counted" activity. In the bar chart "histograms" we count the number of users who have performed the activity X number of times.</p>
            <p>Total Expenses is an aggregate of "Expense Activities", "Expense To Own Collective Activities" and "Virtual Card Purchases". Total Contributions is an aggregate of all contributor activities.</p>
        """
    },
    'user_clusters': {
        'id': 'user-clusters',
        'title': 'User Clusters',
        'figs': fig_dict['user_clusters'],
        'body': """
            <p>Next, we group our users based on their typical activities on the platform. By doing this, we gain a clearer understanding of the diverse user categories we cater to on our platform.</p>

            <p>For instance, our data indicates that nearly 9% of users regularly submit expenses, while about 2% act on behalf of organizations. But, does a set of users engage in both activities? If so, is this subset significant? Manually exploring every potential activity combination would be cumbersome. This is where clustering proves invaluable. Clustering groups users by similarities without us making excessive assumptions. The aim is to discern patterns and identify distinct user types.</p>

            <p>From our analysis, 13 user groups emerged via clustering, each labeled by their defining characteristics. Delve into each cluster to see how users score on various activity metrics. A score of 1 in the "Collective admin" metric means that particular user tops the chart in collective admin activities platform-wide. Notably, these scores are logarithmic, ranging from 0 to 1. This scale enables us to equitably compare highly active users with their less active counterparts. For instance, a collective admin who approves 19 expenses annually might have a score of 0.47, while one that approves 190 expenses could score 0.85. Without such a scoring system, graphs would be less insightful due to the vast activity gap between the most active and the occasional users.</p>
        """
    },
    'user_clusters_tsne': {
        'id': 'user-clusters--tsne-plot',
        'title': 'User Clusters: TSNE Plot',
        'figs': fig_dict['tsne_result'],
        'body': """
            <p>Here we show a plot of the user clusters. The plot is interactive, so you can zoom in and hover over the points to see the user cluster labels.</p>
        """
    },
    'user_profiles': {
        'id': 'user-profiles',
        'title': 'User Profiles',
        'figs': fig_dict['user_profiling'],
        'body': """
            <p>Here we look at the user profiles for each of the user clusters. We show the top 8 users in some selected clusters, based on the cluster's defining characteristics. This helps us get a grasp of who the users are.</p>
        """
    },
    # The slug pattern in the spammer bullet below uses &lt; / &gt; entities:
    # raw angle brackets would be parsed by the browser as unknown tags and
    # the placeholder words would disappear from the rendered page.
    'collective_analysis': {
        'id': 'collective-analysis',
        'title': 'Collective Analysis',
        'body': """
            <p>Next we zoom in on collectives and their expenses and income. We will pay particular attention to relationships between money flow, host fee and platform tips.</p>

            <p>To create the graphs below we needed to preprocess the data. In lay terms, think of this preprocessing as cleaning and organizing a messy room. Unnecessary items (like zero amounts or spammers) were thrown out, similar items (like transactions of the same collective) were grouped together, everything was labeled in a consistent manner, and some items (like internal transactions) were kept aside as they weren't relevant to the main analysis. The end goal is to have a clean and organized dataset that's easier to work with and understand.</p>

            <p>We explain in short here what steps were taken:</p>

            <ul>
                <li>For both expenses and income data, if the transaction wasn't already in USD, it was converted using a fixed exchange rate. This standardizes the currency across all transactions, making comparisons and aggregations straightforward.</li>
                <li>Any collective in the expenses data that had fewer than 5 transactions and was its own host was removed. Additionally, any collectives in the income data matching these dropped collectives were also removed. This was done to remove noise of people 'trying out' the platform and adding contributions and expenses to test hosts and collectives.</li>
                <li>In the income data, if the total number of transactions for a collective was less than 5, and if all payment methods for that collective were of type 'host', then these rows were removed. This is done likely to remove any noise or irrelevant data.</li>
                <li>Rows in both expenses and income datasets having a specific pattern in their slug (ending with &lt;string&gt;-&lt;number&gt;.&lt;number&gt;) were removed. This is because this pattern was identified as belonging to spammers.</li>
                <li>Any transactions in both datasets where the host collective was 'opencollective' were removed. This filters out internal payments to the OC Inc team.</li>
            </ul>

            <p>This resulted in a dataset of all expenses and contributions to all real collectives on the platform for the last 12 months.</p>
        """
    },
    'collective_analysis_expenses_by_host_and_collective': {
        'id': 'collective-expenses',
        'title': 'Collective Expenses',
        'figs': [fig_dict['treemaps']['expense_amount_by_host_and_collective'], fig_dict['treemaps']['expense_count_by_host_and_collective']],
        'body': """
            <p>In these first two graphs we look at the total amount and count of expenses by host and collective. We look at all hosts and collectives in a chart where the size of the square is proportional to its value.</p>
        """
    },

    'collective_analysis_expenses_by_hostfee': {
        'id': 'collective-expenses-by-hostfee',
        'title': 'Expenses by Host Fee',
        'figs': [fig_dict['treemaps']['expense_amount_by_host_fee'], fig_dict['treemaps']['expense_count_by_host_fee']],
        'body': """
            <p>Here we look at the total amount and count of expenses by host fee. We look at all hosts and collectives in a chart where the size of the square is proportional to its value.</p>
            <p>Notice that in both cases, the zero-host-fee category is second largest by both counts and amount.</p>
        """
    },
    'collective_analysis_tip_potential': {
        'id': 'collective-income-by-host-and-method',
        'title': 'Tip Potential: Income by Host and Method',
        'figs': [fig_dict['treemaps']['income_amount_by_host_percentage_and_method']],
        'body': """
            <p>How large is the tip potential for each host? Here we look at the total amount of income by host and method. Tips are only possible for contributions, and are unlikely when organizations make larger donations via bank transfer.</p>
            <p>In this chart we categorize the income by host and method. We then color the methods according to their potential for tipping. By doing so we are investigating, particularly for the zero-host-fee hosts, what the potential for tipping is when all contributions are considered.</p>
        """
    },
    'collective_analysis_tip_income': {
        'id': 'collective-tips',
        'title': 'Tip Income: Tip Income by Host and Collective',
        'figs': [fig_dict['treemaps']['tip_amount_amount_by_host_fee_and_host_and_collective']],
        'body': """
            <p>We now look at the total amount of tips by host and collective. We color the squares by if the collective is on a host with a host fee or not. Hover over boxes to see tip income amounts.</p>
            <p>We earn about 240,000 USD in tips from hosts with a host fee, and about 91,000 USD from hosts without a host fee.</p>
        """
    },
    'collective_analysis_crowdfunding': {
        'id': 'crowdfunding',
        'title': 'Crowdfunding Income: Income by Host and Method',
        'figs': [fig_dict['treemaps']['income_crowdfunding']],
        'body': """
            <p>Is Open Collective a crowdfunding platform with money management features, or a money management platform with crowdfunding features? Let's have a look.</p>
            <p>Of the roughly 60M that went through the platform in the last 12 months, about 12M was from crowdfunding. That is about 20% of the total revenue.</p>
            <p>Open Source Collective stands out as the largest crowdfunding host, and the only major host with more income from crowdfunding than from other sources.</p>
        """
    }
}

In [None]:
# Render every section to its HTML fragment, in the order the page shows them.

# Text-led sections: heading + body, optionally followed by the figures.
intro_html_div = get_html_section(sections['intro'])
active_users_html_div = get_html_section(
    sections['active_users'],
    include_figs=True,
)

# Figure-led sections: one titled card per figure.
activity_counts_html_divs = get_html_figs_section(
    sections['activity_counts']['figs'],
    sections['activity_counts'],
)
user_clusters_html_divs = get_html_figs_section(
    sections['user_clusters']['figs'],
    sections['user_clusters'],
)
tsne_html = get_html_figs_section(
    sections['user_clusters_tsne']['figs'],
    sections['user_clusters_tsne'],
    fig_height=1000,
)
user_profiles_html_divs = get_html_figs_section(
    sections['user_profiles']['figs'],
    sections['user_profiles'],
    fig_height=1400,
)

# Collective analysis: a text-only lead-in, then five treemap sub-sections
# that all share the same rendering options (h2 heading, 600px figures).
collective_analysis_html_divs = get_html_section(sections['collective_analysis'])
(
    expense_amount_and_count_by_host_html,
    expense_amount_and_count_by_host_fee_html,
    income_by_host_and_method_html,
    tip_amount_amount_by_host_fee_and_host_and_collective_html,
    income_crowdfunding_html,
) = (
    get_html_section(sections[section_key], include_figs=True, header_level='h2', fig_height=600)
    for section_key in (
        'collective_analysis_expenses_by_host_and_collective',
        'collective_analysis_expenses_by_hostfee',
        'collective_analysis_tip_potential',
        'collective_analysis_tip_income',
        'collective_analysis_crowdfunding',
    )
)


In [None]:
# Assemble the complete HTML page: <head> with title, plotly.js and the
# stylesheet; <body> with the fixed side TOC followed by every rendered
# section inside a wrapper whose left margin leaves room for the TOC.
#
# The figure divs were rendered without an embedded plotly.js, so the
# <script> tag below is what makes them interactive.
# NOTE(review): the "plotly-latest" CDN alias is reportedly frozen at plotly.js
# v1.58.5 - confirm it matches the plotly version that built the figures, or
# pin an explicit version.
#
# <meta charset="utf-8"> matches the encoding used when writing the file, so
# non-ASCII characters in the page are decoded correctly by the browser.
final_html = f"""
<html>
    <head>
        <meta charset="utf-8">
        <title>Retreat Report: Users, Hosts and Collectives</title>
        <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
        {css_styles}
    </head>

    <body>

        {generate_toc(sections)}

        <div style="margin: 0 70px 0 350px;">

            {intro_html_div}

            {active_users_html_div}

            {activity_counts_html_divs}

            {user_clusters_html_divs}

            {tsne_html}

            {user_profiles_html_divs}

            {collective_analysis_html_divs}

            {expense_amount_and_count_by_host_html}

            {expense_amount_and_count_by_host_fee_html}

            {income_by_host_and_method_html}

            {tip_amount_amount_by_host_fee_and_host_and_collective_html}

            {income_crowdfunding_html}

        </div>

    </body>
</html>
"""

# Write the final HTML to a file. The encoding is explicit because the
# platform default may be unable to encode non-ASCII characters in the page.
with open("retreat_report_test.html", "w", encoding="utf-8") as f:
    f.write(final_html)