Language composition by repository

This report shows the different languages that are used in each repository.

In [None]:
import plotly.express as px
import pandas as pd
from code_data_science import data_table as dt, data_grid as moderne_data_grid
import code_data_science.palette as palette

# Load the language composition sample data.
df = dt.read_csv("../samples/language_composition.csv")

if len(df) > 0:
    # One identifier per repository, in the form origin:path:branch.
    df["repository"] = (
        df["repositoryOrigin"] + ":" + df["repositoryPath"] + ":" + df["repositoryBranch"]
    )

    # File type is whatever follows the last "." of the source path.
    # NOTE: a path that contains no "." is kept whole as its own file type.
    df["fileType"] = df["sourcePath"].str.rsplit(".", n=1).str[-1]

    # Sum the numeric columns for every distinct combination of grouping keys.
    group_keys = ["repository", "language", "fileType", "parserClass", "hasParseFailures"]
    df_file_grouped = df.groupby(by=group_keys, as_index=False).sum(numeric_only=True)

    # Treemap hierarchy, from the outermost level to the innermost one.
    path = ["repository", "language", "fileType"]

    # Row filters shared by the summary figures below.
    is_unparseable = df["language"] == "Other/unknown/unparseable"
    is_java = df["fileType"] == "java"
    has_failures = df["hasParseFailures"] == True
    is_java_with_failures = has_failures & is_java

    # Every row of the dataframe counts as one file.
    total_files = df.shape[0]

    # Files whose language is Other/unknown/unparseable, overall and .java only.
    total_unparseable_files = int(is_unparseable.sum())
    total_java_unparseable_files = int((is_unparseable & is_java).sum())

    # Files flagged with parse failures, overall and .java only.
    total_parse_failures = int(has_failures.sum())
    total_java_parse_failures = int(is_java_with_failures.sum())

    # Lines of text contained in the .java files flagged with parse failures.
    total_lines_of_java_affected_by_parse_failures = df.loc[
        is_java_with_failures, "linesOfText"
    ].sum()

    # Hover text of the "More info" annotation; numbers use thousands separators.
    more_info_hover = f"""
            <b>Total files:</b> {total_files:,} </br> </br>
            <b>Other/unknown/unparseable files:</b> {total_unparseable_files:,}</br>
            <b>Other/unknown/unparseable .java files:</b> {total_java_unparseable_files:,} </br>
            <b>Files with parse failures:</b> {total_parse_failures:,} </br>
            <b>.java files with parse failures:</b> {total_java_parse_failures:,} </br>
            <b>Total lines of text in .java files with parse failures:</b> {total_lines_of_java_affected_by_parse_failures:,}
            """

    # Qualitative palette used for the treemap sectors.
    colors = palette.qualitative()

    # Sector sizes are driven by the summed lines of text.
    fig = px.treemap(
        df_file_grouped,
        path=path,
        values="linesOfText",
        color_discrete_sequence=colors,
    )

    # Rounded sector corners.
    fig.update_traces(marker={"cornerradius": 3})

    # Tight margins plus a small "More info" label, placed in paper coordinates
    # just above the plot area, that reveals the summary figures on hover.
    more_info_annotation = {
        "text": "<b>More info</b>",
        "hovertext": more_info_hover,
        "x": 0.01,
        "y": 1.055,
        "xref": "paper",
        "yref": "paper",
        "showarrow": False,
        "borderwidth": 1,
    }
    fig.update_layout(
        margin={"t": 28, "l": 0, "r": 0, "b": 0},
        annotations=[more_info_annotation],
    )

    # Show label, text and value inside every sector.
    fig.data[0].textinfo = "label+text+value"
else:
    # Nothing to plot: render an empty treemap that only carries a message.
    fig = px.treemap(names=[], parents=[])
    fig.add_annotation(
        text="No rows of data found",
        x=0.5,
        y=0.5,
        showarrow=False,
        font={"size": 20},
    )

In [None]:
# Output the visualization.
# The renderer is selected with the `renderer` keyword of `fig.show`; the
# previous `render=` spelling is not a parameter of `show`, so the requested
# "plotly_mimetype" renderer was never explicitly selected.
fig.show(renderer="plotly_mimetype")