Language composition org overview

This report shows a GitHub-style language bar representing the total language distribution across all repositories in the organization.

In [2]:
# Measurement unit for the language bar: "files" counts every row once;
# any other value makes the report sum the "linesOfText" column instead.
unit: str = "files"

In [3]:
import pandas as pd


def update_language_mapping_by_file_type(df: pd.DataFrame) -> pd.DataFrame:
    """
    Relabel rows whose language is 'Plain text' when their file extension belongs to a known language.
    The DataFrame is modified in place and the same object is returned.
    Expected columns:
        - language: str (e.g. 'Plain text')
        - fileType: str (e.g. 'py')
    """

    # Flat extension -> language lookup; every extension maps to exactly one language
    language_by_extension = {
        "ts": "Typescript",
        "tsx": "Typescript",
        "js": "Javascript",
        "jsx": "Javascript",
        "kts": "Kotlin",
    }

    # Candidate language for every row (NaN where the extension is unknown)
    candidate = df["fileType"].map(language_by_extension)

    # Only rows still labelled as plain text AND with a known extension change
    needs_relabel = ((df["language"] == "Plain text") & candidate.notna()).to_numpy()

    if needs_relabel.any():
        # Positional (NumPy) values avoid index alignment on non-unique indexes
        df.loc[needs_relabel, "language"] = candidate.to_numpy()[needs_relabel]

    return df

In [None]:
import plotly.graph_objects as go
from code_data_science import data_table as dt
import code_data_science.palette as palette

# Load the language composition data (one row per source file). The constant
# "files" column lets the same groupby/sum pipeline either count files or
# total the lines of text, depending on `unit`.
df = dt.read_csv("../samples/language_composition.csv")
df["files"] = 1

if df.empty:
    # Nothing to aggregate: render an empty canvas with a centred message
    fig = go.Figure()
    fig.add_annotation(
        x=0.5,
        y=0.5,
        text="No rows of data found",
        showarrow=False,
        font=dict(size=20),
        xref="paper",
        yref="paper",
    )
    fig.update_layout(
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
    )
else:
    # Handle legacy column name
    if "sourceFileType" not in df.columns and "parserClass" in df.columns:
        df.rename(columns={"parserClass": "sourceFileType"}, inplace=True)

    # Extract file extension (text after the last "." of the path)
    df["fileType"] = df["sourcePath"].str.split(".").str[-1]

    # Map plain text files to proper languages (mutates df in place)
    update_language_mapping_by_file_type(df)

    # Determine measurement field based on unit parameter
    measurement_field = "files" if unit == "files" else "linesOfText"

    # Group by language, sum measurement field
    language_totals = df.groupby("language", as_index=False)[measurement_field].sum()

    # Calculate total and per-language percentages
    total = language_totals[measurement_field].sum()
    language_totals["percentage"] = (
        language_totals[measurement_field] / total * 100
    ).round(2)

    # Group languages under 2% into "Other"
    other_mask = language_totals["percentage"] < 2
    if other_mask.any():
        other_value = language_totals.loc[other_mask, measurement_field].sum()
        other_row = pd.DataFrame(
            {
                "language": ["Other"],
                measurement_field: [other_value],
                # Derive the share from the raw total rather than summing the
                # already-rounded percentages: rounding errors would otherwise
                # accumulate, and languages below 0.005% would vanish entirely.
                "percentage": [round(other_value / total * 100, 2)],
            }
        )
        language_totals = pd.concat(
            [language_totals[~other_mask], other_row], ignore_index=True
        )

    # Sort by percentage descending; a stable sort keeps tied languages in a
    # deterministic order, so colour assignment is reproducible between runs
    language_totals = language_totals.sort_values(
        by="percentage", ascending=False, kind="stable"
    ).reset_index(drop=True)

    # Assign colors from palette, cycling when there are more languages than colors
    colors = palette.qualitative()
    color_map = {
        lang: colors[i % len(colors)]
        for i, lang in enumerate(language_totals["language"])
    }

    # Build the stacked horizontal bar using plotly graph_objects:
    # one trace per language, all stacked on a single (unlabelled) category
    fig = go.Figure()

    for language, percentage in zip(
        language_totals["language"], language_totals["percentage"]
    ):
        fig.add_trace(
            go.Bar(
                x=[percentage],
                y=[""],
                orientation="h",
                name=f"{language} ({percentage:.1f}%)",
                marker_color=color_map[language],
                hovertemplate=f"{language}: {percentage:.1f}%<extra></extra>",
                # Only label segments of at least 5%; narrower ones get no inside text
                text=f"{language} {percentage:.1f}%" if percentage >= 5 else "",
                textposition="inside",
                insidetextanchor="middle",
            )
        )

    fig.update_layout(
        barmode="stack",
        height=150,
        margin=dict(l=0, r=0, t=10, b=80),
        xaxis=dict(
            visible=False,
            range=[0, 100],
        ),
        yaxis=dict(
            visible=False,
        ),
        # Horizontal legend placed underneath the bar
        legend=dict(
            orientation="h",
            yanchor="top",
            y=-0.3,
            xanchor="left",
            x=0,
            font=dict(size=12),
        ),
        bargap=0,
        # Transparent backgrounds
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
    )

    # Round the bar corners
    fig.update_traces(
        marker=dict(cornerradius=5),
    )

In [5]:
# Output the visualization. The renderer is chosen with the `renderer`
# keyword of Figure.show; here the plotly MIME-type renderer is requested.
fig.show(renderer="plotly_mimetype")