Language composition top languages

This report shows, for each programming language, how many repositories in the organization have it among their top N languages (top 3 by default), ranked per repository by number of files or by lines of text.

In [None]:
# Measurement used to rank languages within a repository: "files" counts one
# per row; any other value makes the analysis sum the "linesOfText" column.
unit: str = "files"
# Number of top-ranked languages kept per repository; 0 (or less) keeps all.
top_n: int = 3

In [2]:
import pandas as pd


def update_language_mapping_by_file_type(df: pd.DataFrame) -> pd.DataFrame:
    """
    Relabel "Plain text" rows whose file extension identifies a known language.

    The DataFrame is modified in place and the same object is returned.
    It must provide the following columns:
        - language: str (e.g. 'Plain text')
        - fileType: str (e.g. 'py')
    """
    extension_overrides = (
        ("Typescript", ["ts", "tsx"]),
        ("Javascript", ["js", "jsx"]),
        ("Kotlin", ["kts"]),
    )

    # The extension groups do not overlap, so a row relabelled by one entry can
    # never match a later one; the plain-text mask can be computed a single time.
    is_plain_text = df["language"] == "Plain text"
    extensions = df["fileType"]

    for target_language, known_extensions in extension_overrides:
        rows_to_relabel = is_plain_text & extensions.isin(known_extensions)
        df.loc[rows_to_relabel, "language"] = target_language

    return df

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from code_data_science import data_table as dt
import code_data_science.palette as palette

# Load the language composition data table and build the "top languages" chart.
# Result: `fig` holds either the bar chart or a "no data" placeholder figure.
df = dt.read_csv("../samples/language_composition.csv")
# Each row counts as one file when the "files" unit is selected.
df["files"] = 1

# Exit early if there are no rows and render a plot with a message
if df.empty:
    fig = go.Figure()
    fig.add_annotation(
        x=0.5,
        y=0.5,
        text="No rows of data found",
        showarrow=False,
        font=dict(size=20),
        xref="paper",
        yref="paper",
    )
    fig.update_layout(
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
    )
else:
    # Handle legacy column name
    # NOTE(review): "sourceFileType" is not referenced later in this cell.
    if "sourceFileType" not in df.columns and "parserClass" in df.columns:
        df.rename(columns={"parserClass": "sourceFileType"}, inplace=True)

    # Build repository identifier as "origin:path:branch".
    # Missing parts are replaced with "" first: concatenating with a missing
    # value would make the whole identifier missing, and pandas then drops
    # that repository from every groupby/nunique below without any warning.
    repo_origin, repo_path, repo_branch = (
        df[column].fillna("").astype(str)
        for column in ("repositoryOrigin", "repositoryPath", "repositoryBranch")
    )
    df["repository"] = repo_origin + ":" + repo_path + ":" + repo_branch

    # Extract file extension (text after the last "." in the source path)
    df["fileType"] = df["sourcePath"].str.split(".").str[-1]

    # Map plain text files to proper languages (updates df in place)
    update_language_mapping_by_file_type(df)

    # Determine measurement field based on unit parameter
    measurement_field = "files" if unit == "files" else "linesOfText"

    # Group by repository and language, sum the measurement field
    df_grouped = df.groupby(["repository", "language"], as_index=False)[
        measurement_field
    ].sum()

    # Filter to top N languages per repository (0 means all)
    if top_n > 0:
        # method="first" gives every language a distinct rank, so ties never
        # let more than top_n languages through for a single repository.
        df_grouped["rank"] = df_grouped.groupby("repository")[measurement_field].rank(
            method="first", ascending=False
        )
        df_filtered = df_grouped[df_grouped["rank"] <= top_n]
        top_label = f"top {top_n}"
    else:
        df_filtered = df_grouped
        top_label = "all"

    # Count how many repos each language appears in
    language_repo_counts = (
        df_filtered.groupby("language")["repository"]
        .nunique()
        .reset_index(name="repo_count")
    )

    # Calculate total number of unique repositories
    total_repos = df["repository"].nunique()

    # Calculate percentage of all repositories, rounded to one decimal
    language_repo_counts["percentage"] = (
        language_repo_counts["repo_count"] / total_repos * 100
    ).round(1)

    # Sort by repo_count descending (ascending=True for plotly horizontal bar)
    language_repo_counts = language_repo_counts.sort_values(
        by="repo_count", ascending=True
    )

    # Create bar label text. Built column-wise rather than with a row-wise
    # DataFrame.apply; the list is positional and matches the sorted row order.
    language_repo_counts["label"] = [
        f"{language} - {repo_count:,} repos ({percentage}%)"
        for language, repo_count, percentage in zip(
            language_repo_counts["language"].tolist(),
            language_repo_counts["repo_count"].tolist(),
            language_repo_counts["percentage"].tolist(),
        )
    ]

    # Set the color palette
    colors = palette.qualitative()

    # Calculate dynamic height: a fixed amount per bar, never below 300
    num_languages = len(language_repo_counts)
    height_per_bar = 30
    total_height = max(num_languages * height_per_bar + 150, 300)

    fig = px.bar(
        language_repo_counts,
        x="repo_count",
        y="language",
        orientation="h",
        text="label",
        color="language",
        color_discrete_sequence=colors,
        height=total_height,
    )

    fig.update_layout(
        xaxis_title=f"Number of repos ({top_label} by {unit})",
        yaxis_title="Language",
        showlegend=False,
        margin=dict(l=0, r=0, t=30, b=0),
    )

    fig.update_traces(
        textposition="inside",
        insidetextanchor="start",
    )

In [4]:
# Output the visualization.
# The keyword is `renderer`; unrecognised keywords are treated as renderer
# property overrides and ignored, which would leave the default renderer active.
fig.show(renderer="plotly_mimetype")