## Analysis of ProtonDB Linux Distribution


In [1]:
# !git clone https://github.com/bdefore/protondb-data


### Data Preprocessing


[Notebook link](https://github.com/n0k0m3/Personal-Setup/blob/main/ProtonDB_Analysis/analysis.ipynb){: .btn .btn--info }


In [2]:
import os
import pandas as pd
import glob
import tarfile
import json
import plotly.express as px
import plotly.io as pio

# Notebook-wide Plotly settings: send figures to the "iframe" renderer and
# give every Plotly Express figure the same default width and height.
pio.renderers.default = "iframe"
px.defaults.width = 600
px.defaults.height = 400


In [3]:
def find_distro(string):
    """Map a raw OS description to one of a few canonical distro names.

    The first space-separated token is the initial candidate. When that token
    alone is not a known distro, the second token is appended (so
    ``"Linux Mint 20"`` is matched as ``"Linux Mint"``). The candidate is then
    compared, case-insensitively, against substring keywords of each distro
    family; anything that matches no family ends up as ``"Others"``.

    Parameters
    ----------
    string : str
        Raw OS string. Any non-``str`` value (``None``, the ``NaN`` pandas
        uses for a missing field, ...) is classified as ``"Others"`` instead
        of raising ``AttributeError``.

    Returns
    -------
    str
        One of ``"Manjaro"``, ``"Arch"``, ``"Ubuntu"``, ``"Linux Mint"``,
        ``"Pop!_OS"``, ``"Debian"``, ``"Fedora"`` or ``"Others"``.
    """
    if not isinstance(string, str):
        return "Others"

    distros = ["ubuntu", "manjaro", "arch", "pop!_os", "debian", "fedora", "linux mint"]

    tokens = string.split(" ")
    os_str = tokens[0]
    if os_str.lower() not in distros and len(tokens) > 1:
        os_str += " " + tokens[1]

    # (canonical name, substrings that identify the family). The order is
    # significant: every family is tested in turn against the *current*
    # candidate, so when several families match, the last one wins.
    # openSUSE ("suse") and Flatpak ("freedesktop") have no category of their
    # own and therefore fall through to "Others".
    # NOTE(review): "steamos" is counted as Debian, but SteamOS 3 (Steam Deck)
    # is Arch-based -- confirm this grouping is intended.
    families = (
        ("Manjaro", ["manjaro"]),
        ("Arch", ["arc", "antergos", "gamer", "ctlos", "artix", "endeavour"]),
        ("Ubuntu", ["kde", "elementary", "zorin", "ubuntu"]),
        ("Linux Mint", ["lmde", "mint"]),
        ("Pop!_OS", ["pop"]),
        (
            "Debian",
            ["steamos", "siduction", "sparky", "devuan", "mx", "deepin", "debian"],
        ),
        ("Fedora", ["fedora"]),
    )
    for canonical, keywords in families:
        candidate = os_str.lower()
        if any(keyword in candidate for keyword in keywords):
            os_str = canonical

    if os_str.lower() not in distros:
        os_str = "Others"

    return os_str


def read_os_from_tar(path):
    """Count reports per distro in one ProtonDB report archive.

    Parameters
    ----------
    path : str
        Path to an archive named like ``reports_dec2_2019.tar.gz`` that
        contains ``reports_piiremoved.json``. The snapshot date is taken
        from the file name.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per canonical distro name (as
        returned by ``find_distro``) holding its report count, plus a
        ``date`` column with the snapshot date.
    """
    with tarfile.open(path, "r") as f:
        data = f.extractfile("reports_piiremoved.json").read()
    data = pd.DataFrame(json.loads(data))

    # "reports_<mon><day>_<year>.tar.gz" -> "<mon><day>_<year>". Splitting on
    # the underscore (rather than fixed offsets) also handles two-digit days.
    stem = os.path.split(path)[1][8:-7]
    month_day, _, year = stem.partition("_")
    report_date = pd.to_datetime("-".join([month_day[:3], month_day[3:], year]))

    # From the Dec 2nd 2019 snapshot on, the OS string is nested under
    # "systemInfo"; older snapshots carry it as a top-level "os" field.
    if report_date >= pd.to_datetime("dec-2-2019"):
        df = pd.json_normalize(data["systemInfo"])
    else:
        df = data

    # Reports without an OS string are left out of the counts in both layouts.
    df = df.dropna(subset=["os"]).copy()

    df["os"] = df["os"].apply(find_distro)
    os_data = df["os"].value_counts()

    os_df = pd.DataFrame(os_data).transpose().reset_index(drop=True)
    os_df["date"] = report_date

    return os_df


In [4]:
# Start from the Dec 2nd 2019 snapshot, then add every other archive whose
# date has not been loaded yet.
df_report = read_os_from_tar("protondb-data/reports/reports_dec2_2019.tar.gz")

loaded_dates = set(df_report["date"])
snapshots = [df_report]

# sorted() makes the processing order (and hence the column order of the
# combined table) reproducible across runs and machines.
for path in sorted(glob.glob("protondb-data/reports/*.gz")):
    # "reports_<mon><day>_<year>.tar.gz" -> "<mon><day>" and "<year>";
    # splitting on "_" keeps two-digit days working.
    month_day, _, year = os.path.split(path)[1][8:-7].partition("_")
    try:
        date_str = pd.to_datetime("-".join([month_day[:3], month_day[3:], year]))
        if date_str in loaded_dates:
            continue
        df_temp = read_os_from_tar(path)
    except Exception as exc:
        # Best effort: an unreadable archive is skipped, but say so instead
        # of silently losing a data point.
        print(f"Skipping {path}: {exc!r}")
        continue
    loaded_dates.add(date_str)
    snapshots.append(df_temp)

# A single concat at the end avoids re-copying the growing table per archive.
df_report = pd.concat(snapshots, ignore_index=True)


In [5]:
# Archives are read in no particular date order; put the rows in chronological order.
df_report = df_report.sort_values("date")


### Cumulative Results


**Note:** On Dec 2nd 2019, the ProtonDB contribution workflow changed to a questionnaire, which also changed the data structure. All data prior to this date is for reference only, and NO conclusions should be drawn from it.


#### Raw user counts per distro


In [6]:
# Every column except the timestamp is one distro's count series.
distro_columns = df_report.columns.drop(["date"])
fig = px.line(
    df_report, x="date", y=distro_columns, title="ProtonDB user count by Distro"
)
fig.show()


#### Normalized distro market share on ProtonDB


In [7]:
# Stacked area chart; groupnorm="percent" rescales each date to 100 %.
distro_columns = df_report.columns.drop(["date"])
fig = px.area(
    df_report,
    x="date",
    y=distro_columns,
    title="Distro Market Share on ProtonDB",
    groupnorm="percent",
)
fig.show()


#### Normalized distro market share on ProtonDB (Merged distro base)


In [8]:
# Collapse individual distros into their base families. Each family total is
# appended as a new column and its member columns are then removed.
arch_members = ["Arch", "Manjaro"]
debian_members = ["Linux Mint", "Ubuntu", "Pop!_OS", "Debian"]

normalizedDataFrame_combined = (
    df_report.assign(**{"Arch-based": df_report[arch_members].sum(axis=1)})
    .drop(columns=arch_members)
    .assign(**{"Debian-based": df_report[debian_members].sum(axis=1)})
    .drop(columns=debian_members)
)


In [9]:
# Same normalized area chart as before, now on the per-family table.
family_columns = normalizedDataFrame_combined.columns.drop(["date"])
fig = px.area(
    normalizedDataFrame_combined,
    x="date",
    y=family_columns,
    title="Distro-base Market Share on ProtonDB",
    groupnorm="percent",
)
fig.show()


Arch Linux and Arch-based distros are on the rise, while Debian-based (Ubuntu) distros are on the decline.


### Steam Deck/SteamOS effect on ProtonDB market share analysis

Is there a shift due to Steam Deck and SteamOS? We'll filter from Jan 1st 2022 till now to see if there is a shift in the market share.


In [10]:
# Keep only the snapshots taken after Jan 1st 2022.
reports_since_2022 = df_report[df_report["date"] > pd.to_datetime("jan-1-2022")]
distro_columns = df_report.columns.drop(["date"])
fig = px.area(
    reports_since_2022,
    x="date",
    y=distro_columns,
    title="Distro Market Share on ProtonDB from 01/01/2022",
    groupnorm="percent",
    markers=True,
)
fig.show()


Let's group the distros by their bases.


In [11]:
# Per-family table restricted to the snapshots taken after Jan 1st 2022.
is_since_2022 = normalizedDataFrame_combined["date"] > pd.to_datetime("jan-1-2022")
family_columns = normalizedDataFrame_combined.columns.drop(["date"])
fig = px.area(
    normalizedDataFrame_combined[is_since_2022],
    x="date",
    y=family_columns,
    title="Distro-base Market Share on ProtonDB from 01/01/2022",
    groupnorm="percent",
    markers=True,
)
fig.show()


As we can see, the market share graph doesn't show any effect of the Steam Deck release on the ProtonDB user base. Let's check the monthly increase in user counts to see if there's any change.


#### Monthly delta of user counts


In [12]:
# Newest snapshot first; diff(-1) subtracts the following (i.e. previous-in-time)
# row from each row, giving the change since the preceding snapshot.
reports_newest_first = df_report.sort_values(by="date", ascending=False)
df_delta = reports_newest_first.drop(columns=["date"]).diff(-1)
# Re-attach the snapshot dates (aligned on the row index) as the last column.
df_delta["date"] = df_report["date"]


In [13]:
# One line per distro showing the change between consecutive snapshots.
delta_columns = df_delta.columns.drop(["date"])
fig = px.line(
    df_delta,
    x="date",
    y=delta_columns,
    title="ProtonDB monthly user count change per Distro",
)
fig.show()


Let's zoom in after Dec 2nd 2019 (when the questionnaire was released)


In [14]:
# Discard the deltas up to and including Dec 2nd 2019, then redraw the chart.
questionnaire_start = pd.to_datetime("dec-2-2019")
df_delta = df_delta.loc[df_delta["date"] > questionnaire_start]
delta_columns = df_delta.columns.drop(["date"])
fig = px.line(
    df_delta,
    x="date",
    y=delta_columns,
    title="ProtonDB monthly user count change per Distro",
)
fig.show()


Debian and Fedora experienced a spike in new user counts. More analysis of the market is needed.


In [15]:
# Collapse the per-distro deltas into base families.
# NOTE: this rebinds normalizedDataFrame_combined, which the earlier market
# share cells read as the cumulative per-family table; from here on it holds
# the per-family deltas instead.
arch_members = ["Arch", "Manjaro"]
debian_members = ["Linux Mint", "Ubuntu", "Pop!_OS", "Debian"]

normalizedDataFrame_combined = (
    df_delta.assign(**{"Arch-based": df_delta[arch_members].sum(axis=1)})
    .drop(columns=arch_members)
    .assign(**{"Debian-based": df_delta[debian_members].sum(axis=1)})
    .drop(columns=debian_members)
)


In [16]:
# One line per distro family showing the change between consecutive snapshots.
family_columns = normalizedDataFrame_combined.columns.drop(["date"])
fig = px.line(
    normalizedDataFrame_combined,
    x="date",
    y=family_columns,
    title="ProtonDB monthly user count change per Distro-base",
)
fig.show()


New Debian-based installations are in constant decline, while Arch and "other" distros are on the rise. As the market share of "other" distros is not as significant as that of Debian or Arch derivatives, we won't go deeper into this category.


In [17]:
# Per-family deltas restricted to the snapshots taken after Jan 1st 2022.
is_since_2022 = normalizedDataFrame_combined["date"] > pd.to_datetime("01/01/2022")
family_columns = normalizedDataFrame_combined.columns.drop(["date"])
fig = px.line(
    normalizedDataFrame_combined[is_since_2022],
    x="date",
    y=family_columns,
    title="ProtonDB monthly user count change per Distro-base from 01/01/2022",
    markers=True,
)
fig.show()


New installs of both Arch and Debian-based distros have plateaued. So for now, the release of the Steam Deck is not that big of a disruption. Interestingly, by the end of March there is a spike in installations of Fedora due to the beta release of Fedora 36.
