Option 2: Collect the all projects with at least 5 releases by iterating through the JSON manually (instead of using pandas) -- turns out this is a bit faster.

In [7]:
import json
import pandas as pd
import glob
import os
import numpy as np
from datetime import datetime
from scipy.stats import gmean, kurtosis, skew

languages = ["go", "java", "python", "ruby"]
project_details_folder = "project_details"
readmes_folder = "readmes"
releases_folder = "releases"

all_language_releases = []

real_releases = []
all_releases = []

for language in languages:

    total_projects = 0
    total_files = 0
    total_releases = 0
    total_readmes = 0
    more_than_5_releases = 0
    more_than_5_real_releases = 0

    # Grab all the files from within the folder, analyze the releases
    files = glob.glob(f"{releases_folder}/{language}/*_releases.jsonl")

    for file in files:

        all_releases = []
        real_releases = []
        total_files+=1
        owner, repo_name = file.replace("_releases.jsonl", "").split(f"{releases_folder}/{language}\\")[1].split("_", 1)
        with open(file) as f:
            all_releases = [json.loads(line) for line in f]
        all_releases.reverse() # earliest release is at the end of the file

        #print(f'Before - Repo: {repo_name}, Owner: {owner}, Releases: {len(all_releases)} ')

        # Remove any prereleases from this array, so we can split the analysis
        for release in all_releases:
            if release["prerelease"] is False:
                real_releases.append(release)

        #print(f'After - Repo: {repo_name}, Owner: {owner}, Releases: {len(real_releases)} ')

        if len(all_releases) >= 5:
            more_than_5_releases += 1

        # If there are less than 5 releases, do not include this repository
        if len(real_releases) >= 5:
            more_than_5_real_releases+= 1

            real_release_date_diffs = []
            for i in range(len(real_releases[:-1])):
                first_date = datetime.strptime(real_releases[i]['created_at'], "%Y-%m-%dT%H:%M:%SZ")
                second_date = datetime.strptime(real_releases[i+1]['created_at'], "%Y-%m-%dT%H:%M:%SZ")  # 2018-08-12T23:56:21Z
                difference = (second_date - first_date).days
                real_release_date_diffs.append((second_date - first_date).days)

            #df_all_releases = pd.DataFrame.from_records(all_releases)
            #df_real_releases = pd.DataFrame.from_records(real_releases)

            #release_dates_diffs = df_real_releases["created_at"].diff().apply(lambda x: x/np.timedelta64(1, "D")).fillna(0).tolist()[1:]
            release_dates_diffs_min = min(real_release_date_diffs) # TODO: Somehow there are negative date differences? Should check this.
            release_dates_diffs_max = max(real_release_date_diffs)
            release_dates_diffs_mean = np.mean(real_release_date_diffs)

            try:
                release_dates_diffs_gmean = gmean(real_release_date_diffs)
            except:
                release_dates_diffs_gmean = None
            try:
                release_dates_diffs_skewness = skew(real_release_date_diffs)
            except:
                release_dates_diffs_skewness = None
            try:
                release_dates_diffs_kurtosis = kurtosis(real_release_date_diffs)
            except:
                release_dates_diffs_kurtosis = None
            try:    
                release_dates_diffs_cv = np.std(real_release_date_diffs)/release_dates_diffs_mean
            except:
                release_dates_diffs_cv = None

            #release_dates = [date[:-6] for date in real_releases["created_at"].astype(str).tolist()] "release_dates": release_dates,
            real_release_summary = {"owner": owner, "repo": repo_name, "release_count": len(real_releases), "language": language, 
                             "release_dates_diffs": real_release_date_diffs, 
                            "release_dates_diffs_min": release_dates_diffs_min, "release_dates_diffs_max": release_dates_diffs_max, 
                            "release_dates_diffs_mean": release_dates_diffs_mean, "release_dates_diffs_gmean": release_dates_diffs_gmean, 
                            "release_dates_diffs_skewness": release_dates_diffs_skewness, 
                            "release_dates_diffs_kurtosis": release_dates_diffs_kurtosis, "release_dates_diffs_cv": release_dates_diffs_cv}
            all_language_releases.append(real_release_summary)

    print(total_files)
    print(f'{language} has {more_than_5_releases} projects, with more than 5 releases total.')
    print(f'{language} has {more_than_5_real_releases} projects, with more than 5 releases that are not prereleases.')


divide by zero encountered in log


invalid value encountered in log



593
go has 467 projects, with more than 5 releases total.
go has 447 projects, with more than 5 releases that are not prereleases.
1317
java has 999 projects, with more than 5 releases total.
java has 950 projects, with more than 5 releases that are not prereleases.



invalid value encountered in double_scalars



2330
python has 1621 projects, with more than 5 releases total.
python has 1544 projects, with more than 5 releases that are not prereleases.
457
ruby has 291 projects, with more than 5 releases total.
ruby has 287 projects, with more than 5 releases that are not prereleases.


In [8]:
releases = pd.DataFrame(all_language_releases, columns=all_language_releases[0].keys())

display(releases)

Unnamed: 0,owner,repo,release_count,language,release_dates_diffs,release_dates_diffs_min,release_dates_diffs_max,release_dates_diffs_mean,release_dates_diffs_gmean,release_dates_diffs_skewness,release_dates_diffs_kurtosis,release_dates_diffs_cv
0,8treenet,freedom,23,go,"[15, 8, 7, 5, 14, 3, 3, 5, 14, 6, 0, 29, 5, 21...",0,278,35.181818,0.000000,3.145826,9.921707,1.684751
1,a8m,documentdb,5,go,"[111, 315, 31, 664]",31,664,280.250000,163.791299,0.619068,-1.157257,0.872657
2,abutaha,aws-es-proxy,7,go,"[92, 659, 498, 61, 128, 287]",61,659,287.500000,201.861756,0.565922,-1.251793,0.774236
3,achannarasappa,ticker,47,go,"[3, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 2, ...",0,103,12.152174,0.000000,2.328399,4.236803,2.135676
4,adjust,rmq,5,go,"[4, 2, 161, 21]",2,161,47.000000,12.824304,1.112946,-0.701927,1.409163
...,...,...,...,...,...,...,...,...,...,...,...,...
3223,xotahal,fastlane-plugin-semantic_release,23,ruby,"[0, 67, 0, 5, 0, 0, 3, 9, 89, 59, 2, 10, 0, 52...",0,337,57.363636,0.000000,1.820118,2.291979,1.591678
3224,yoshoku,rumale,10,ruby,"[50, 12, 36, 13, 30, 19, 21, 84, 324]",12,324,65.444444,35.584206,2.248032,3.434992,1.434274
3225,yujinakayama,guard-rubocop,12,ruby,"[3, 6, 44, 17, 4, 19, 42, 30, 56, 71, 2754]",3,2754,276.909091,29.387374,2.842227,6.087066,2.829856
3226,yuki24,rambulance,17,ruby,"[2, 75, 12, 127, 114, 513, 447, 64, 451, 1, 2,...",1,513,174.250000,52.440280,0.828138,-0.917084,1.053752


In [9]:
from plotly.express import box

fig = box(releases, x="language", y="release_dates_diffs_gmean", color="language", 
          labels={"language": "Language", "release_dates_diffs_gmean": "Geometric release interval mean"}, height=500, log_y=True)

fig.update_xaxes(tickangle=30).update_traces(boxpoints=False) 

fig.show()

In [10]:
from plotly.express import violin

fig = violin(releases, x="language", y="release_dates_diffs_gmean", color="language", 
             labels={"language": "Language", "release_dates_diffs_gmean": "Geometric release interval mean"}, height=500, log_y=False)

fig.update_xaxes(tickangle=30)

fig.show()

In [11]:
from plotly.express import box

fig = box(releases, x="language", y="release_dates_diffs_cv", color="language", 
          labels={"language": "Language", "release_dates_diffs_cv": "Release interval coefficient of variation"}, height=500, log_y=True)

fig.update_xaxes(tickangle=30)

fig.show()