In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import ticker
import re

In [2]:
import tools

In [3]:
# Load the commit export (one JSON record per line) into a DataFrame.
data = pd.read_json(
    "data/flink/commits_20220602-16h05m47s_apache_flink_master_commits.txt",
    lines=True,
)
# Project helper; presumably converts the date columns in place, since its
# return value is not used -- TODO confirm against tools.py.
tools.initialize_datetime(data)
# Order the commits by their authoring timestamp.
data = data.sort_values('authoredDate')

## Data Quality

Do contributors use multiple logins?

In [4]:
# One row per distinct (name, login, email, database id) combination found in the commits.
contributor = data.loc[
    :, ['authorName', 'authorLogin', 'authorEmail', 'authorDatabaseId']
].drop_duplicates()

In [5]:
# Per database id: number of non-null values in each remaining column,
# ordered by the login count.
contributor_count_by_id = (
    contributor
    .groupby('authorDatabaseId')
    .count()
    .sort_values(by='authorLogin')
)

In [6]:
# For each identity attribute, report how many database ids are associated with
# more than one distinct value and how many have no value at all.
#
# `contributor` is de-duplicated over all four columns together, so counting
# rows per id would flag e.g. the login as "duplicated" whenever only the name
# differs between two rows. Counting distinct non-null values per column
# (`nunique`) avoids that false positive.
# Note: `groupby` leaves out rows whose `authorDatabaseId` is null by default,
# so authors without a database id are not part of this check.
for col in ['authorName', 'authorLogin', 'authorEmail']:
    distinct_per_id = contributor.groupby('authorDatabaseId')[col].nunique()
    n_duplicates = int((distinct_per_id > 1).sum())
    n_zero = int((distinct_per_id == 0).sum())
    print(f"Column {col} has {n_duplicates} duplicate entries and {n_zero} fields that have no entry for a single database id")


Column authorName has 0 duplicate entries and 191 fields that have no entry for a single database id
Column authorLogin has 0 duplicate entries and 0 fields that have no entry for a single database id
Column authorEmail has 0 duplicate entries and 0 fields that have no entry for a single database id


The mapping from a database id to its login and email appears to be unique.

I am a bit surprised here, as the authorLogin can be changed by the user over time.

## Message

In [7]:
# Count the commits whose message does / does not begin with the "[FLINK-" marker.
starting_with_flink_prefix = (
    data['message']
    .str.startswith('[FLINK-')
    .value_counts()
)
# Express both counts as a percentage of all counted commits.
starting_with_flink_prefix_perc = (
    starting_with_flink_prefix.div(starting_with_flink_prefix.sum()).mul(100)
)

print(f"{starting_with_flink_prefix_perc.loc[True]: 6.2f} % of all commits start with the [FLINK-XXX] tag")

 60.91 % of all commits start with the [FLINK-XXX] tag


## Mail

In [8]:
# Distribution of e-mail domains over the distinct author addresses; shows the
# 20 most frequent domains in ascending order of frequency.
# `regex=True` is passed explicitly: pandas >= 2.0 defaults `Series.str.replace`
# to literal matching and raises a ValueError for a compiled pattern in that mode.
(
    data['authorEmail']
    .drop_duplicates()
    # Strip everything up to and including the last '@', leaving the domain.
    .str.replace(re.compile(r'.*[\@]'), '', regex=True)
    .value_counts()
    .sort_values()
    .tail(20)
)

brynski.pl            1
dtstack.com           1
inspur.com            2
pm.me                 2
sina.com              2
yandex.ru             2
corp.netease.com      2
live.cn               2
googlemail.com        2
microsoft.com         2
tu-berlin.de          2
intel.com             3
hotmail.com           5
126.com               7
foxmail.com           8
outlook.com           9
qq.com               26
163.com              33
apache.org           41
gmail.com           237
Name: authorEmail, dtype: int64