In [None]:
# run-but-hide
import sys
sys.path.insert(0, "..")

In [None]:
import numpy as np
from elastipy import Search, query

They always say: *put the imports at the top!*


# git commit analytics

Below we use a lot of :link:`pandas` and plotting to get insight into the community of an open source project. 

To explore a repository of your choice, move to `elastipy/examples/` and call:

```bash
python gitlogs.py <project-name> path/to/git-repo
```

If you are cloning a repository and are only interested in its commits, you can somewhat limit the size on disk with:

```bash
git clone <repo-url> --no-checkout
```

Replace `<project-name>` with the name of the project and change the value of `PROJECT` below in the notebook accordingly:

In [None]:
PROJECT = "pandas"

def search():
    """Return a new ``Search`` on the example commit index of ``PROJECT``."""
    index_name = f"elastipy-example-commits-{PROJECT}"
    return Search(index_name)

In [None]:
# run-but-hide

# Make sure that we actually have results when building the export for the
# documentation, and say what is missing instead of raising a bare
# AssertionError.
assert search().execute().total_hits, (
    f"no commits found for project {PROJECT!r}"
    " - export them first with gitlogs.py"
)

## activity

### commits per week

In [None]:
# Commit count per calendar week, overlaid with a 50-week rolling mean.
s = search()
agg = s.agg_date_histogram("date", calendar_interval="week")
df = agg.execute().df(to_index=True)
df["commits/week"] = df.pop("date.doc_count")
# Smooth the count column explicitly: assigning the rolling mean of the whole
# frame to one column only works while the frame has exactly one column.
df["smooth"] = df["commits/week"].rolling(window=50).mean()
df.plot(figsize=(15, 4), color=["lightblue", "blue"])

### additions/deletions per month

In [None]:
# Sum of `changes.additions` and `changes.deletions` per calendar month.
s = search()
agg = s.agg_date_histogram("date", calendar_interval="month")
for metric_name, metric_field in (
    ("add", "changes.additions"),
    ("del", "changes.deletions"),
):
    agg.metric_sum(metric_name, field=metric_field)

# the bucket doc_count columns are left out of the frame
df = agg.execute().df(to_index=True, exclude="*doc_count")
# optional smoothing:
# df = df.rolling(window=10).mean()[["add", "del"]]
df.plot.line(figsize=(15, 4), color=["green", "pink"])

### commits per weekday/hour for each year

In [None]:
def commits_per(field, interval="year"):
    """
    Plot a heatmap of the commit counts per term of ``field`` for every ``interval``.

    :param field: document field whose terms are counted (up to 100 terms per bucket)
    :param interval: calendar interval of the date histogram; the same value is
        passed as the first positional argument of ``agg_date_histogram``
    """
    per_interval = search().agg_date_histogram(interval, calendar_interval=interval)
    # NOTE(review): the terms aggregation is called "weekday" regardless of
    # ``field``, so it carries that name for the hour plot as well - confirm
    # that this is intended
    per_term = per_interval.agg_terms("weekday", field=field, size=100)
    per_term.execute().plot.heatmap(
        sort=True,
        transpose=True,
        annot=False,
        fmt=".0f",
        cmap="gray_r",
        figsize=(15, .3),
    )

commits_per(field="timestamp_weekday")
commits_per(field="timestamp_hour")

## authors

### top 3 authors per year

In [None]:
# Yearly buckets, each split into its top three `author` terms.
s = search()
agg_top3_authors = (
    s.agg_date_histogram("date", calendar_interval="year")
    .agg_terms("author", field="author", size=3)
)
# keep the generic name bound as well; `agg_top3_authors` is reused further down
agg = agg_top3_authors
authors_per_year = agg.execute().df(to_index=True, flat="author", exclude="*doc_count")
authors_per_year.plot.bar(stacked=True, figsize=(15, 4))

#### commits of all top 3 authors  

In [None]:
# Collect the second element of every key of the (already executed)
# top-3-per-year aggregation - presumably the author name.
top_authors = {key[1] for key in agg_top3_authors.keys()}

s = search()
# one filter bucket per collected author
author_filters = {name: query.Term("author", name) for name in top_authors}
agg = s.agg_filters("author", filters=author_filters)
agg = agg.agg_date_histogram("date", calendar_interval="year")
agg.execute().plot.heatmap(
    sort=True,
    replace={0: np.nan},
    annot=True,
    fmt=".0f",
    figsize=(15, .6),
    cmap="gray_r",
)

### top 3 average-additions per author per year

In [None]:
# Average of `changes.additions` per year for each author in `top_authors`.
s = search()
author_filters = {name: query.Term("author", name) for name in top_authors}
agg = (
    s.agg_filters("author", filters=author_filters)
    .agg_date_histogram("date", calendar_interval="year")
    # return_self=True: continue with the metric instead of the histogram
    .metric_avg("avg-add", field="changes.additions", return_self=True)
)
agg.execute().plot.heatmap(
    sort=True,
    replace={0: np.nan},
    annot=True,
    fmt=".0f",
    figsize=(15, .6),
    cmap="gray_r",
)

### number of authors per year

In [None]:
# Cardinality of `author` for the whole index and for every calendar year.
s = search()
# return_self=True keeps a handle on the metric so its value can be read below
global_authors = s.metric_cardinality(field="author", return_self=True)
agg = (
    s.agg_date_histogram("year", calendar_interval="year")
    .metric_cardinality("authors", field="author")
)
agg.execute().plot.bar("year", "authors", figsize=(15, 4))
total_authors = next(global_authors.values())
print(total_authors, "authors at all")

---

## commit messages

### the first ten commit messages

In [None]:
# Print the documents of the search sorted by `timestamp`. No size is
# requested, so the number of documents is whatever the search returns by default.
s = search().sort("timestamp")
# s = s.range("timestamp", gte="2020")
template = "-- %(timestamp)s %(hash)s\n%(message)s"
for document in s.execute().documents:
    text = template % document
    print(text.strip() + "\n")

### significant terms by year

In [None]:
def significant_terms_by_year(s, field, size=4, shard_size=100):
    """
    Plot a per-year heatmap for the significant terms of ``field``.

    The significant terms of each calendar year are collected on a copy of
    ``s``. Then a yearly date histogram with one filter per collected term is
    added to ``s`` itself and rendered as heatmap.

    :param s: the search to analyse; the second aggregation is attached to it
    :param field: document field to take the significant terms from
    :param size: passed to ``agg_significant_terms`` as ``size``
    :param shard_size: passed to ``agg_significant_terms`` as ``shard_size``
    """
    # pass 1: which terms are significant in any year?
    yearly = s.copy().agg_date_histogram("year", calendar_interval="year")
    significant = yearly.agg_significant_terms(field=field, size=size, shard_size=shard_size)
    keywords = {key[-1] for key in significant.execute().keys()}

    # pass 2: count each of those terms in every year
    word_filters = {word: query.Term(field, word) for word in keywords}
    by_year = s.agg_date_histogram("date", calendar_interval="year")
    by_word = by_year.agg_filters("word", filters=word_filters)
    by_word.execute().plot.heatmap(
        sort=True,
        replace={0: np.nan},
        transpose=True,
        annot=True,
        fmt=".0f",
        figsize=(.3, .7),
        cmap="gray_r",
    )

significant_terms_by_year(search(), field="message")

### significant terms by author

In [None]:
def significant_terms_by_terms(s, split_field, terms_field, split_size=30, size=3, shard_size=100):
    """
    Plot a heatmap of significant ``terms_field`` terms per ``split_field`` term.

    The significant terms are collected per ``split_field`` bucket on a copy of
    ``s`` and reduced to those whose maximum count lies below the 0.8 quantile.
    The remaining terms are then counted per ``split_field`` bucket on ``s``
    itself and rendered as heatmap.

    :param s: the search to analyse; the second aggregation is attached to it
    :param split_field: field whose terms form the buckets
    :param terms_field: field to take the significant terms from
    :param split_size: number of ``split_field`` terms to request
    :param size: passed to ``agg_significant_terms`` as ``size``
    :param shard_size: passed to ``agg_significant_terms`` as ``shard_size``
    """
    split = s.copy().agg_terms(split_field, field=split_field, size=split_size)
    significant = split.agg_significant_terms(
        "term", field=terms_field, size=size, shard_size=shard_size
    )
    counts = significant.execute().df(include=["term", "term.doc_count"])

    # highest count of each significant term over all buckets
    max_counts = counts.groupby("term").max()

    # keep only the terms that stay below the 0.8 quantile of those maxima
    below_quantile = max_counts[max_counts < max_counts.quantile(.8)].dropna()
    keywords = list(below_quantile.index)

    term_filters = {word: query.Term(terms_field, word) for word in keywords}
    by_split = s.agg_terms(split_field, field=split_field, size=split_size)
    by_term = by_split.agg_filters("term", filters=term_filters)
    by_term.execute().plot.heatmap(
        sort=True,
        transpose=True,
        replace={0: np.nan},
        annot=True,
        fmt=".0f",
        figsize=(.23, .6),
        cmap="gray_r",
    )

significant_terms_by_terms(search(), split_field="author", terms_field="message")

## files

### overall top 50 edited files per year

In [None]:
# Heatmap of the top 50 `changes.file` terms, each split into yearly buckets.
s = search()
agg = s.agg_terms(field="changes.file", size=50)
agg = agg.agg_date_histogram("date", calendar_interval="year")
# The return value of the plot call is not needed; the trailing semicolon
# keeps it out of the cell output without binding it to a misleading name.
agg.execute().plot.heatmap(
    sort=True, replace={0: np.nan},
    annot=True, fmt=".0f", figsize=(.3, 1.5), cmap="gray_r"
);

### significant changed files by year

In [None]:
base = search().param(rest_total_hits_as_int=True)
# negate the query to remove version specific files (*.txt, *.rst)
s = ~base.query_string("changes.file: *.txt *.rst")
significant_terms_by_year(s, field="changes.file")

### significant changed files by author

In [None]:
# same analysis as for the commit messages, but on the changed file names
significant_terms_by_terms(
    search(),
    split_field="author",
    terms_field="changes.file",
)

### which files get edited together with `__init__.py`

In [None]:
# Restrict the search to commits matching `__init__.py` and show the top 50
# `changes.file` terms of those commits per year.
s = search()
s = s.query_string("changes.file: __init__.py")

agg = s.agg_terms(field="changes.file", size=50)
agg = agg.agg_date_histogram("date", calendar_interval="year")
try:
    agg.execute().plot.heatmap(figsize=(.3, 1.5), cmap="gray_r")
except ValueError as error:
    # Plotting stays best-effort, but report why the heatmap is missing
    # instead of silently showing nothing.
    print(f"heatmap skipped: {error}")