In [None]:
# NOTES
# If you are not running in the local dev environment (perceval_venv / health-env),
# install GrimoireLab (which provides Perceval) first.
#
# Install the pip package into the current Jupyter kernel's environment:
# >import sys
# >!{sys.executable} -m pip install grimoirelab
#
# Command-line usage:
# >!pip3 install grimoirelab
# >!perceval git 'https://github.com/bitcoin/bitcoin.git' --from-date '2021-07-14'
#
# web tutorial: https://chaoss.github.io/grimoirelab-tutorial/perceval/git.html
# perceval docs: https://perceval.readthedocs.io/en/latest/

In [None]:
from perceval.backends.core.git import Git
from datetime import datetime
import dateutil.rrule
import dateutil.tz
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns
import time
import csv

# file I/O for running a colab notebook
#from google.colab import files

In [6]:
def fetchTimeZonesArray(repoURL, repoName, from_Date, to_Date):
    """Count a repository's commits per whole-hour UTC offset of the commit date.

    Perceval is pointed at the local path '/tmp/<repoName>.git' for cloning.
    That path is reused between runs, so repoName must be unique per repo.

    Parameters
    ----------
    repoURL : str
        URI of the git repository, handed to Perceval's Git backend.
    repoName : str
        Short name used for the local clone path and in the log output.
    from_Date, to_Date
        Passed unchanged to Git.fetch() as from_date / to_date
        (the caller in this notebook passes datetime objects).

    Returns
    -------
    list of int
        24 counters. Index i holds the number of commits whose UTC offset is
        (i - 12) hours: index 0 is UTC-12, index 12 is UTC, index 23 is UTC+11.
        Commits whose 'CommitDate' is missing or cannot be parsed (e.g. it has
        no timezone) are not binned; they are counted and reported instead.
    """
    # expected layout of the CommitDate string,
    # e.g. 'Tue Aug 14 14:30:13 2012 -0300'
    dateFormat = '%a %b %d %H:%M:%S %Y %z'
    hoursPerDay = 24

    # directory for letting Perceval clone the git repo
    repo_dir = '/tmp/' + repoName + '.git'

    # create a Git object, pointing to repoURL: the github repo path
    # and repo_dir: the local path for cloning
    repo_object = Git(uri = repoURL, gitpath = repo_dir)

    count = 0
    noTZ = 0
    # timeData list has 1 position for each whole-hour timezone offset
    timeData = [0] * hoursPerDay

    for commit in repo_object.fetch(from_date = from_Date, to_date = to_Date):
        count += 1
        # progress indicator: one dot per 1000 commits seen
        if count % 1000 == 0:
            print('.', end='')

        # commit date is a string; parsing with %z yields an aware datetime
        commitDate = commit['data'].get('CommitDate')
        try:
            dateTimeObject = datetime.strptime(commitDate, dateFormat)
        except (TypeError, ValueError):
            # missing date, or a date without a parsable timezone:
            # count it and keep going rather than abandoning the whole repo
            noTZ += 1
            continue

        # convert to an offset in whole hours; half & quarter-hour offsets
        # are taken as the floor, e.g. India (+5.5) -> +5, -3.5 -> -4
        offset = int(dateTimeObject.utcoffset().total_seconds() // 3600)

        # Bins cover -12..+11, so offsets of +12 and above wrap around by
        # -24 hours (needed for the polar plot):
        # +12 -> -12: e.g. NZ
        # +13 -> -11: NZDT daylight saving, Tokelau, Samoa, Tonga
        # +14 -> -10: Kiritimati (Christmas Island), Kiribati
        timeData[(offset + 12) % hoursPerDay] += 1

    # stats
    percentage = (noTZ / count) * 100 if count > 0 else 0
    print('finished fetching data from:', repoName)
    if noTZ != 0:
        print('number with null timezone:', noTZ, '(', round(percentage, 1),'%)')

    # return the list of per-offset commit counts
    return timeData

In [None]:
# Load the repository list; later cells read its 'name' and 'url' columns.
repos = pd.read_csv('merged.csv')

In [7]:
# Work on an independent copy of the first 15 repositories.
# NOTE: despite its name, repos5 holds up to 15 rows, not 5.
repos5 = repos.head(15).copy()

In [None]:
# collect 6 months up to date of final GHTorrent download (March 26, 2022)
fromDate = datetime(2021, 9, 25) # from is inclusive
toDate = datetime(2022, 3, 26)  # to is exclusive

# cumulative main loop
# One Series of per-offset commit counts per successfully fetched repository.
# They are joined into a single frame once, after the loop, instead of
# re-concatenating (and re-copying) the growing frame on every iteration.
repoColumns = []
count = 0

for _, row in repos5.iterrows():
    # terminal logging
    count += 1
    repoName = row['name']
    print('>> Repo #',count, repoName)
    repoURL = row['url']

    # skip rows without a usable URL: missing, or marked 'private' / 'exclude'
    if pd.isna(repoURL) or repoURL in ('private', 'exclude'):
        continue

    try:
        data = fetchTimeZonesArray(repoURL, repoName, fromDate, toDate)
    except Exception as e:
        # best effort: report the failure and carry on with the next repository
        print(f"An error occurred while fetching the repository {repoURL}. Error: {e}")
        # You can also log the error message to a log file
        # logger.error(f"An error occurred while fetching the repository {repoURL}. Error: {e}")
        continue

    # one column per repository, labelled with its URL
    repoColumns.append(pd.Series(data, name=repoURL))

# pd.concat raises on an empty list, so fall back to an empty frame
allData = pd.concat(repoColumns, axis=1) if repoColumns else pd.DataFrame()

In [None]:
# Summarize the collected frame (one column per fetched repository URL).
allData.info()

In [None]:
# Persist the per-repository offset counts; the row index is not written.
allData.to_csv("allData600.csv",index=False)