In [19]:
import numpy as np
import pandas as pd
from datetime import datetime

In [23]:
df = pd.read_csv("/home/sv/data/snapshot_fork_duplicates_removed_fixed.csv.gz", header=0, compression="gzip", names=["snapshot_id", "fork_id", "date"])
df.head()

Unnamed: 0,snapshot_id,fork_id,date
0,10000129,10000126,1433557281
1,1000048,1000047,1410017561
2,10002669,10002668,1389857537
3,10002835,42351765,1418498280
4,10002835,25311588,1398130898


In [25]:
df['year'] = df.apply(lambda row: datetime.utcfromtimestamp(row['date']).year, axis=1)
df['month'] = df.apply(lambda row: datetime.utcfromtimestamp(row['date']).month, axis=1)
df['day'] = df.apply(lambda row: datetime.utcfromtimestamp(row['date']).day, axis=1)

In [26]:
df.head()

Unnamed: 0,snapshot_id,fork_id,date,year,month,day
0,10000129,10000126,1433557281,2015,6,6
1,1000048,1000047,1410017561,2014,9,6
2,10002669,10002668,1389857537,2014,1,16
3,10002835,42351765,1418498280,2014,12,13
4,10002835,25311588,1398130898,2014,4,22


In [27]:
df['year'].nunique()

9

In [42]:
snapshot_fork_dict = dict()
for i, row in df.iterrows():
    if row.snapshot_id not in snapshot_fork_dict:
        snapshot_fork_dict[row.snapshot_id] = dict()
        
    if row.year not in snapshot_fork_dict[row.snapshot_id]:
        snapshot_fork_dict[row.snapshot_id][row.year] = [1, dict()]
    else:
        snapshot_fork_dict[row.snapshot_id][row.year][0] += 1
    
    if row.month not in snapshot_fork_dict[row.snapshot_id][row.year][1]:
        snapshot_fork_dict[row.snapshot_id][row.year][1][row.month] = 1
    else:
        snapshot_fork_dict[row.snapshot_id][row.year][1][row.month] += 1

print("The number of project with forks:", len(snapshot_fork_dict))
for snapshot, revision_date_set in snapshot_fork_dict.items():
    print(snapshot, len(revision_date_set))

('The number of project with forks:', 16)
(10005216, 2)
(10000129, 1)
(1000482, 4)
(1000355, 1)
(10004260, 1)
(10011144, 1)
(10010985, 1)
(10011244, 9)
(10008940, 2)
(10002669, 1)
(10004589, 3)
(1000048, 1)
(10002835, 6)
(10010708, 2)
(10010646, 1)
(10006621, 1)


In [48]:
snapshot_ids = []
count_forks = []
count_years_exist = [] # the number of years between the first and last forks
count_months_exist = []
mean_forks = []
count_years_with_forks = []
count_months_with_forks = []
for snapshot, revision_date_set in snapshot_fork_dict.items():
#     print(snapshot)
    
    forks_year = []
    forks_month = []
    for year, (frequency, revision_date_set_month) in revision_date_set.items():
        forks_year.append((year, frequency))
        for month, freq in revision_date_set_month.items():
            forks_month.append((year, month, freq))
    forks_year.sort()
    forks_month.sort()
    count_fork = 0
    count_years_with_fork = 0
    count_months_with_fork = 0
    years = []
    months = []
    days = []
    for year, frequency in forks_year:
        years.append(year)
        count_fork += frequency
        count_years_with_fork += 1
#         print(year, frequency)
    for year, month, freq in forks_month:
        months.append((year, month))
        count_months_with_fork += 1
    months_exist = 0
    if months[-1][0] == months[0][0]:
        months_exist = months[-1][1] - months[0][1] + 1
    elif months[-1][1] < months[0][1]:
        extra_months = 12 * (months[-1][0] - months[0][0]) - 1
        months_exist = months[-1][1] + (12 - months[0][1] + 1) + extra_months
    else:
        extra_months = 12 * (months[-1][0] - months[0][0])
        months_exist = months[-1][1] - months[0][1] + extra_months
        
    snapshot_ids.append(snapshot)
    count_forks.append(count_fork)
    count_years_with_forks.append(count_years_with_fork)
    count_months_with_forks.append(count_months_with_fork)
    years_exist = years[-1] - years[0] + 1
    count_years_exist.append(years_exist)
    count_months_exist.append(months_exist)
    mean_forks.append(count_fork / years_exist)

df_fork_features = pd.DataFrame.from_dict({
    "snapshot_id": snapshot_ids,
    "count_forks": count_forks,
    "mean_forks": mean_forks,
    "count_years_with_forks": count_years_with_forks,
    "count_months_with_forks": count_months_with_forks,
    "count_years_exist": count_years_exist,
    "count_months_exist": count_months_exist
}, orient='index').transpose()

df_fork_features.head(16)

# number of forks
# mean number of forks by year
# number of year with at least one fork
# number of month with at least one fork
# number of days with at least one fork
# longest streak of years with at least one fork
# longest streak of months with at least one fork


Unnamed: 0,count_forks,snapshot_id,count_years_with_forks,mean_forks,count_years_exist,count_months_exist,count_months_with_forks
0,5,10005216,2,2,2,14,5
1,1,10000129,1,1,1,1,1
2,5,1000482,4,1,4,45,5
3,1,1000355,1,1,1,1,1
4,1,10004260,1,1,1,1,1
5,1,10011144,1,1,1,1,1
6,1,10010985,1,1,1,1,1
7,46482,10011244,9,5164,9,97,86
8,3,10008940,2,1,2,16,3
9,1,10002669,1,1,1,1,1


In [50]:
df_fork_features.to_csv("/home/sv/data/fork_pattern.csv")