# GitHub Repo Selection Process

In [38]:
import os
import pickle
import random
import sqlite3 as sql

import pandas as pd
from github import Github

# Authenticated GitHub client used by every API call in this notebook.
# The password is read from the GITHUB_PASSWORD environment variable so the
# credential never has to be stored in the notebook; when the variable is
# unset this falls back to the empty string.
g = Github('nicholas-alonzo', os.environ.get('GITHUB_PASSWORD', ''))

## Repo Conditions

Sample 10 repos with the following conditions

- Is not by an organization
- Created on or after 2016-01-01
- Is in English
- Has no activity before 2016-01-01 (manually check their activity on GitHub)
- Has at least 5 contributors to date
- Has between 75 and 100 forks to date
- Is at least a megabyte in size
- Has at least 100 stars to date
- Has at least 10 open issues to date
- A push occurred sometime after 2016-12-31

In [2]:
# THIS WAS RUN MAY 7TH 2017 ~ 2:00 AM PST
# The search qualifiers cover creation date, fork count, size and stars; the
# remaining conditions are checked per repository in the loop below.
# NOTE(review): GitHub's search API caps how many results a single query can
# return -- confirm that the whole result set was retrieved.
repos = g.search_repositories(query='created:>=2016-01-01 forks:75..100 size:>=1000 stars:>=100')
reposL = list(repos)

MIN_OPEN_ISSUES = 10
MIN_CONTRIBUTORS = 5
# "A push occurred sometime after 2016-12-31" means the earliest qualifying
# instant is midnight on 2017-01-01. Comparing with >= '2016-12-31' would also
# admit pushes made during 2016-12-31 itself.
PUSHED_ON_OR_AFTER = pd.to_datetime('2017-01-01')

repos_subL = []

for repo in reposL:
    # Attribute checks come first so the contributor listing is only requested
    # for repos that already satisfy them.
    if repo.organization:
        continue
    if repo.open_issues_count < MIN_OPEN_ISSUES:
        continue
    if repo.pushed_at < PUSHED_ON_OR_AFTER:
        continue
    # Stop iterating over contributors as soon as the minimum is reached.
    for user_count, user in enumerate(repo.get_contributors(), 1):
        if user_count >= MIN_CONTRIBUTORS:
            repos_subL.append(repo)
            break

### Save the potential repos 

In [6]:
# pickle.dump( repos_subL, open( "repos.p", "wb" ) )

# Reload the candidate repos from repos.p. The context manager closes the file
# handle as soon as the load finishes (the bare open() call never closed it).
# NOTE: unpickling can execute arbitrary code -- only load a repos.p that was
# produced by this notebook.
with open("repos.p", "rb") as repos_file:
    L = pickle.load(repos_file)

## Sample 10 repos

Here we'll have to check manually that the repos
- are in English
- have no activity before 2016-01-01

In [10]:
# Seed the module-level RNG immediately before drawing, so this sample of 10
# candidates from L is determined by the seed alone.
random.seed(32824)
random.sample(L, 10)

[Repository(full_name="diegohaz/arc"),
 Repository(full_name="timusus/RecyclerView-FastScroll"),
 Repository(full_name="mikeal/roll-call"),
 Repository(full_name="tidusjar/Ombi"),
 Repository(full_name="JimBobSquarePants/ImageSharp"),
 Repository(full_name="Wilfred/remacs"),
 Repository(full_name="TheOfficialFloW/VitaShell"),
 Repository(full_name="scop/bash-completion"),
 Repository(full_name="andrew-worsfold/tailor"),
 Repository(full_name="krzysztofzablocki/Sourcery")]

- diegohaz/arc
- timusus/RecyclerView-FastScroll
- mikeal/roll-call
- tidusjar/Ombi
- TheOfficialFloW/VitaShell
- andrew-worsfold/tailor
- krzysztofzablocki/Sourcery

In [4]:
# Draw 3 more candidates from L.
# NOTE(review): no seed is set in this cell, so the result depends on the state
# of the module-level RNG left by whichever cells ran before it. The draw is
# taken from all of L, so it can return repos that were already sampled.
random.sample(L, 3)

[Repository(full_name="nolanlawson/optimize-js"),
 Repository(full_name="MoyanZitto/keras-cn"),
 Repository(full_name="jshjohnson/Choices")]

- nolanlawson/optimize-js
- jshjohnson/Choices

In [5]:
# Draw 1 more candidate from L.
# NOTE(review): no seed is set in this cell, so the result depends on the state
# of the module-level RNG left by whichever cells ran before it. The draw is
# taken from all of L, so it can return a repo that was already sampled.
random.sample(L, 1)

[Repository(full_name="BrandonJoffe/home_surveillance")]

The final repos satisfying the conditions above are

- diegohaz/arc
- timusus/RecyclerView-FastScroll
- mikeal/roll-call
- tidusjar/Ombi
- TheOfficialFloW/VitaShell
- andrew-worsfold/tailor
- krzysztofzablocki/Sourcery
- nolanlawson/optimize-js
- jshjohnson/Choices
- BrandonJoffe/home_surveillance

## Selecting the root repos and their forks

In [90]:
# Full names of the ten selected repos.
final_repos = [
    'diegohaz/arc',
    'timusus/RecyclerView-FastScroll',
    'mikeal/roll-call',
    'tidusjar/Ombi',
    'TheOfficialFloW/VitaShell',
    'andrew-worsfold/tailor',
    'krzysztofzablocki/Sourcery',
    'nolanlawson/optimize-js',
    'jshjohnson/Choices',
    'BrandonJoffe/home_surveillance',
]

# Pick the matching Repository objects out of the reloaded candidates,
# keeping the order in which they appear in L.
root_repos = [repo for repo in L if repo.full_name in final_repos]

root_repos

[Repository(full_name="nolanlawson/optimize-js"),
 Repository(full_name="krzysztofzablocki/Sourcery"),
 Repository(full_name="jshjohnson/Choices"),
 Repository(full_name="mikeal/roll-call"),
 Repository(full_name="diegohaz/arc"),
 Repository(full_name="andrew-worsfold/tailor"),
 Repository(full_name="timusus/RecyclerView-FastScroll"),
 Repository(full_name="tidusjar/Ombi"),
 Repository(full_name="TheOfficialFloW/VitaShell"),
 Repository(full_name="BrandonJoffe/home_surveillance")]

In [99]:
def _repo_record(repository, forked_from):
    """Build the metadata row for one repository.

    forked_from is the id of the root repo a fork came from, or None for a
    root repo.
    """
    return {
        'repo_id': repository.id,
        'full_name': repository.full_name,
        'name': repository.name,
        'created': repository.created_at,
        'fork': repository.fork,
        'owner_id': repository.owner.id,
        'forked_from': forked_from,
    }


# Map every root repo and each of its forks (keyed by the Repository object
# itself) to its metadata row.
repos = {}
for repo in root_repos:
    repos[repo] = _repo_record(repo, None)

    for fork in repo.get_forks():
        repos[fork] = _repo_record(fork, repo.id)

In [101]:
# One row per repository record. list(...) keeps this working whether
# dict.values() returns a list (Python 2) or a view (Python 3).
repos_table = pd.DataFrame(list(repos.values()))
# Parenthesised print is valid under both Python 2 and Python 3.
print(len(repos_table))
repos_table.head()

857


Unnamed: 0,created,fork,forked_from,full_name,name,owner_id,repo_id
0,2016-08-01 01:56:29,True,53949203.0,thedang/Choices,Choices,8512497,64627212
1,2016-10-18 13:34:50,True,65116637.0,Widea/roll-call,roll-call,1750248,71251692
2,2017-02-19 06:40:59,True,64702298.0,tskarthikeyann/home_surveillance,home_surveillance,8179433,82440697
3,2016-08-19 00:33:16,True,53949203.0,warplat/Choices,Choices,9009452,66039958
4,2017-04-03 21:26:01,True,64702298.0,baalsaawe/home_surveillance,home_surveillance,13794487,87122397


## Getting the users from repos and their forks

In [94]:
# Collect one record per contributor object returned for each tracked repo
# (roots and forks alike), keyed by that contributor object.
# NOTE(review): a user contributing to several repos is kept once per repo only
# if these objects hash by identity -- confirm for the installed PyGithub.
users = {}
for repo in repos:
    for user in repo.get_contributors():
        users[user] = dict(user_id=user.id, login=user.login, repo_id=repo.id)

In [95]:
# One row per contributor record. list(...) keeps this working whether
# dict.values() returns a list (Python 2) or a view (Python 3).
contributors_table = pd.DataFrame(list(users.values()))
# Parenthesised print is valid under both Python 2 and Python 3.
print(len(contributors_table))
contributors_table.head()

7148


Unnamed: 0,login,repo_id,user_id
0,vknabel,78084667,2100336
1,nubbel,78084667,118781
2,FilipZawada,78084667,1159454
3,timusus,88264079,4422616
4,wzs,78084667,599145


### Saving the repo and users data structures

In [102]:
# pickle.dump( repos, open( "repos_ds.p", "wb" ) )
# pickle.dump( users, open( "users_ds.p", "wb" ) )

## Adding the repo and contributors into a database

In [103]:
# Open a connection to the SQLite file github.db in the working directory.
con = sql.connect('github.db')

In [104]:
# Write both tables, then always release the connection -- even when a write
# raises (for example because the table already exists from an earlier run).
try:
    repos_table.to_sql(name='repos', con=con, index=False)
    contributors_table.to_sql(name='users', con=con, index=False)
finally:
    con.close()

## Output a csv for using in BigQuery to retrieve all events for all repos

In [106]:
# Export the repo table without the DataFrame index column.
repos_table.to_csv('repos_table.csv', index=False)