# GitHub Repo Selection Process

In [1]:
import pandas as pd
import random
from github import Github
import pickle
import sqlite3 as sql
from datetime import datetime
import time

g = Github('nicholas-alonzo', '')

## Root Repo Conditions

Sample ~100 repos with the following conditions

- Is not by an organization
- Created on 2016-01-01 and/or later
- Is in English (manually check)
- Has no activity before 2016-01-01 (manually check their activity on GitHub)
- Has at least 2 contributors to date
- Has at least 50 forks to date
- Size of repo is >= 10 KB
- Has issues
- is not a github.io project
- Has a language
- Has at least 100 stars to date (a way of showing "popularity")
- The repo was updated sometime after 2016-12-31 (shows its still in use)

In [78]:
# THIS WAS RUN MAY 11TH 2017 ~ 4:26 PM PST
# NOTE: this will return the first 1000 results as stated in the search API documentation 
# however we use the slicing as asking for more than 1000 will return an error 
# Also note that the sort updated desc will give us repos that were updated as of today
repos = g.search_repositories(query='created:>=2016-01-01 size:>=10 forks:>=100 stars:>=100', sort='updated', order='desc')

reposL = []
for repo in repos[:1000]:
    reposL.append(repo)

In [4]:
len(reposL)

900

In [8]:
repos_subL = []

for repo in reposL:
    user_count = 0
    cond1 = repo.organization
    cond2 = repo.pushed_at >= pd.to_datetime('2016-12-31')
    cond3 = repo.language
    cond4 = repo.has_issues
    if not cond1 and cond2 and cond3 and cond4:
        for user in repo.get_contributors():
            user_count += 1
            if user_count >= 2:
                repos_subL.append(repo)
                break
            else:
                pass
    else:
        pass

In [11]:
len(repos_subL)

301

### Save the potential repos 

In [12]:
# pickle.dump( repos_subL, open( "repos.p", "wb" ) )

L = pickle.load( open( "repos.p", "rb" ) )
len(L)

301

### Make sure no repos are repeated

In [13]:
S = set()
for repos in L:
    S.add(repos.full_name)
len(S)

284

## Sample 100 repos

Here we'll have to check maunally that the repos
- are in English
- have no activity before 2016-01-01

### Run 1

In [14]:
random.seed(32824)
rsamp = random.sample(S, 100)

for repo in rsamp:
    print 'https://github.com/'+ repo + '/graphs/contributors'

https://github.com/mchristopher/PokemonGo-DesktopMap/graphs/contributors
https://github.com/Medicean/VulApps/graphs/contributors
https://github.com/ashqal/MD360Player4iOS/graphs/contributors
https://github.com/pchmn/MaterialChipsInput/graphs/contributors
https://github.com/eliangcs/http-prompt/graphs/contributors
https://github.com/a466350665/smart/graphs/contributors
https://github.com/sindresorhus/refined-github/graphs/contributors
https://github.com/florent37/DiagonalLayout/graphs/contributors
https://github.com/endernewton/tf-faster-rcnn/graphs/contributors
https://github.com/DaveGamble/cJSON/graphs/contributors
https://github.com/Codeido/PMAlertController/graphs/contributors
https://github.com/mvantellingen/python-zeep/graphs/contributors
https://github.com/aritraroy/UltimateAndroidReference/graphs/contributors
https://github.com/ciar4n/imagehover.css/graphs/contributors
https://github.com/fenbf/AwesomePerfCpp/graphs/contributors
https://github.com/jopohl/urh/graphs/contributors
h

From checking manually, we dont include the following 

- Medicean/VulApps
- ashqal/MD360Player4Android
- a466350665/smart
- DaveGamble/cJSON
- sjdy521/Mojo-Weixin
- lzjun567/zhihu-api
- nacker/LZEasemob3
- Urinx/WeixinBot
- jaywcjlove/linux-command
- Yorko/mlcourse_open
- viosey/hexo-theme-material
- liuwons/wxBot
- tencent-wechat/phxsql
- photonstorm/phaser-ce
- ashqal/MD360Player4Android
- Binaryify/NeteaseCloudMusicApi
- sohutv/cachecloud
- sumory/orange
- airyland/vux
- hwdsl2/setup-ipsec-vpn
- arterli/CmsWing
- jaywcjlove/awesome-mac
- xiongwilee/koa-grace
- fancymax/12306ForMac
- jeasonlzy/NineGridView
- jinfagang/weibo_terminater
- codeestX/GeekNews
- summerblue/phphub5
- dgrtwo/tidy-text-mining
- BrikerMan/BMPlayer
- wufeifei/cobra
- yscoder/hexo-theme-indigo
- zhaohaodang/vue-WeChat

In [15]:
run1_exclude = ['Medicean/VulApps', 'ashqal/MD360Player4Android', 'a466350665/smart', 
                'DaveGamble/cJSON', 'sjdy521/Mojo-Weixin', 'lzjun567/zhihu-api', 'nacker/LZEasemob3',
                'Urinx/WeixinBot', 'jaywcjlove/linux-command', 'Yorko/mlcourse_open', 'viosey/hexo-theme-material',
                'liuwons/wxBot', 'tencent-wechat/phxsql', 'photonstorm/phaser-ce', 'ashqal/MD360Player4Android'
                'Binaryify/NeteaseCloudMusicApi', 'sohutv/cachecloud', 'sumory/orange', 'airyland/vux',
                'hwdsl2/setup-ipsec-vpn','arterli/CmsWing','jaywcjlove/awesome-mac','xiongwilee/koa-grace',
                'fancymax/12306ForMac','jeasonlzy/NineGridView','jinfagang/weibo_terminater',
                'codeestX/GeekNews','summerblue/phphub5','dgrtwo/tidy-text-mining',
                'BrikerMan/BMPlayer','wufeifei/cobra','yscoder/hexo-theme-indigo','zhaohaodang/vue-WeChat']

include = filter(lambda x: x not in run1_exclude, rsamp)

In [16]:
len(include)

69

In [17]:
S_final = set()

for repo in include:
    S_final.add(repo)

### Run 2

In [18]:
random.seed(32)
rsamp2 = filter(lambda x: x not in S_final, random.sample(S, 31))

for repo in rsamp2:
    print 'https://github.com/'+ repo + '/graphs/contributors'

https://github.com/sherxon/AlgoDS/graphs/contributors
https://github.com/SimulatedGREG/electron-vue/graphs/contributors
https://github.com/pandolia/qqbot/graphs/contributors
https://github.com/SergioBenitez/Rocket/graphs/contributors
https://github.com/vanniktech/Emoji/graphs/contributors
https://github.com/liudongmiao/Brevent/graphs/contributors
https://github.com/yoshuawuyts/choo/graphs/contributors
https://github.com/ybq/Android-SpinKit/graphs/contributors
https://github.com/learncodeacademy/react-js-tutorials/graphs/contributors
https://github.com/viosey/hexo-theme-material/graphs/contributors
https://github.com/alexellis/faas/graphs/contributors
https://github.com/tqchen/tinyflow/graphs/contributors
https://github.com/KunalKapadia/express-mongoose-es6-rest-api/graphs/contributors
https://github.com/tencent-wechat/phxpaxos/graphs/contributors
https://github.com/kpwn/iOSRE/graphs/contributors
https://github.com/keon/algorithms/graphs/contributors
https://github.com/nacker/LZEasemob3

From checking manually, we dont include the following 

- pandolia/qqbot
- viosey/hexo-theme-material
- tencent-wechat/phxpaxos
- nacker/LZEasemob3

In [19]:
run2_exclude = ['pandolia/qqbot', 'viosey/hexo-theme-material','tencent-wechat/phxpaxos','nacker/LZEasemob3']

include2 = filter(lambda x: x not in run2_exclude, rsamp2)

for repos in include2:
    S_final.add(repos)

### Run 3

In [20]:
random.seed(33)
rsamp3 = filter(lambda x: x not in S_final, random.sample(S, 13))

for repo in rsamp3:
    print 'https://github.com/'+ repo + '/graphs/contributors'

https://github.com/Miserlou/Zappa/graphs/contributors
https://github.com/wkh237/react-native-fetch-blob/graphs/contributors
https://github.com/uraimo/Awesome-Swift-Playgrounds/graphs/contributors
https://github.com/fechanique/cordova-plugin-fcm/graphs/contributors
https://github.com/We5ter/Scanners-Box/graphs/contributors
https://github.com/kdchang/reactjs101/graphs/contributors
https://github.com/xubinux/xbin-store/graphs/contributors
https://github.com/eggswift/ESTabBarController/graphs/contributors
https://github.com/nosir/cleave.js/graphs/contributors


From checking manually, we dont include the following

- We5ter/Scanners-Box
- kdchang/reactjs101
- xubinux/xbin-store
- eggswift/ESTabBarController

In [21]:
run3_exclude = ['We5ter/Scanners-Box', 'kdchang/reactjs101', 'xubinux/xbin-store', 'eggswift/ESTabBarController']

include3 = filter(lambda x: x not in run3_exclude, rsamp3)

for repos in include3:
    S_final.add(repos)

### Run 4

In [22]:
random.seed(123213)
rsamp4 = filter(lambda x: x not in S_final, random.sample(S, 8))

for repo in rsamp4:
    print 'https://github.com/'+ repo + '/graphs/contributors'

https://github.com/AeonLucid/POGOProtos/graphs/contributors
https://github.com/521xueweihan/HelloGitHub/graphs/contributors
https://github.com/picturepan2/spectre/graphs/contributors
https://github.com/sergiokopplin/indigo/graphs/contributors
https://github.com/AWEEKJ/Kiko-plus/graphs/contributors
https://github.com/thelong1EU/SpaceTabLayout/graphs/contributors


From checking manually, we dont include the following

- 521xueweihan/HelloGitHub
- AWEEKJ/Kiko-plus

In [23]:
run4_exclude = ['521xueweihan/HelloGitHub', 'AWEEKJ/Kiko-plus']

include4 = filter(lambda x: x not in run4_exclude, rsamp4)

for repos in include4:
    S_final.add(repos)

### Run 5

In [24]:
random.seed(32342438)
rsamp5 = filter(lambda x: x not in S_final, random.sample(S, 4))

for repo in rsamp5:
    print 'https://github.com/'+ repo + '/graphs/contributors'

https://github.com/transcranial/keras-js/graphs/contributors
https://github.com/atomiks/tippyjs/graphs/contributors
https://github.com/pubkey/rxdb/graphs/contributors


In [25]:
for repos in rsamp5:
    S_final.add(repos)

In [26]:
len(S_final)

99

### Run 6

In [27]:
random.seed(98790)
rsamp6 = filter(lambda x: x not in S_final, random.sample(S, 1))

for repo in rsamp6:
    print 'https://github.com/'+ repo + '/graphs/contributors'

https://github.com/nfmcclure/tensorflow_cookbook/graphs/contributors


In [28]:
for repo in rsamp6:
    S_final.add(repo)

## Final root repos

In [29]:
S_final

{u'AeonLucid/POGOProtos',
 u'BeauNouvelle/FaceAware',
 u'Binaryify/NeteaseCloudMusicApi',
 u'Bottelet/Flarepoint-crm',
 u'BradLarson/GPUImage2',
 u'BurntSushi/ripgrep',
 u'CharlesShang/FastMaskRCNN',
 u'Codeido/PMAlertController',
 u'EyreFree/EFQRCode',
 u'FezVrasta/popper.js',
 u'JeffreyWay/laravel-mix',
 u'KunalKapadia/express-mongoose-es6-rest-api',
 u'LukyVj/family.scss',
 u'Maratyszcza/NNPACK',
 u'Miserlou/Zappa',
 u'Plailect/Guide',
 u'Roshanjossey/first-contributions',
 u'SergioBenitez/Rocket',
 u'SimulatedGREG/electron-vue',
 u'TadasBaltrusaitis/OpenFace',
 u'VoLuong/Awesome-Linux-Software',
 u'XProger/OpenLara',
 u'afollestad/polar-dashboard',
 u'alexellis/faas',
 u'amitshekhariitbhu/Android-Debug-Database',
 u'amitshekhariitbhu/Fast-Android-Networking',
 u'aritraroy/UltimateAndroidReference',
 u'ashqal/MD360Player4iOS',
 u'atomiks/tippyjs',
 u'aurelhubert/ahbottomnavigation',
 u'carlos8f/zenbot',
 u'ciar4n/imagehover.css',
 u'cookkkie/mee6',
 u'daijifeng001/R-FCN',
 u'dheraul

In [30]:
final_repos = []
for repos in S_final:
    final_repos.append(g.get_repo(str(repos), lazy=False))
    
final_repos

[Repository(full_name="cookkkie/mee6"),
 Repository(full_name="ro31337/libretaxi"),
 Repository(full_name="krzysztofzablocki/Sourcery"),
 Repository(full_name="terkelg/ramme"),
 Repository(full_name="kevinhughes27/TensorKart"),
 Repository(full_name="fenbf/AwesomePerfCpp"),
 Repository(full_name="fechanique/cordova-plugin-fcm"),
 Repository(full_name="soeaver/caffe-model"),
 Repository(full_name="afollestad/polar-dashboard"),
 Repository(full_name="ewhal/nyaa"),
 Repository(full_name="garretyoder/Colorful"),
 Repository(full_name="keon/algorithms"),
 Repository(full_name="ianstormtaylor/slate"),
 Repository(full_name="pubkey/rxdb"),
 Repository(full_name="huttarichard/instagram-private-api"),
 Repository(full_name="Plailect/Guide"),
 Repository(full_name="notwaldorf/tiny-care-terminal"),
 Repository(full_name="ivpusic/react-native-image-crop-picker"),
 Repository(full_name="aritraroy/UltimateAndroidReference"),
 Repository(full_name="tqchen/tinyflow"),
 Repository(full_name="logaretm/v

In [104]:
# save the final_repos for reference
# pickle.dump( final_repos, open( "final_repos.p", "wb" ) )

## Requesting root repos and their forks

In [None]:
# load up the final_repos list
# final_repos = pickle.load( open( "final_repos.p", "rb" ) )

In [2]:
# dictionary data structure
# repos_ds = {}

In [None]:
# load up the repos data structure to continue processing
# repos_ds = pickle.load( open( "repos_ds.p", "rb" ) )

In [230]:
for repo in final_repos:
    
    # check how far we're into the list
    print repo.full_name
    
    if repo.full_name not in repos_ds:
        repos_ds[repo.full_name] = {'repo':repo, 'repo_id': repo.id, 'full_name': repo.full_name, 'name': repo.name, 
                                    'created': repo.created_at, 'fork': repo.fork, 'owner_id': repo.owner.id, 
                                    'forked_from':None, 'url':repo.html_url, 'stargazers':repo.stargazers_count, 
                                    'watchers':repo.subscribers_count, 'forks':repo.forks_count, 
                                    'description':repo.description, 'language':repo.language, 
                                    'time_requested':datetime.now()}
    else:
        pass
    
    for fork in repo.get_forks():
        
        # delay time to get back the 5000 request limit per hour
        if g.get_rate_limit().rate.remaining < 2:
            print 'Going to pause for an hour now'
            time.sleep(3600)
    
        if fork.full_name not in repos_ds:
            repos_ds[fork.full_name] = {'repo':fork, 'repo_id': fork.id, 'full_name': fork.full_name, 
                                        'name': fork.name, 'created': fork.created_at, 'fork': fork.fork, 
                                        'owner_id': fork.owner.id, 'forked_from':repo.id, 'url':fork.html_url, 
                                        'stargazers':fork.stargazers_count, 'watchers':fork.subscribers_count, 
                                        'forks':fork.forks_count, 'description':fork.description, 
                                        'language':fork.language, 'time_requested':datetime.now()}
        else:
            pass

picturepan2/spectre
nosir/cleave.js
yoshuawuyts/choo
alexellis/faas
EyreFree/EFQRCode
FezVrasta/popper.js
vanniktech/Emoji
Maratyszcza/NNPACK
mhinz/vim-galore
CharlesShang/FastMaskRCNN


In [4]:
repos_table.head()

Unnamed: 0,created,description,fork,forked_from,forks,full_name,language,name,owner_id,repo,repo_id,stargazers,time_requested,url,watchers
0,2017-05-11 18:38:08,🛠️⚡ Step-by-step tutorial to build a modern J...,True,69798748.0,0,glongh/js-stack-from-scratch,JavaScript,js-stack-from-scratch,1266111,"Repository(full_name=""glongh/js-stack-from-scr...",91012066,0,2017-05-14 18:16:41.037042,https://github.com/glongh/js-stack-from-scratch,1
1,2017-04-10 17:00:42,Implementation of Algorithms and Data Structur...,True,76670734.0,0,EverettZ/AlgoDS,Java,AlgoDS,2085059,"Repository(full_name=""EverettZ/AlgoDS"")",87836112,0,2017-05-14 18:00:19.066756,https://github.com/EverettZ/AlgoDS,1
2,2016-12-30 13:22:59,A Wysiwyg editor build on top of ReactJS and D...,True,67361765.0,1,SHOFLO/react-draft-wysiwyg,JavaScript,react-draft-wysiwyg,2453842,"Repository(full_name=""SHOFLO/react-draft-wysiw...",77687579,0,2017-05-14 19:07:46.407355,https://github.com/SHOFLO/react-draft-wysiwyg,6
3,2016-07-28 19:00:20,Electron App around PokemonGo-Map,True,63730796.0,0,Flapfc/PokemonGo-DesktopMap,Python,PokemonGo-DesktopMap,20688686,"Repository(full_name=""Flapfc/PokemonGo-Desktop...",64420851,0,2017-05-14 19:05:07.357477,https://github.com/Flapfc/PokemonGo-DesktopMap,0
4,2016-05-24 15:43:51,"Spectre.css - a lightweight, responsive and mo...",True,53321815.0,0,hawkapparel/spectre,HTML,spectre,8813795,"Repository(full_name=""hawkapparel/spectre"")",59586443,0,2017-05-15 01:01:40.864103,https://github.com/hawkapparel/spectre,1


In [233]:
# save the repo data for reference
# pickle.dump( repos_ds, open( "repos_ds.p", "wb" ) )

### Accidentally added a couple of repos not in English; Remove
(geeeeeeeeek/electronic-wechat)  
(smallnest/rpcx)  
(ashqal/MD360Player4iOS)  

Noticed this while requesting repo data, specifically the issue events are not in English

In [2]:
repos_ds = pickle.load( open( "repos_ds.p", "rb" ) )
repos_table = pd.DataFrame(repos_ds.values())

In [3]:
rep1 = repos_table['full_name'] == 'geeeeeeeeek/electronic-wechat'
rep2 = repos_table['full_name'] == 'smallnest/rpcx'
rep3 = repos_table['full_name'] == 'ashqal/MD360Player4iOS'

repos_table[rep1 | rep2 | rep3]

Unnamed: 0,created,description,fork,forked_from,forks,full_name,language,name,owner_id,repo,repo_id,stargazers,time_requested,url,watchers
1321,2016-05-18 09:34:05,A RPC service framework based on net/rpc like ...,False,,196,smallnest/rpcx,Go,rpcx,865763,"Repository(full_name=""smallnest/rpcx"")",59101986,1051,2017-05-14 20:22:11.810506,https://github.com/smallnest/rpcx,124
8068,2016-02-18 09:09:53,:speech_balloon: A better WeChat on macOS and ...,False,,1208,geeeeeeeeek/electronic-wechat,JavaScript,electronic-wechat,7262715,"Repository(full_name=""geeeeeeeeek/electronic-w...",51994692,6996,2017-05-14 19:53:35.968902,https://github.com/geeeeeeeeek/electronic-wechat,369
20001,2016-04-13 16:49:27,It is a lite library to render 360 degree pano...,False,,154,ashqal/MD360Player4iOS,Objective-C,MD360Player4iOS,5126517,"Repository(full_name=""ashqal/MD360Player4iOS"")",56169643,583,2017-05-14 18:04:55.539736,https://github.com/ashqal/MD360Player4iOS,48


In [5]:
cond1 = repos_table['repo_id'] == 51994692
cond2 = repos_table['forked_from'] == 51994692

cond3 = repos_table['repo_id'] == 59101986
cond4 = repos_table['forked_from'] == 59101986

cond5 = repos_table['repo_id'] == 56169643
cond6 = repos_table['forked_from'] == 56169643

repos_table = repos_table[~cond1 & ~cond2 & ~cond3 & ~cond4 & ~cond5 & ~cond6]

In [6]:
repos_table.shape

(23851, 15)

### Output a csv for using in BigQuery to retrieve all events for all repos

In [8]:
# repos_output = repos_table[['full_name', 'repo_id']]
# repos_output.to_csv('repos_table.csv', index=False)

### Formatting repos data frame in a cleaner mode

In [7]:
# cant have a python class in SQL DB
repos_table = repos_table.drop('repo', axis=1)

# forked_from gives same result
repos_table = repos_table.drop('fork', axis=1)

# change to ascii characters
repos_table['description'] = repos_table['description'].str.encode('ascii', 'ignore')

In [8]:
# change names of some columns
repos_table = repos_table.rename(columns={'time_requested':'requested_at', 'forks':'fork_count',
                                         'stargazers':'stargazers_count', 'watchers':'watchers_count',
                                         'created': 'created_at'})

# reorder DF and sort index
reorder_cols = ['repo_id', 'owner_id', 'forked_from', 'created_at', 'requested_at',
               'url', 'name', 'full_name', 'description', 'language', 'fork_count', 'watchers_count',
               'stargazers_count']

repos_table = repos_table[reorder_cols]

In [9]:
repos_table.head()

Unnamed: 0,repo_id,owner_id,forked_from,created_at,requested_at,url,name,full_name,description,language,fork_count,watchers_count,stargazers_count
0,91012066,1266111,69798748.0,2017-05-11 18:38:08,2017-05-14 18:16:41.037042,https://github.com/glongh/js-stack-from-scratch,js-stack-from-scratch,glongh/js-stack-from-scratch,Step-by-step tutorial to build a modern JavaS...,JavaScript,0,1,0
1,87836112,2085059,76670734.0,2017-04-10 17:00:42,2017-05-14 18:00:19.066756,https://github.com/EverettZ/AlgoDS,AlgoDS,EverettZ/AlgoDS,Implementation of Algorithms and Data Structur...,Java,0,1,0
2,77687579,2453842,67361765.0,2016-12-30 13:22:59,2017-05-14 19:07:46.407355,https://github.com/SHOFLO/react-draft-wysiwyg,react-draft-wysiwyg,SHOFLO/react-draft-wysiwyg,A Wysiwyg editor build on top of ReactJS and D...,JavaScript,1,6,0
3,64420851,20688686,63730796.0,2016-07-28 19:00:20,2017-05-14 19:05:07.357477,https://github.com/Flapfc/PokemonGo-DesktopMap,PokemonGo-DesktopMap,Flapfc/PokemonGo-DesktopMap,Electron App around PokemonGo-Map,Python,0,0,0
4,59586443,8813795,53321815.0,2016-05-24 15:43:51,2017-05-15 01:01:40.864103,https://github.com/hawkapparel/spectre,spectre,hawkapparel/spectre,"Spectre.css - a lightweight, responsive and mo...",HTML,0,1,0


## Requesting contributors for all repos that are to date

In [5]:
# all the repos to loop over
repos_ids = repos_table['repo'].tolist()

# contributors_ds = {}
contributors_ds = pickle.load( open( "contributors_ds.p", "rb" ) )

# repos where we could not get info from
L_badrepos = []

In [None]:
count = 0

for repo in repos_ids:    
    count += 1
    print count
    
    # some forked repos may not exist anymore
    try:
        for user in repo.get_contributors():
            
            key = (repo.id, user.id)
            
            if g.get_rate_limit().rate.remaining < 2:
                print 'Going to pause for 30 mins'
                time.sleep(1800)

            if key not in contributors_ds:
                contributors_ds[key] = {'user_id':user.id, 'login':user.login, 
                                         'repo_id':repo.id, 'requested_at':datetime.now()}
            else:
                pass
    except:
        L_badrepos.append(repo)

In [11]:
# save the contributors data structure for reference
# pickle.dump( contributors_ds, open( "contributors_ds.p", "wb" ) )

### Reading in contributors data structure

In [10]:
repos_ids = repos_table['repo_id'].tolist()

contributors_ds = pickle.load( open( "contributors_ds.p", "rb" ) )

contributors_table = pd.DataFrame(contributors_ds.values())
print len(contributors_table)
contributors_table.head()

301692


Unnamed: 0,login,repo_id,requested_at,user_id
0,ralphite,87380276,2017-05-18 09:12:47.088095,3046804
1,DaveVoyles,78902398,2017-05-17 11:37:55.193568,1786053
2,meain,82225757,2017-05-18 04:29:37.883319,14259816
3,pguth,73213893,2017-05-17 22:20:47.311123,85259
4,jsarafajr,73811823,2017-05-17 20:16:48.401585,4338228


In [13]:
# get all the correct repo ids
contributors_table = contributors_table[contributors_table.repo_id.isin(repos_ids)]

### Formatting contributors data frame in a cleaner mode

In [14]:
# change names of some columns
contributors_table = contributors_table.rename(columns={'user_id':'contributor_id'})

# reorder DF and sort index
reorder_cols = ['repo_id', 'contributor_id', 'login', 'requested_at']

contributors_table = contributors_table[reorder_cols]

In [15]:
contributors_table.head()

Unnamed: 0,repo_id,contributor_id,login,requested_at
0,87380276,3046804,ralphite,2017-05-18 09:12:47.088095
1,78902398,1786053,DaveVoyles,2017-05-17 11:37:55.193568
2,82225757,14259816,meain,2017-05-18 04:29:37.883319
3,73213893,85259,pguth,2017-05-17 22:20:47.311123
4,73811823,4338228,jsarafajr,2017-05-17 20:16:48.401585


## Inserting repo and contributors table into a database

In [17]:
repos_table.head()

Unnamed: 0,repo_id,owner_id,forked_from,created_at,requested_at,url,name,full_name,description,language,fork_count,watchers_count,stargazers_count
0,91012066,1266111,69798748.0,2017-05-11 18:38:08,2017-05-14 18:16:41.037042,https://github.com/glongh/js-stack-from-scratch,js-stack-from-scratch,glongh/js-stack-from-scratch,Step-by-step tutorial to build a modern JavaS...,JavaScript,0,1,0
1,87836112,2085059,76670734.0,2017-04-10 17:00:42,2017-05-14 18:00:19.066756,https://github.com/EverettZ/AlgoDS,AlgoDS,EverettZ/AlgoDS,Implementation of Algorithms and Data Structur...,Java,0,1,0
2,77687579,2453842,67361765.0,2016-12-30 13:22:59,2017-05-14 19:07:46.407355,https://github.com/SHOFLO/react-draft-wysiwyg,react-draft-wysiwyg,SHOFLO/react-draft-wysiwyg,A Wysiwyg editor build on top of ReactJS and D...,JavaScript,1,6,0
3,64420851,20688686,63730796.0,2016-07-28 19:00:20,2017-05-14 19:05:07.357477,https://github.com/Flapfc/PokemonGo-DesktopMap,PokemonGo-DesktopMap,Flapfc/PokemonGo-DesktopMap,Electron App around PokemonGo-Map,Python,0,0,0
4,59586443,8813795,53321815.0,2016-05-24 15:43:51,2017-05-15 01:01:40.864103,https://github.com/hawkapparel/spectre,spectre,hawkapparel/spectre,"Spectre.css - a lightweight, responsive and mo...",HTML,0,1,0


In [18]:
repos_cols = ['repo_id', 'owner_id', 'forked_from', 'fork_count', 'watchers_count', 'stargazers_count']
repos_table[repos_cols] = repos_table[repos_cols].applymap(pd.to_numeric)

In [16]:
contributors_table.head()

Unnamed: 0,repo_id,contributor_id,login,requested_at
0,87380276,3046804,ralphite,2017-05-18 09:12:47.088095
1,78902398,1786053,DaveVoyles,2017-05-17 11:37:55.193568
2,82225757,14259816,meain,2017-05-18 04:29:37.883319
3,73213893,85259,pguth,2017-05-17 22:20:47.311123
4,73811823,4338228,jsarafajr,2017-05-17 20:16:48.401585


In [20]:
contributors_cols = ['repo_id', 'contributor_id']
contributors_table[contributors_cols] = contributors_table[contributors_cols].applymap(pd.to_numeric)

In [21]:
# create a SQLite DB
con = sql.connect('github.db')
c = con.cursor()

In [22]:
create_repos_table = '''
CREATE TABLE repos (
    repo_id          NUMERIC      NOT NULL,
    owner_id         NUMERIC      NOT NULL,
    forked_from      NUMERIC,
    created_at       TEXT         NOT NULL,
    requested_at     TEXT         NOT NULL,
    url              VARCHAR (60) NOT NULL,
    name             VARCHAR (40) NOT NULL,
    full_name        VARCHAR (50) NOT NULL,
    description      TEXT,
    language         VARCHAR (40),
    fork_count       INT,
    watchers_count   INT,
    stargazers_count INT,
    PRIMARY KEY (
        repo_id,
        owner_id
    )
)
WITHOUT ROWID;'''

create_contributors_table = '''
CREATE TABLE contributors (
    repo_id        NUMERIC      NOT NULL,
    contributor_id NUMERIC      NOT NULL,
    login          VARCHAR (40) NOT NULL,
    requested_at   TEXT         NOT NULL,
    PRIMARY KEY (
        repo_id,
        contributor_id
    )
)
WITHOUT ROWID;'''

In [23]:
c.execute(create_repos_table)
c.execute(create_contributors_table)
con.commit()

In [24]:
repos_table.to_sql(name='repos', con=con, if_exists='append', index=False)
contributors_table.to_sql(name='contributors', con=con, if_exists='append', index=False)
con.close()