In [1]:
import numpy as np
import pandas as pd
import requests as r
import math

In [2]:
TOKEN = 'your_personal_access_token'

In [3]:
BASE_URL = "https://api.github.com"
HEADERS = {
    'Accept': 'application/vnd.github+json',
    'Authorization': f'Bearer {TOKEN}'
}

In [4]:
dublin_users = []
dublic_users_url = f"{BASE_URL}/search/users?q=location:Dublin+followers:>50"
total_dublin_users = 0
page = 1
per_page = 30

dublic_users_resp = r.get(dublic_users_url, headers=HEADERS)

if dublic_users_resp.status_code == 200:
  users = dublic_users_resp.json()
  dublin_users.extend(users['items'])
  if users['total_count'] > per_page:
    total_dublin_users = users['total_count']
    no_pages = math.ceil(total_dublin_users / per_page)
    page += 1

    while no_pages >= page:
      dublin_users_paginated_url = f"{dublic_users_url}&per_page={per_page}&page={page}"
      dublic_users_resp = r.get(dublin_users_paginated_url, headers=HEADERS)

      if dublic_users_resp.status_code == 200:
        users = dublic_users_resp.json()
        dublin_users.extend(users['items'])
      else:
        print(dublic_users_resp.status_code)
        break
      page += 1
else:
  print(dublic_users_resp.status_code)

In [8]:
dublin_users_url_df = pd.DataFrame(dublin_users, columns=['login', 'url'])

In [9]:
dublin_users_url_df.to_csv('dublin_users_url.csv')

In [10]:
users_data = []
users_repos = {}

for user_url in dublin_users_url_df['url']:
  resp = r.get(user_url, headers=HEADERS)

  if resp.status_code == 200:
    user_data = resp.json()
    users_data.append(user_data)
    page = 1
    per_page = 30
    total_public_repos = user_data['public_repos']
    if user_data['public_repos'] > 500:
      total_public_repos = 500
    no_pages = math.ceil(total_public_repos/per_page)

    while no_pages >= page:
      resp = r.get(f"{user_data['repos_url']}?sort=created_at&per_page={per_page}&page={page}", headers=HEADERS)

      if resp.status_code == 200:
        user_repos_data = resp.json()
        users_repos[user_data['login']] = user_repos_data
      else:
        print(resp.status_code)
        break
      page += 1
  else:
    print(resp.status_code)
    break

In [11]:
len(users_data)

477

In [111]:
len(users_repos)

470

In [122]:
users_df = pd.DataFrame(users_data, columns=['login', 'name', 'company', 'location', 'email', 'hireable', 'bio', 'public_repos', 'followers', 'following', 'created_at'])

In [112]:
for user in users_df['login']:
  if user not in users_repos:
    print(r.get(f"https://api.github.com/users/{user}", headers=HEADERS).json().get('public_repos'))

0
0
0
0
0
0
0


In [114]:
users_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,orta,Orta Therox,,Huddersfield / NYC / Dublin / Rio de Janeiro,git@orta.io,,Tech on @puzzmo-com \r\n\r\nEx-TypeScript. Con...,952,6017,109,2009-01-24T20:40:31Z
1,jeromeetienne,Jerome Etienne,Making WebAR a reality,"Dublin, Ireland",jerome.etienne@gmail.com,True,Making WebAR a reality - Around Javascript and...,301,2780,15,2010-04-26T11:58:29Z
2,jonataslaw,Jonny Borges,Iris,"Dublin, Ireland",,True,VP of Engineering from Iris Finance.\r\nDevelo...,272,2692,17,2018-01-23T19:17:15Z
3,steventroughtonsmith,Steven Troughton-Smith,High Caffeine Content,"Dublin, Ireland",,,,98,1984,4,2009-01-08T23:51:31Z
4,axic,Alex Beregszaszi,@ethereum @ipsilon @spearbit @ethereumjs,"Dublin, Ireland",,True,Works on decentralised stuff.\r\n\r\neipnft:ax...,143,1807,5,2008-08-11T23:38:10Z


In [124]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477 entries, 0 to 476
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   login         477 non-null    object
 1   name          470 non-null    object
 2   company       315 non-null    object
 3   location      477 non-null    object
 4   email         235 non-null    object
 5   hireable      182 non-null    object
 6   bio           325 non-null    object
 7   public_repos  477 non-null    int64 
 8   followers     477 non-null    int64 
 9   following     477 non-null    int64 
 10  created_at    477 non-null    object
dtypes: int64(3), object(8)
memory usage: 41.1+ KB


In [125]:
users_df['created_at'] = pd.to_datetime(users_df['created_at'], format='%Y-%m-%dT%H:%M:%SZ')

In [126]:
users_df = users_df.fillna({
    'name': '',
    'company': '',
    'email': '',
    'hireable': '',
    'bio': ''
})

In [127]:
users_df['hireable'] = users_df['hireable'].replace({True: 'true', False: 'false'})

In [128]:
users_df['company'] = users_df['company'].apply(lambda x : x.strip().lstrip('@').upper())

In [129]:
users_df['bio'] = users_df['bio'].apply(lambda x: x.strip())

In [317]:
users_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at,leader_strength
0,orta,Orta Therox,,Huddersfield / NYC / Dublin / Rio de Janeiro,git@orta.io,,Tech on @puzzmo-com \r\n\r\nEx-TypeScript. Con...,952,6017,109,2009-01-24 20:40:31,54.7
1,jeromeetienne,Jerome Etienne,MAKING WEBAR A REALITY,"Dublin, Ireland",jerome.etienne@gmail.com,True,Making WebAR a reality - Around Javascript and...,301,2780,15,2010-04-26 11:58:29,173.75
2,jonataslaw,Jonny Borges,IRIS,"Dublin, Ireland",,True,VP of Engineering from Iris Finance.\r\nDevelo...,272,2692,17,2018-01-23 19:17:15,149.555556
3,steventroughtonsmith,Steven Troughton-Smith,HIGH CAFFEINE CONTENT,"Dublin, Ireland",,,,98,1984,4,2009-01-08 23:51:31,396.8
4,axic,Alex Beregszaszi,ETHEREUM @IPSILON @SPEARBIT @ETHEREUMJS,"Dublin, Ireland",,True,Works on decentralised stuff.\r\n\r\neipnft:ax...,143,1807,5,2008-08-11 23:38:10,301.166667


In [131]:
users_df.to_csv('users.csv', index=False)

In [51]:
all_repos = []
for user in users_repos:
  for repo in users_repos[user]:
    repo_dict = {}
    repo_dict['login'] = repo.get('owner').get('login')
    repo_dict['full_name'] = repo.get('full_name')
    repo_dict['created_at'] = repo.get('created_at')
    repo_dict['stargazers_count'] = repo.get('stargazers_count')
    repo_dict['watchers_count'] = repo.get('watchers_count')
    repo_dict['language'] = repo.get('language')
    repo_dict['has_projects'] = repo.get('has_projects')
    repo_dict['has_wiki'] = repo.get('has_wiki')
    if repo.get('license') is None:
      repo_dict['license_name'] = None
    else:
      repo_dict['license_name'] = repo.get('license').get('name')

    all_repos.append(repo_dict)

In [48]:
len(all_repos)

7208

In [49]:
total = 0
for user in users_repos:
  total += len(users_repos[user])

print(total)

7208


In [132]:
repos_df = pd.DataFrame(all_repos)

In [133]:
repos_df.shape

(7208, 9)

In [134]:
repos_df.columns

Index(['login', 'full_name', 'created_at', 'stargazers_count',
       'watchers_count', 'language', 'has_projects', 'has_wiki',
       'license_name'],
      dtype='object')

In [135]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'], format='%Y-%m-%dT%H:%M:%SZ')

In [136]:
repos_df = repos_df.fillna({
    'language': '',
    'license_name': ''
})

In [137]:
repos_df['has_projects'] = repos_df['has_projects'].map({True: 'true', False: 'false'})

In [138]:
repos_df['has_wiki'] = repos_df['has_wiki'].map({True: 'true', False: 'false'})

In [139]:
repos_df

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,orta,orta/typedoc,2017-04-08 17:00:09,0,0,HTML,true,false,Apache License 2.0
1,orta,orta/XVim,2017-04-07 15:25:16,0,0,Objective-C,true,true,MIT License
2,orta,orta/jekyll-lunr-js-search,2017-04-05 07:02:28,0,0,JavaScript,true,true,MIT License
3,orta,orta/coffeelinter,2017-04-04 10:58:30,0,0,TypeScript,true,true,
4,orta,orta/json2ts,2017-03-30 10:46:49,1,1,TypeScript,true,true,MIT License
...,...,...,...,...,...,...,...,...,...
7203,nakitamccool,nakitamccool/Travel-destinations-app,2017-02-03 11:10:36,0,0,TypeScript,true,true,
7204,nakitamccool,nakitamccool/Positioning-and-CSS,2017-02-03 10:31:24,0,0,HTML,true,true,
7205,nakitamccool,nakitamccool/BoxModel2,2017-02-03 10:22:20,0,0,HTML,true,true,
7206,nakitamccool,nakitamccool/BoxModel1,2017-02-03 10:16:10,0,0,HTML,true,true,


In [197]:
repos_df['has_projects'] = repos_df['has_projects'].astype(str)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(str)

In [198]:
repos_df.to_csv('repositories.csv', index=False)

### 1. Who are the top 5 users in Dublin with the highest number of followers? List their login in order, comma-separated.

In [149]:
users_df.sort_values(by='followers', ascending=False).head(5)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,orta,Orta Therox,,Huddersfield / NYC / Dublin / Rio de Janeiro,git@orta.io,,Tech on @puzzmo-com \r\n\r\nEx-TypeScript. Con...,952,6017,109,2009-01-24 20:40:31
1,jeromeetienne,Jerome Etienne,MAKING WEBAR A REALITY,"Dublin, Ireland",jerome.etienne@gmail.com,True,Making WebAR a reality - Around Javascript and...,301,2780,15,2010-04-26 11:58:29
2,jonataslaw,Jonny Borges,IRIS,"Dublin, Ireland",,True,VP of Engineering from Iris Finance.\r\nDevelo...,272,2692,17,2018-01-23 19:17:15
3,steventroughtonsmith,Steven Troughton-Smith,HIGH CAFFEINE CONTENT,"Dublin, Ireland",,,,98,1984,4,2009-01-08 23:51:31
4,axic,Alex Beregszaszi,ETHEREUM @IPSILON @SPEARBIT @ETHEREUMJS,"Dublin, Ireland",,True,Works on decentralised stuff.\r\n\r\neipnft:ax...,143,1807,5,2008-08-11 23:38:10


### 2. Who are the 5 earliest registered GitHub users in Dublin? List their login in ascending order of created_at, comma-separated.

In [151]:
users_df.sort_values(by='created_at', ascending=True).head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
75,paulca,Paul Campbell,TITO,"Dublin, Ireland",paul@rslw.com,,Founder of @teamtito,106,213,20,2008-02-27 22:50:56
447,adrian,Adrian Smith,WORKDAY,"Dublin, Ireland",adrian@17od.com,,,52,54,10,2008-02-29 20:03:31
85,GavinJoyce,Gavin Joyce,VIDU,"Dublin, Ireland",,,"Building vidu.io ✨. Previously, engineering In...",187,191,8,2008-03-07 17:27:50
74,amir,Amir M. Saeid,,"Dublin, Ireland",amirsaied@gmail.com,True,,82,220,911,2008-04-02 18:46:51
439,ciaranlee,Ciaran Lee,INTERCOM.IO,"Dublin, Ireland",,,,22,55,5,2008-04-04 18:46:06


In [152]:
users_df.sort_values(by='created_at', ascending=True).head()['login'].unique()

array(['paulca', 'adrian', 'GavinJoyce', 'amir', 'ciaranlee'],
      dtype=object)

### 3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [283]:
repos_df['license_name'].value_counts().head()

Unnamed: 0_level_0,count
license_name,Unnamed: 1_level_1
,4079
MIT License,1522
Other,484
Apache License 2.0,414
GNU General Public License v3.0,264


### 4. Which company do the majority of these developers work at?

In [316]:
users_df['company'].value_counts()

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
,162
MICROSOFT,18
AWS,18
GOOGLE,16
AMAZON,8
...,...
ADAPT CENTRE,1
MECHANICAL ORCHARD,1
DORSET COLLEGE,1
TOMTOM,1


### 5. Which programming language is most popular among these users?

In [159]:
repos_df['language'].value_counts()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
,1209
JavaScript,1114
Python,911
Java,533
HTML,353
...,...
Solidity,1
Cairo,1
Mustache,1
WebAssembly,1


### 6. Which programming language is the second most popular among users who joined after 2020?

In [None]:
repos_df.loc[repos_df['login'].isin(users_df.loc[users_df['created_at'] > '2020-01-01', 'login']), 'language'].value_counts()

### 7. Which language has the highest average number of stars per repository?

In [165]:
repos_df.groupby('language')['stargazers_count'].mean().sort_values(ascending=False)

Unnamed: 0_level_0,stargazers_count
language,Unnamed: 1_level_1
MDX,13042.000000
CMake,2480.800000
Vim Script,189.000000
TypeScript,176.348259
C,100.669421
...,...
Lex,0.000000
Scilab,0.000000
Smali,0.000000
Solidity,0.000000


### 8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated

In [167]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

In [169]:
list(users_df.sort_values(by='leader_strength', ascending=False).head()['login'].unique())

['flaviohenriquealmeida', 'zalando', 'AnikSarker', 'wix', 'CardinalHealth']

In [170]:
users_df['company'].value_counts()

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
,162
MICROSOFT,18
AWS,18
GOOGLE,16
AMAZON,8
...,...
ADAPT CENTRE,1
MECHANICAL ORCHARD,1
DORSET COLLEGE,1
TOMTOM,1


### 9. What is the correlation between the number of followers and the number of public repositories among users in Dublin?

In [172]:
users_df['followers'].corr(users_df['public_repos'], method='pearson')

0.5547995009959515

### 10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [174]:
from sklearn.linear_model import LinearRegression

In [175]:
lr = LinearRegression()

In [176]:
lr.fit(users_df[['public_repos']], users_df['followers'])

In [177]:
lr.coef_

array([2.82481832])

In [None]:
# 11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [199]:
temp_df = repos_df[(repos_df['has_projects'] == 'true') & (repos_df['has_wiki'] == 'true')]

In [200]:
temp_df

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
1,orta,orta/XVim,2017-04-07 15:25:16,0,0,Objective-C,true,true,MIT License
2,orta,orta/jekyll-lunr-js-search,2017-04-05 07:02:28,0,0,JavaScript,true,true,MIT License
3,orta,orta/coffeelinter,2017-04-04 10:58:30,0,0,TypeScript,true,true,
4,orta,orta/json2ts,2017-03-30 10:46:49,1,1,TypeScript,true,true,MIT License
6,orta,orta/slack-api,2017-03-29 12:39:19,0,0,JavaScript,true,true,MIT License
...,...,...,...,...,...,...,...,...,...
7203,nakitamccool,nakitamccool/Travel-destinations-app,2017-02-03 11:10:36,0,0,TypeScript,true,true,
7204,nakitamccool,nakitamccool/Positioning-and-CSS,2017-02-03 10:31:24,0,0,HTML,true,true,
7205,nakitamccool,nakitamccool/BoxModel2,2017-02-03 10:22:20,0,0,HTML,true,true,
7206,nakitamccool,nakitamccool/BoxModel1,2017-02-03 10:16:10,0,0,HTML,true,true,


In [201]:
temp_df['has_projects'].corr(temp_df['has_wiki'])

ValueError: could not convert string to float: 'true'

### 12. Do hireable users follow more people than those who are not hireable?

In [270]:
users_df.loc[users_df['hireable'] != 'true', 'following'].mean()

65.52881355932203

In [271]:
users_df.loc[users_df['hireable'] == 'true', 'following'].mean()

112.81318681318682

In [272]:
112.813186813186822 - 65.52881355932203

47.284373253864786

### 13. Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)?

In [216]:
lr.fit(users_df.loc[users_df['bio'] != '', 'bio'].str.strip().str.split().str.len().to_frame(), users_df.loc[users_df['bio'] != '', 'followers'].to_frame())

In [217]:
lr.coef_

array([[7.23892787]])

In [215]:
users_df.loc[users_df['bio'] != '', 'bio'].str.strip().str.split().str.len().to_frame()

Unnamed: 0,bio
0,16
1,22
2,15
4,5
5,13
...,...
470,5
471,22
472,7
474,8


In [222]:
repos_df['weekday'] = repos_df['created_at'].apply(lambda x: x.weekday())

In [282]:
repos_df.loc[repos_df['weekday'].isin([5, 6]), 'login'].value_counts().head(12)

Unnamed: 0_level_0,count
login,Unnamed: 1_level_1
hmikihth,24
joshuacassidy,18
10xOXR,16
yumin-chen,15
sakrist,15
SibeeshVenu,15
ocanty,15
gdyrrahitis,13
prakhargurawa,13
nhtlongcs,13


### 15. Do people who are hireable share their email addresses more often?

In [289]:
users_df[(users_df['email'] != '') & (users_df['hireable'] == 'true')].shape[0] / users_df.shape[0]

0.2138364779874214

In [288]:
users_df[(users_df['email'] != '') & (users_df['hireable'] != 'true')].shape[0] / users_df.shape[0]

0.27882599580712786

In [292]:
round(0.2138364779874214 - 0.27882599580712786, ndigits=3)

-0.065

### 16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [269]:
users_df['name'].apply(lambda x : x.split()[1] if len(x.split()) == 2 else None).value_counts().head()

Unnamed: 0_level_0,count
name,Unnamed: 1_level_1
Quinn,3
Chen,3
O'Sullivan,3
Kenny,3
Müller,2
