In [1]:
import csv
import os
import time

import requests

# GitHub API token: read from the environment so the secret is never stored in
# the notebook. Set GITHUB_TOKEN before starting the kernel; when it is unset,
# no Authorization header is sent and requests go out unauthenticated.
# NOTE(review): the token that was previously hard-coded here should be treated
# as leaked and revoked.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

# Helper function to clean up company names
def clean_company_name(company):
    """Normalise a company string: trim whitespace, drop leading '@', upper-case.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not company:
        return company
    trimmed = company.strip()
    without_at = trimmed.lstrip('@')
    return without_at.upper()

# Function to fetch users from the GitHub API
def fetch_users(city="Mumbai", min_followers=51):
    """Collect profile details for users matching a location/follower search.

    Pages through the GitHub user-search endpoint (100 results per page) and,
    for every hit, requests the full user record from the hit's ``url``.

    Args:
        city: Value used for the ``location:`` search qualifier.
        min_followers: Users must have strictly more followers than this.

    Returns:
        A list of dicts with the keys login, name, company (cleaned with
        ``clean_company_name``), location, email, hireable, bio, public_repos,
        followers, following and created_at.

    Raises:
        requests.HTTPError: If a request fails (for example when rate limited),
            instead of silently returning a truncated list.
    """
    per_page = 100
    users = []
    page = 1

    while True:
        # Let requests build and encode the query string.
        response = requests.get(
            "https://api.github.com/search/users",
            headers=HEADERS,
            params={
                "q": f"location:{city} followers:>{min_followers}",
                "page": page,
                "per_page": per_page,
            },
            timeout=30,
        )

        # NOTE(review): the search API is documented to expose only a capped
        # number of results and to reject pages past that cap with 422; treat
        # that as "no more results" rather than a failure - confirm.
        if response.status_code == 422:
            break
        response.raise_for_status()

        items = response.json().get('items') or []

        # Break if no more results
        if not items:
            break

        for user in items:
            # Get full user info
            user_response = requests.get(user['url'], headers=HEADERS, timeout=30)
            user_response.raise_for_status()
            user_data = user_response.json()

            # Extract required fields
            users.append({
                'login': user_data['login'],
                'name': user_data['name'],
                'company': clean_company_name(user_data['company']),
                'location': user_data['location'],
                'email': user_data['email'],
                'hireable': user_data['hireable'],
                'bio': user_data['bio'],
                'public_repos': user_data['public_repos'],
                'followers': user_data['followers'],
                'following': user_data['following'],
                'created_at': user_data['created_at'],
            })

        # A short page is the last page; skip the extra empty request.
        if len(items) < per_page:
            break

        page += 1
        time.sleep(1)  # Avoid hitting API rate limits

    return users

# Function to fetch repositories for a user
def fetch_repositories(user_login):
    """Collect repository details for one user, following pagination.

    Args:
        user_login: GitHub login whose repositories are listed.

    Returns:
        A list of dicts with the keys login, full_name, created_at,
        stargazers_count, watchers_count, language, has_projects, has_wiki and
        license_name (the license ``key``, or ``None`` when there is no license).

    Raises:
        requests.HTTPError: If a request fails. Without this check an error
            body (a dict) would be iterated as if it were the repository list.
    """
    per_page = 100
    repositories = []
    page = 1

    while True:
        response = requests.get(
            f"https://api.github.com/users/{user_login}/repos",
            headers=HEADERS,
            params={"per_page": per_page, "page": page},
            timeout=30,
        )
        response.raise_for_status()
        repo_data = response.json()

        # Break if no more repositories
        if not repo_data:
            break

        for repo in repo_data:
            license_info = repo['license']
            repositories.append({
                'login': user_login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': license_info['key'] if license_info else None,
            })

        # A page that is not full is the last page.
        if len(repo_data) < per_page:
            break

        page += 1  # Move to the next page
        time.sleep(1)  # Avoid hitting API rate limits

    return repositories

# Save users to CSV
def save_users_to_csv(users, filename="users.csv"):
    """Write user records to a UTF-8 CSV file with a header row.

    Args:
        users: List of dicts sharing the same keys; the first record's keys
            define the column order.
        filename: Destination path; an existing file is overwritten.

    An empty ``users`` list no longer raises ``IndexError``: a header-only file
    is written using the columns that ``fetch_users`` produces.
    """
    default_columns = [
        'login', 'name', 'company', 'location', 'email', 'hireable', 'bio',
        'public_repos', 'followers', 'following', 'created_at',
    ]
    fieldnames = list(users[0].keys()) if users else default_columns

    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(users)

# Save repositories to CSV
def save_repositories_to_csv(repositories, filename="repositories.csv"):
    """Write repository records to a UTF-8 CSV file with a header row.

    Args:
        repositories: List of dicts sharing the same keys; the first record's
            keys define the column order.
        filename: Destination path; an existing file is overwritten.

    An empty ``repositories`` list no longer raises ``IndexError``: a
    header-only file is written using the columns that ``fetch_repositories``
    produces.
    """
    default_columns = [
        'login', 'full_name', 'created_at', 'stargazers_count',
        'watchers_count', 'language', 'has_projects', 'has_wiki',
        'license_name',
    ]
    fieldnames = list(repositories[0].keys()) if repositories else default_columns

    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(repositories)

def main():
    """Run the full scrape: save users first, then every user's repositories."""
    print("Fetching users...")
    user_rows = fetch_users()
    save_users_to_csv(user_rows)
    print(f"Saved {len(user_rows)} users to users.csv")

    print("Fetching repositories...")
    repo_rows = []
    for login in (row["login"] for row in user_rows):
        fetched = fetch_repositories(login)
        repo_rows += fetched
        print(f"Fetched {len(fetched)} repositories for user {login}")

    save_repositories_to_csv(repo_rows)
    print(f"Saved {len(repo_rows)} repositories to repositories.csv")

# Start the scrape only when this code runs as the main module (a script or a
# notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Fetching users...
Saved 700 users to users.csv
Fetching repositories...
Fetched 66 repositories for user ValentineFernandes
Fetched 37 repositories for user kovidgoyal
Fetched 113 repositories for user slidenerd
Fetched 88 repositories for user aryashah2k
Fetched 11 repositories for user coding-parrot
Fetched 9 repositories for user gkcs
Fetched 53 repositories for user darshilparmar
Fetched 330 repositories for user Kushal334
Fetched 77 repositories for user ritz078
Fetched 144 repositories for user PrasoonPratham
Fetched 35 repositories for user rmehta
Fetched 69 repositories for user LakshyaDuhoonISU
Fetched 61 repositories for user dmalvia
Fetched 41 repositories for user PiyushKumarSingh-90
Fetched 89 repositories for user mfaisalkhatri
Fetched 34 repositories for user omsandippatil
Fetched 17 repositories for user anujvyas
Fetched 89 repositories for user jalajthanaki
Fetched 81 repositories for user alisolanki
Fetched 30 repositories for user Sahil4883
Fetched 65 repositories fo

In [8]:
import pandas as pd # Import the pandas library and assign it the alias 'pd'

# Load the scraped user table.
users = pd.read_csv('users.csv')

# Missing `hireable` values become False. Going through the nullable 'boolean'
# dtype first avoids calling fillna() on an object column, which is what
# produced the warning echoed in this cell's saved output.
users['hireable'] = users['hireable'].astype('boolean').fillna(False).astype(bool)

# Five most-followed users, highest first.
top5 = users.sort_values(by='followers', ascending=False).head(5)
print(','.join(top5['login'].tolist()))

ValentineFernandes,kovidgoyal,slidenerd,aryashah2k,coding-parrot


  users['hireable'] = users['hireable'].fillna(False).astype(bool)


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Parse the registration timestamps, then list the five earliest accounts.
users['created_at'] = pd.to_datetime(users['created_at'])
earliest_logins = users.sort_values(by='created_at')['login'].head().tolist()
print(','.join(earliest_logins))

ivank,sandeepshetty,svs,nitinhayaran,nischal


In [9]:
# Load the scraped repository table and preview its first five rows.
repos = pd.read_csv('repositories.csv')
repos.head(5)

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,ValentineFernandes,ValentineFernandes/Age-Calculator-,2022-08-17T06:32:19Z,13,13,CSS,True,True,mit
1,ValentineFernandes,ValentineFernandes/ASP.NET-,2022-04-26T10:12:11Z,18,18,ASP.NET,True,True,
2,ValentineFernandes,ValentineFernandes/Assignment-4.2,2022-04-14T11:55:25Z,15,15,HTML,True,True,
3,ValentineFernandes,ValentineFernandes/Bank-Management-System,2022-04-24T16:24:17Z,26,26,C,True,True,
4,ValentineFernandes,ValentineFernandes/BMI-Calculator-Website,2022-08-17T04:47:27Z,11,11,HTML,True,True,mit


In [10]:
# Three most frequent licenses (value_counts sorts by count, descending).
license_counts = repos['license_name'].value_counts()
license_counts.iloc[:3]



Unnamed: 0_level_0,count
license_name,Unnamed: 1_level_1
mit,8793
apache-2.0,2088
other,1847


In [11]:
# Most frequent company (value_counts sorts by count, descending).
company_counts = users['company'].value_counts()
company_counts.iloc[:1]

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
MASAI SCHOOL,13


In [12]:
# Most frequent repository language (value_counts sorts by count, descending).
language_counts = repos['language'].value_counts()
language_counts.iloc[:1]


Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,8166


In [13]:
# Compare real timestamps instead of relying on whatever dtype `created_at`
# currently has (ISO-8601 text fresh from the CSV, or datetimes if an earlier
# cell already converted it), so the result does not depend on which cells
# were run first. Keeps users created strictly after 2020-01-01 00:00 UTC.
joined_at = pd.to_datetime(users['created_at'], utc=True)
users_after_2020 = users[joined_at > pd.Timestamp('2020-01-01', tz='UTC')]
users_after_2020.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,ValentineFernandes,Valentine Fernandes,,"Mumbai, India",,False,HTML | CSS | JS | SQL | MYSQL | JAVA,66,5251,5275,2022-01-29T08:11:37Z
3,aryashah2k,Arya Shah,OPENAOD,"Mumbai, India",,False,Computer Science Major | Machine Learning | So...,88,2601,2583,2020-11-03T03:06:19Z
4,coding-parrot,Gaurav Sen,INTERVIEWREADY,"Mumbai, India",,False,CEO of InterviewReady,11,2417,0,2020-01-03T14:13:35Z
7,Kushal334,Kushal Shingote,PRACTO TOKOPEDIA,"Mumbai, Maharashtra",kushalshingote2@gmail.com,False,Android Developer📱📱\r\niOS Apps📱📱\r\nSwift | X...,330,1308,9058,2020-06-07T07:21:19Z
11,LakshyaDuhoonISU,Lakshya Duhoon,,"Navi Mumbai, India",,False,B.Tech CSE student at ITM Skills University.\r...,69,891,1536,2023-08-21T06:05:19Z


In [14]:
# Repositories owned by the users selected above, then their top languages.
recent_logins = users_after_2020['login']
owned_by_recent = repos['login'].isin(recent_logins)
repos_2020 = repos[owned_by_recent]
repos_2020['language'].value_counts().head()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,3205
HTML,1261
Python,1061
Jupyter Notebook,756
TypeScript,637


In [15]:
# Mean star count per language, then the language with the highest mean.
avg_stars = repos['stargazers_count'].groupby(repos['language']).mean()
top_lang = avg_stars.idxmax()
top_stars = avg_stars.loc[top_lang]
print(top_lang, top_stars)

TSQL 571.4615384615385


In [16]:
# Leader strength = followers / (1 + following); the +1 keeps the divisor
# non-zero for users who follow nobody.
following_plus_one = users['following'] + 1
users['leader_strength'] = users['followers'] / following_plus_one
top5_lead = users.sort_values(by='leader_strength', ascending=False).head()
leader_logins = top5_lead['login'].tolist()
print(','.join(leader_logins))

kovidgoyal,coding-parrot,gkcs,slidenerd,dmalvia


In [17]:
# Correlation between follower count and number of public repositories.
followers_col = users['followers']
public_repos_col = users['public_repos']
correlation = followers_col.corr(public_repos_col)
correlation

0.03234365699357679

In [18]:
import csv

# Read (followers, public_repos) integer pairs straight from the CSV.
with open('users.csv', 'r', encoding='utf-8') as file:
    pairs = [
        (int(row['followers']), int(row['public_repos']))
        for row in csv.DictReader(file)
    ]
followers = [pair[0] for pair in pairs]
public_repos = [pair[1] for pair in pairs]

# Fit followers as a degree-1 polynomial of public_repos and report the slope;
# with fewer than two rows, print "Error" instead of fitting.
if len(pairs) <= 1:
    print("Error")
else:
    slope, intercept = np.polyfit(public_repos, followers, 1)

    print(f"{slope:.3f}")

0.095


In [19]:
# Normalise the two flag columns when pandas loaded them as `object`.
# Real booleans are kept as-is and text is matched case-insensitively; the
# previous lowercase-only mapping turned `True`/`False` values into NaN.
# Anything unrecognised (including missing values) becomes None.
_BOOL_TEXT = {'true': True, 'false': False}
for flag_column in ('has_projects', 'has_wiki'):
    if repos[flag_column].dtype == 'object':
        repos[flag_column] = repos[flag_column].map(
            lambda value: value if isinstance(value, bool)
            else _BOOL_TEXT.get(str(value).strip().lower())
        )

# Cast to float so the correlation also works on the normalised object columns
# (True -> 1.0, False -> 0.0, missing -> NaN).
correlation = repos['has_projects'].astype(float).corr(repos['has_wiki'].astype(float))

print(round(correlation, 3))

0.155


In [20]:
# Mean `following` for hireable vs. non-hireable users, and the gap between them.
hireable_mask = users['hireable'] == True
non_hireable_mask = users['hireable'] == False
hireable_avg_following = users.loc[hireable_mask, 'following'].mean()
non_hireable_avg_following = users.loc[non_hireable_mask, 'following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
difference

9.681847755367613

In [21]:
from sklearn.linear_model import LinearRegression

# Keep only users with a non-empty bio and record the bio's string length.
has_bio = users['bio'].notna() & (users['bio'] != '')
users_with_bio = users[has_bio].copy()
users_with_bio['bio_len'] = users_with_bio['bio'].str.len()

# Single-feature design matrix (n rows, 1 column) and the target.
X = users_with_bio[['bio_len']].to_numpy()
y = users_with_bio['followers']

# Slope of followers on bio length.
lr2 = LinearRegression().fit(X, y)
lr2.coef_[0]

-0.15803112208459824

In [22]:
import csv
from collections import Counter
from datetime import datetime

# Count, per login, the repositories whose creation timestamp falls on a
# Saturday or Sunday (weekday() 5 or 6).
weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        stamp = row.get('created_at', '')
        if not stamp:
            continue
        # Drop the final character before parsing, as fromisoformat is given
        # the timestamp without its trailing suffix.
        if datetime.fromisoformat(stamp[:-1]).weekday() >= 5:
            weekend_repo_counts[row['login']] += 1

top_users = weekend_repo_counts.most_common(5)
top_logins = [login for login, _count in top_users]

print(','.join(top_logins))

mohd14shoeb,vinod1988,Kushal334,patilswapnilv,alokproc


In [23]:
# Share of users with a non-missing email: hireable minus non-hireable.
has_email = users['email'].notna()
fraction_hierable = has_email[users['hireable'] == True].mean()
fraction_non_hierable = has_email[users['hireable'] == False].mean()
diff = fraction_hierable - fraction_non_hierable
diff

0.21461721969204073

In [24]:
# Surname = last whitespace-separated token of the name; report every surname
# tied for the highest count, in sorted order.
new_users = users[users['name'].notna()].copy()
name_tokens = new_users['name'].str.split()
new_users['surname'] = name_tokens.str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
tied_for_top = surname_counts[surname_counts == max_count]
common_surnames = sorted(tied_for_top.index.tolist())
print(','.join(common_surnames))

Singh
