In [10]:
#users.csv
import requests
import csv

import os

# Supply your personal access token through the GITHUB_TOKEN environment variable
# so the credential never has to be pasted into (and saved with) the notebook.
# The literal 'token' placeholder is kept as the fallback when the variable is unset or empty.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') or 'token'
# Authorization header sent with every GitHub API request made via HEADERS.
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}
BASE_URL = 'https://api.github.com'

def get_users_in_bangalore():
    """Return the logins of users matching the GitHub user search query.

    NOTE: despite the function name, the query searches for
    ``location:Melbourne followers:>100``.

    Returns:
        list[str]: logins in the order the search endpoint returned them.

    Raises:
        requests.HTTPError: if any search request fails (for example when the
            rate limit is hit), instead of silently returning a partial list.
    """
    per_page = 100       # fewer round trips than the API's default page size
    max_results = 1000   # the search API does not serve results beyond this many
    users = []
    page = 1

    while len(users) < max_results:
        response = requests.get(f'{BASE_URL}/search/users',
                                headers=HEADERS,
                                params={'q': 'location:Melbourne followers:>100',
                                        'per_page': per_page,
                                        'page': page})
        # An error payload has no 'items' key; without this check the loop would
        # just stop and the caller would get a truncated list with no warning.
        response.raise_for_status()
        items = response.json().get('items') or []

        users.extend(user['login'] for user in items)

        # A short (or empty) page means there is nothing further to fetch.
        if len(items) < per_page:
            break

        page += 1

    return users

def get_user_details(username):
    """Fetch the profile of ``username`` from the GitHub users endpoint.

    Args:
        username: login of the user to look up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the request fails; otherwise the error payload
            would be returned as if it were a profile and every ``.get(...)``
            on it would yield None.
    """
    response = requests.get(f'{BASE_URL}/users/{username}', headers=HEADERS)
    response.raise_for_status()
    return response.json()

def clean_company(company):
    """Normalise a company name: trim whitespace, drop leading '@', upper-case.

    Args:
        company: raw company string, or None / empty when the user has none.

    Returns:
        The cleaned name, or None when there is no company. Input that is empty
        after cleaning (only whitespace and/or '@') also yields None, so "no
        company" always has a single representation.
    """
    if not company:
        return None
    cleaned = company.strip().lstrip('@').upper()
    # Avoid returning '' for inputs such as '   ' or '@'.
    return cleaned or None

def main():
    """Fetch details for every searched user and write them to users.csv."""
    columns = ['login', 'name', 'company', 'location', 'email',
               'hireable', 'bio', 'public_repos', 'followers',
               'following', 'created_at']

    # Collect all rows first so the output file is only opened once fetching is done.
    rows = []
    for login in get_users_in_bangalore():
        profile = get_user_details(login)
        record = {column: profile.get(column) for column in columns}
        # 'company' is the only field that is transformed before being stored.
        record['company'] = clean_company(record['company'])
        rows.append(record)

    # Dump header plus one line per user
    with open('users.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

# Entry point: call main() only when this code runs as the top-level program.
if __name__ == '__main__':
    main()


In [11]:
# Further cleaning of users.csv
import pandas as pd

# Read the exported user table
users_df = pd.read_csv('users.csv')

# Normalise company names in a single chain:
# trim whitespace -> drop leading '@' -> upper-case
users_df['company'] = (
    users_df['company']
    .str.strip()
    .str.lstrip('@')
    .str.upper()
)

# Overwrite users.csv with the normalised table (no index column)
users_df.to_csv('users.csv', index=False)

print("Company names cleaned and saved to users.csv.")

Company names cleaned and saved to users.csv.


In [13]:
#repositories.csv
import requests
import csv

import os

# Supply your personal access token through the GITHUB_TOKEN environment variable
# so the credential never has to be pasted into (and saved with) the notebook.
# The literal 'token' placeholder is kept as the fallback when the variable is unset or empty.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') or 'token'
# Authorization header sent with every GitHub API request made via HEADERS.
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}
BASE_URL = 'https://api.github.com'

def read_users_from_csv(file_path):
    """Return the 'login' value of every data row in the CSV at ``file_path``.

    Values are returned in file order; the first line is treated as the header.
    """
    with open(file_path, 'r', encoding='utf-8') as csvfile:
        return [record['login'] for record in csv.DictReader(csvfile)]

def get_repositories(username):
    """Return up to 500 repositories of ``username``, most recently pushed first.

    Args:
        username: login whose repositories are listed.

    Returns:
        list[dict]: one dict per repository with the keys full_name, created_at,
        stargazers_count, watchers_count, language, has_projects, has_wiki and
        license_name (the license key, or None when the repo has no license).

    Raises:
        requests.HTTPError: if a page request fails. Without this check an error
            payload (a dict) would be iterated like a repository list and fail
            with an unrelated TypeError.
    """
    max_repos = 500
    per_page = 100
    repos = []
    page = 1

    while len(repos) < max_repos:
        response = requests.get(f'{BASE_URL}/users/{username}/repos',
                                headers=HEADERS,
                                params={'sort': 'pushed', 'direction': 'desc',
                                        'per_page': per_page, 'page': page})
        response.raise_for_status()
        data = response.json()

        for repo in data:
            repos.append({
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                # 'license' may be missing or null; only read its key when present
                'license_name': repo['license']['key'] if repo.get('license') else None
            })

        # A short (or empty) page is the last one, so skip the extra request.
        if len(data) < per_page:
            break

        page += 1

    return repos[:max_repos]

def main():
    """Fetch the repositories of every login in users.csv and write repositories.csv."""
    columns = ['login', 'full_name', 'created_at',
               'stargazers_count', 'watchers_count',
               'language', 'has_projects',
               'has_wiki', 'license_name']

    # One output row per (user, repository) pair, tagged with the owning login.
    rows = [
        {'login': login, **repo}
        for login in read_users_from_csv('users.csv')
        for repo in get_repositories(login)
    ]

    # Dump header plus one line per repository
    with open('repositories.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

# Entry point: call main() only when this code runs as the top-level program.
if __name__ == '__main__':
    main()


In [15]:
#Q1
import pandas as pd

# Read the user table
users_df = pd.read_csv("users.csv")

# Rank by follower count, highest first, and keep the five leading rows
top_users = users_df.sort_values(by="followers", ascending=False).iloc[:5]

# Render their logins as "a, b, c, ..."
top_users_list = ", ".join(top_users['login'])

print("Top 5 users by followers:", top_users_list)

Top 5 users by followers: mosh-hamedani, TheCherno, haileys, rstacruz, jesseduffield


In [16]:

#Q2: Who are the 5 earliest registered GitHub users in Melbourne? List their login in ascending order of created_at, comma-separated.

import pandas as pd

# Read the user table and parse 'created_at' into real timestamps so the sort is chronological
users_df = pd.read_csv("users.csv").assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'])
)

# Oldest accounts first; keep the first five
earliest_users = users_df.sort_values(by="created_at").iloc[:5]

# Render their logins as "a, b, c, ..."
earliest_users_list = ", ".join(earliest_users['login'])

print("5 earliest registered users:", earliest_users_list)

5 earliest registered users: toolmantim, crafterm, dgoodlad, Sutto, mdub


In [17]:
#Q3: What are the 3 most popular licenses among these users? Ignore missing licenses. List the license_name in order, comma-separated.

import pandas as pd

# Read the repository table
repos_df = pd.read_csv("repositories.csv")

# Keep only repositories that declare a license
filtered_repos = repos_df.dropna(subset=['license_name'])

# value_counts() orders by frequency (descending), so the first three are the most used
top_licenses = filtered_repos['license_name'].value_counts().iloc[:3]

# Render the license keys as "a, b, c"
top_licenses_list = ", ".join(top_licenses.index)

print("3 most popular licenses:", top_licenses_list)

3 most popular licenses: mit, other, apache-2.0


In [18]:
#Q4: 4. Which company do the majority of these developers work at? Company (cleaned up as explained above)

import pandas as pd

# Read the user table
users_df = pd.read_csv("users.csv")

# Users without a company are left out of the tally
filtered_users = users_df.dropna(subset=['company'])

# Tally users per company and pick the company with the largest tally
company_counts = filtered_users['company'].value_counts()
most_common_company = company_counts.idxmax()

print("Company with the most developers:", most_common_company)

Company with the most developers: MONASH UNIVERSITY


In [19]:
#Q5: 5. Which programming language is most popular among these users?
import pandas as pd

# Read the repository table
repos_df = pd.read_csv("repositories.csv")

# Repositories with no detected language are left out of the tally
filtered_repos = repos_df.dropna(subset=['language'])

# Tally repositories per language and pick the language with the largest tally
language_tally = filtered_repos['language'].value_counts()
most_common_language = language_tally.idxmax()

print("Most popular programming language:", most_common_language)

Most popular programming language: JavaScript


In [20]:
#Q6: Which programming language is the second most popular among users who joined after 2020?

import pandas as pd

# Read both tables
users_df = pd.read_csv("users.csv")
repos_df = pd.read_csv("repositories.csv")

# Parse the account creation timestamps
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Accounts whose creation year is later than 2020
joined_late = users_df['created_at'].dt.year.gt(2020)
users_after_2020 = users_df.loc[joined_late]

# Repositories owned by those accounts
owned_by_late_joiner = repos_df['login'].isin(users_after_2020['login'])
repos_after_2020 = repos_df.loc[owned_by_late_joiner]

# Ignore repositories with no detected language
filtered_repos = repos_after_2020.dropna(subset=['language'])

# Frequency of each language, most frequent first
language_counts = filtered_repos['language'].value_counts()

# A runner-up only exists when at least two distinct languages were seen
if len(language_counts) < 2:
    print("Not enough language data available for users who joined after 2020 to determine the second most popular language.")
else:
    second_most_common_language = language_counts.index[1]  # position 1 = runner-up
    print("Second most popular programming language for users who joined after 2020:", second_most_common_language)


Second most popular programming language for users who joined after 2020: JavaScript


In [21]:
#Q7: Which language has the highest average number of stars per repository?

import pandas as pd

# Read the repository table
repos_df = pd.read_csv("repositories.csv")

# Keep rows that have both a language and a star count
has_language = repos_df['language'].notna()
has_stars = repos_df['stargazers_count'].notna()
filtered_repos = repos_df[has_language & has_stars]

# Mean star count per language
average_stars = filtered_repos.groupby('language')['stargazers_count'].mean()

# Language with the largest mean, and that mean
top_language = average_stars.idxmax()
top_average_stars = average_stars.loc[top_language]

print(f"Language with the highest average number of stars per repository: {top_language} ({top_average_stars:.2f} stars)")


Language with the highest average number of stars per repository: D (2523.00 stars)


In [22]:
#Q8 Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

import pandas as pd

# Read the user table
users_df = pd.read_csv("users.csv")

# leader_strength = followers / (1 + following); the +1 keeps the denominator non-zero
denominator = users_df['following'].add(1)
users_df['leader_strength'] = users_df['followers'].div(denominator)

# Strongest leaders first; keep the first five
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).iloc[:5]

# Render their logins as "a, b, c, ..."
top_leaders_logins = ", ".join(top_leaders['login'])

print("Top 5 users by leader_strength:", top_leaders_logins)


Top 5 users by leader_strength: mosh-hamedani, binarythistle, TheCherno, TuPayChain, rogerclarkmelbourne


In [24]:
#Q9 What is the correlation between the number of followers and the number of public repositories among users in Melbourne?

import pandas as pd

# Read the user table
users_df = pd.read_csv("users.csv")

# Correlation of follower count with public repository count
followers = users_df['followers']
public_repos = users_df['public_repos']
correlation = followers.corr(public_repos)

# Report with three decimals
print(f"Correlation between followers and public repositories: {correlation:.3f}")


Correlation between followers and public repositories: 0.188


In [25]:
#Q10 Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

import pandas as pd
from scipy.stats import linregress

# Read the user table
users_df = pd.read_csv("users.csv")

# Simple linear regression: x = public_repos (predictor), y = followers (response)
predictor = users_df['public_repos']
response = users_df['followers']
slope, intercept, r_value, p_value, std_err = linregress(predictor, response)

# The slope estimates the change in followers per additional public repository
print(f"Regression slope of followers on repos: {slope:.3f}")


Regression slope of followers on repos: 2.242


In [43]:
#Q11 Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?
import pandas as pd

# Read the repository table
repositories_df = pd.read_csv('repositories.csv')

# Cast both flags to integers, then correlate them
projects_flag = repositories_df['has_projects'].astype(int)
wiki_flag = repositories_df['has_wiki'].astype(int)
correlation = projects_flag.corr(wiki_flag)

print(f"The correlation between having projects enabled and having a wiki enabled is: {correlation:.3f}")

The correlation between having projects enabled and having a wiki enabled is: 0.380


In [44]:
#Q12 Do hireable users follow more people than those who are not hireable?
import pandas as pd

# Read the user table
users_df = pd.read_csv('users.csv')

# Split users by the 'hireable' flag; a missing flag counts as not hireable
hireable_flag = users_df['hireable']
is_hireable = hireable_flag == True
is_not_hireable = hireable_flag.isna() | (hireable_flag == False)
hireable_users = users_df[is_hireable]
non_hireable_users = users_df[is_not_hireable]

# Mean 'following' within each group
average_hireable_following = hireable_users['following'].mean()
average_non_hireable_following = non_hireable_users['following'].mean()

# Positive means hireable users follow more people on average
difference = average_hireable_following - average_non_hireable_following

# Report with three decimals
print(f'Difference in average following (hireable - non-hireable): {difference:.3f}')

Difference in average following (hireable - non-hireable): -45.901


In [31]:
#Q13 Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)

import pandas as pd
from scipy.stats import linregress

# Load the users.csv file
users_df = pd.read_csv("users.csv")

# Keep only users that have a bio. The explicit .copy() makes this an independent
# frame, so adding a column below does not raise pandas' SettingWithCopyWarning.
users_with_bio = users_df[users_df['bio'].notna()].copy()

# Bio length in words, splitting on any run of whitespace
users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda bio: len(bio.split()))

# Linear regression with 'bio_word_count' as the predictor and 'followers' as the response
slope, intercept, r_value, p_value, std_err = linregress(users_with_bio['bio_word_count'], users_with_bio['followers'])

# The slope estimates the change in followers per additional bio word
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Regression slope of followers on bio word count: 7.351


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))


In [32]:
#Q14 Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

import pandas as pd

# Read the repository table
repos_df = pd.read_csv("repositories.csv")

# Parse creation timestamps as UTC
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'], utc=True)

# Weekday number: Monday=0 ... Sunday=6
repos_df['day_of_week'] = repos_df['created_at'].dt.weekday

# Saturday (5) and Sunday (6) are the weekend
created_on_weekend = repos_df['day_of_week'].ge(5)
weekend_repos = repos_df.loc[created_on_weekend]

# Weekend-created repositories per owner, most prolific first
weekend_repo_counts = weekend_repos['login'].value_counts()

# Five most prolific owners
top_5_users = weekend_repo_counts.iloc[:5]

# Logins joined by commas (no spaces)
print(','.join(top_5_users.index))


wolfeidau,karkranikhil,roachhd,plutext,rstacruz


In [45]:
#Q15 Do people who are hireable share their email addresses more often?
import pandas as pd

# Read the user table
users_df = pd.read_csv('users.csv')

# Row count of the full table
total_users = len(users_df)

# Split users by the 'hireable' flag; a missing flag counts as not hireable
hireable_flag = users_df['hireable']
is_hireable = hireable_flag == True
is_not_hireable = hireable_flag.isna() | (hireable_flag == False)
hireable_users = users_df[is_hireable]
non_hireable_users = users_df[is_not_hireable]

# Share of each group with a non-missing email (mean of a boolean mask)
fraction_hireable_with_email = hireable_users['email'].notna().mean()
fraction_non_hireable_with_email = non_hireable_users['email'].notna().mean()

# Positive means hireable users share an email more often
difference = fraction_hireable_with_email - fraction_non_hireable_with_email

# Report with three decimals
print(f'Difference in fraction of users with email: {difference:.3f}')


Difference in fraction of users with email: 0.048


In [40]:
#Q16 Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

import csv
from collections import Counter

# Tally of how often each surname occurs
surname_counts = Counter()

# Stream users.csv row by row
with open('users.csv', 'r', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile):
        # Trim the name and break it into whitespace-separated words;
        # an empty name yields no words and is skipped
        name_parts = row.get('name', '').strip().split()
        if name_parts:
            # The last word is taken to be the surname
            surname_counts[name_parts[-1]] += 1

if not surname_counts:
    print("No surnames found in the data.")
else:
    # Highest occurrence count among all surnames
    max_frequency = max(surname_counts.values())
    # Every surname that reaches that count, in alphabetical order (covers ties)
    most_common_surnames = sorted(
        surname for surname, count in surname_counts.items() if count == max_frequency
    )
    # Render as "a, b, c"
    result = ', '.join(most_common_surnames)
    print(f"Most common surname(s): {result}")


Most common surname(s): Jackson, Wang
