In [3]:
import requests
import time
import json
from typing import List, Dict
from datetime import datetime

class GitHubScraper:
    def __init__(self, token: str):
        """
        Initialize the scraper with GitHub API token
        
        Args:
            token (str): GitHub Personal Access Token
        """
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_url = 'https://api.github.com'
        
    def search_users(self, min_followers: int = 100, location: str = 'Chicago') -> List[Dict]:
        """
        Search for GitHub users in specified location with minimum followers
        
        Args:
            min_followers (int): Minimum number of followers
            location (str): Location to search for
            
        Returns:
            List[Dict]: List of user data dictionaries
        """
        users = []
        page = 1
        
        while True:
            search_url = f'{self.base_url}/search/users'
            params = {
                'q': f'location:{location} followers:>={min_followers}',
                'per_page': 100,
                'page': page
            }
            
            response = requests.get(search_url, headers=self.headers, params=params)
            
            if response.status_code != 200:
                print(f"Error searching users: {response.status_code}")
                print(response.json())
                break
                
            data = response.json()
            if not data['items']:
                break
                
            # Get detailed information for each user
            for user in data['items']:
                user_detail = self.get_user_details(user['login'])
                if user_detail:
                    users.append(user_detail)
                    
                # Respect API rate limits
                time.sleep(1)
                
            page += 1
            
        return users
    
    def get_user_details(self, username: str) -> Dict:
        """
        Get detailed information for a specific user
        
        Args:
            username (str): GitHub username
            
        Returns:
            Dict: User details dictionary
        """
        user_url = f'{self.base_url}/users/{username}'
        response = requests.get(user_url, headers=self.headers)
        
        if response.status_code != 200:
            print(f"Error fetching user details for {username}: {response.status_code}")
            return None
            
        return response.json()
    
    def get_user_repositories(self, username: str) -> List[Dict]:
        """
        Get repositories for a specific user
        
        Args:
            username (str): GitHub username
            
        Returns:
            List[Dict]: List of repository dictionaries
        """
        repos = []
        page = 1
        
        while True:
            repos_url = f'{self.base_url}/users/{username}/repos'
            params = {
                'per_page': 100,
                'page': page
            }
            
            response = requests.get(repos_url, headers=self.headers, params=params)
            
            if response.status_code != 200:
                print(f"Error fetching repositories for {username}: {response.status_code}")
                break
                
            data = response.json()
            if not data:
                break
                
            repos.extend(data)
            page += 1
            
        return repos

def main():
    # Replace with your GitHub Personal Access Token
    token = 'key'
    scraper = GitHubScraper(token)
    
    # Search for users in Chicago with >100 followers
    users = scraper.search_users(min_followers=100, location='Chicago')
    
    # Create output structure
    output = []
    for user in users:
        user_data = user.copy()
        # Add repositories to user data
        user_data['repositories'] = scraper.get_user_repositories(user['login'])
        output.append(user_data)
        
        # Respect API rate limits
        time.sleep(1)
    
    # Save results to file
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'github_chicago_users_{timestamp}.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)
        
    print(f"Found {len(users)} users. Data saved to {filename}")

if __name__ == "__main__":
    main()

Found 380 users. Data saved to github_chicago_users_20241030_120752.json


In [1]:
import pandas as pd
import json

def load_github_json(filename: str) -> pd.DataFrame:
    """
    Load GitHub users JSON file and convert it to a DataFrame with specified columns
    
    Args:
        filename (str): Path to the JSON file
        
    Returns:
        pd.DataFrame: DataFrame containing user information
    """
    # Define the columns we want to extract
    columns = [
        'login', 'id', 'node_id', 'avatar_url', 'gravatar_id', 'url', 
        'html_url', 'followers_url', 'following_url', 'gists_url', 
        'starred_url', 'subscriptions_url', 'organizations_url', 'repos_url',
        'events_url', 'received_events_url', 'type', 'user_view_type', 
        'site_admin', 'name', 'company', 'blog', 'location', 'email',
        'hireable', 'bio', 'twitter_username', 'public_repos', 'public_gists',
        'followers', 'following', 'created_at', 'updated_at'
    ]
    
    # Read JSON file
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found")
        return None
    except json.JSONDecodeError:
        print(f"Error: File '{filename}' is not valid JSON")
        return None
    
    # Convert to DataFrame
    df = pd.DataFrame(data)
    
    # Select only the specified columns
    # If a column doesn't exist, it will be filled with NaN
    df = df.reindex(columns=columns)
    
    # Convert timestamp strings to datetime objects
    timestamp_columns = ['created_at', 'updated_at']
    for col in timestamp_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col])
    
    # Print basic information about the DataFrame
    print(f"\nDataFrame Info:")
    print(f"Number of users: {len(df)}")
    print(f"Number of columns: {len(df.columns)}")
    print("\nMissing columns:", set(columns) - set(df.columns))
    print("\nFirst few rows:")
    print(df.head())
    
    return df

def main():
    # File path
    filename = '/home/neil/Desktop/github_chicago_users_20241030_120752.json'
    
    # Load and convert the data
    df = load_github_json(filename)
    
    if df is not None:
        # Save to CSV for easy viewing
        csv_filename = filename.replace('.json', '.csv')
        df.to_csv(csv_filename, index=False)
        print(f"\nData saved to {csv_filename}")
        
        # Basic data analysis
        print("\nBasic Statistics:")
        print(f"Total number of users: {len(df)}")
        print(f"Average followers: {df['followers'].mean():.2f}")
        print(f"Average public repos: {df['public_repos'].mean():.2f}")
        print("\nTop 5 users by followers:")
        print(df.nlargest(5, 'followers')[['login', 'followers', 'public_repos']])

if __name__ == "__main__":
    main()


DataFrame Info:
Number of users: 380
Number of columns: 33

Missing columns: set()

First few rows:
         login       id               node_id  \
0     cassidoo  1454517  MDQ6VXNlcjE0NTQ1MTc=   
1     felangel  8855632  MDQ6VXNlcjg4NTU2MzI=   
2       dabeaz   350836  MDQ6VXNlcjM1MDgzNg==   
3  sstephenson     2603      MDQ6VXNlcjI2MDM=   
4  mattgodbolt   633973  MDQ6VXNlcjYzMzk3Mw==   

                                          avatar_url gravatar_id  \
0  https://avatars.githubusercontent.com/u/145451...               
1  https://avatars.githubusercontent.com/u/885563...               
2  https://avatars.githubusercontent.com/u/350836...               
3   https://avatars.githubusercontent.com/u/2603?v=4               
4  https://avatars.githubusercontent.com/u/633973...               

                                        url                        html_url  \
0     https://api.github.com/users/cassidoo     https://github.com/cassidoo   
1     https://api.github.com/users/fe

In [3]:
filepath='/home/neil/Desktop/github_chicago_users_20241030_120752.csv'
df = pd.read_csv(filepath)

In [4]:
df

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,email,hireable,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at
0,cassidoo,1454517,MDQ6VXNlcjE0NTQ1MTc=,https://avatars.githubusercontent.com/u/145451...,,https://api.github.com/users/cassidoo,https://github.com/cassidoo,https://api.github.com/users/cassidoo/followers,https://api.github.com/users/cassidoo/followin...,https://api.github.com/users/cassidoo/gists{/g...,...,,,Making memes and dreams... and software,cassidoo,165,47,13380,102,2012-02-20 16:36:23+00:00,2024-10-24 18:59:19+00:00
1,felangel,8855632,MDQ6VXNlcjg4NTU2MzI=,https://avatars.githubusercontent.com/u/885563...,,https://api.github.com/users/felangel,https://github.com/felangel,https://api.github.com/users/felangel/followers,https://api.github.com/users/felangel/followin...,https://api.github.com/users/felangel/gists{/g...,...,felangelov@gmail.com,,"software engineer by day, software engineer by...",felangelov,125,293,8672,67,2014-09-22 02:35:58+00:00,2024-10-28 17:01:55+00:00
2,dabeaz,350836,MDQ6VXNlcjM1MDgzNg==,https://avatars.githubusercontent.com/u/350836...,,https://api.github.com/users/dabeaz,https://github.com/dabeaz,https://api.github.com/users/dabeaz/followers,https://api.github.com/users/dabeaz/following{...,https://api.github.com/users/dabeaz/gists{/gis...,...,dave@dabeaz.com,,Author of the Python Essential Reference (Addi...,,34,10,5180,0,2010-08-01 15:22:48+00:00,2024-01-08 14:46:16+00:00
3,sstephenson,2603,MDQ6VXNlcjI2MDM=,https://avatars.githubusercontent.com/u/2603?v=4,,https://api.github.com/users/sstephenson,https://github.com/sstephenson,https://api.github.com/users/sstephenson/follo...,https://api.github.com/users/sstephenson/follo...,https://api.github.com/users/sstephenson/gists...,...,sam@sls.name,,,,24,58,3761,0,2008-03-08 22:17:24+00:00,2024-09-28 16:33:20+00:00
4,mattgodbolt,633973,MDQ6VXNlcjYzMzk3Mw==,https://avatars.githubusercontent.com/u/633973...,,https://api.github.com/users/mattgodbolt,https://github.com/mattgodbolt,https://api.github.com/users/mattgodbolt/follo...,https://api.github.com/users/mattgodbolt/follo...,https://api.github.com/users/mattgodbolt/gists...,...,matt@godbolt.org,,"Compiler Explorer and jsbeeb creator, ex-Googl...",,84,17,3397,97,2011-02-23 13:46:48+00:00,2024-10-24 02:11:12+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,cwarden,387035,MDQ6VXNlcjM4NzAzNQ==,https://avatars.githubusercontent.com/u/387035...,,https://api.github.com/users/cwarden,https://github.com/cwarden,https://api.github.com/users/cwarden/followers,https://api.github.com/users/cwarden/following...,https://api.github.com/users/cwarden/gists{/gi...,...,cwarden@xerus.org,True,Salesforce Development Professional/Linux Afic...,,261,68,100,129,2010-09-03 22:16:37+00:00,2024-02-04 12:55:56+00:00
376,nadrane,1634953,MDQ6VXNlcjE2MzQ5NTM=,https://avatars.githubusercontent.com/u/163495...,,https://api.github.com/users/nadrane,https://github.com/nadrane,https://api.github.com/users/nadrane/followers,https://api.github.com/users/nadrane/following...,https://api.github.com/users/nadrane/gists{/gi...,...,nicholasDrane@gmail.com,True,,,64,3,100,1,2012-04-11 23:56:43+00:00,2023-10-13 19:52:38+00:00
377,rogeruiz,706004,MDQ6VXNlcjcwNjAwNA==,https://avatars.githubusercontent.com/u/706004...,,https://api.github.com/users/rogeruiz,https://github.com/rogeruiz,https://api.github.com/users/rogeruiz/followers,https://api.github.com/users/rogeruiz/followin...,https://api.github.com/users/rogeruiz/gists{/g...,...,hi@rog.gr,True,"🎶 soy un in-ge-niero. I'm a computer, baby. so...",rogeruiz,197,175,100,293,2011-04-02 22:38:41+00:00,2024-10-24 13:58:43+00:00
378,scottferg,74309,MDQ6VXNlcjc0MzA5,https://avatars.githubusercontent.com/u/74309?v=4,,https://api.github.com/users/scottferg,https://github.com/scottferg,https://api.github.com/users/scottferg/followers,https://api.github.com/users/scottferg/followi...,https://api.github.com/users/scottferg/gists{/...,...,,,,,66,40,100,46,2009-04-16 03:45:47+00:00,2024-09-20 17:09:58+00:00


In [6]:
import pandas as pd
import numpy as np

def clean_company_name(company):
    """
    Clean company names according to specified rules:
    1. Trim whitespace
    2. Strip leading @ symbol (only the first one)
    3. Convert to uppercase
    
    Args:
        company: Company name to clean
        
    Returns:
        str: Cleaned company name
    """
    if pd.isna(company) or company == '':
        return None
        
    # Convert to string in case it's not
    company = str(company)
    
    # Trim whitespace
    company = company.strip()
    
    # Strip leading @ symbol (only the first one)
    if company.startswith('@'):
        company = company[1:]
        
    # Convert to uppercase
    company = company.upper()
    
    return company

def process_users_data(filepath: str) -> pd.DataFrame:
    """
    Load and process GitHub users data according to specifications
    
    Args:
        filepath (str): Path to the CSV file
        
    Returns:
        pd.DataFrame: Processed DataFrame
    """
    try:
        # Read CSV file
        df = pd.read_csv(filepath)
        
        # Select and reorder specified columns
        columns = [
            'login', 'name', 'company', 'location', 'email', 'hireable',
            'bio', 'public_repos', 'followers', 'following', 'created_at'
        ]
        
        df = df.reindex(columns=columns)
        
        # Clean company names
        df['company'] = df['company'].apply(clean_company_name)
        
        # Convert created_at to datetime
        df['created_at'] = pd.to_datetime(df['created_at'])
        
        # Convert hireable to boolean
        df['hireable'] = df['hireable'].fillna(False).astype(bool)
        
        # Convert numeric columns
        numeric_cols = ['public_repos', 'followers', 'following']
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
        
        # Print summary statistics
        print("\nData Summary:")
        print(f"Total users: {len(df)}")
        print(f"Users with company info: {df['company'].notna().sum()}")
        print(f"Average followers: {df['followers'].mean():.1f}")
        print(f"Average public repos: {df['public_repos'].mean():.1f}")
        
        # Print sample of cleaned company names
        print("\nSample of cleaned company names:")
        companies = df[df['company'].notna()]['company'].head()
        for company in companies:
            print(f"  - {company}")
        
        return df
        
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except Exception as e:
        print(f"Error processing file: {str(e)}")
        return None

def main():
    # File path
    filepath = '/home/neil/Desktop/github_chicago_users_20241030_120752.csv'
    
    # Process the data
    df = process_users_data(filepath)
    
    if df is not None:
        # Save processed data
        output_file = 'processed_users.csv'
        df.to_csv(output_file, index=False)
        print(f"\nProcessed data saved to {output_file}")
        
        # Display additional analysis
        print("\nTop 5 users by followers:")
        top_users = df.nlargest(5, 'followers')[['login', 'name', 'company', 'followers']]
        print(top_users)
        
        print("\nMost common companies:")
        company_counts = df['company'].value_counts().head()
        print(company_counts)
        
        print("\nUsers open to hiring:")
        hireable_users = df[df['hireable']][['login', 'name', 'company']].head()
        print(hireable_users)
        
        # Basic statistics
        print("\nBasic statistics:")
        print(df[['followers', 'following', 'public_repos']].describe())

if __name__ == "__main__":
    main()


Data Summary:
Total users: 380
Users with company info: 221
Average followers: 387.4
Average public repos: 96.5

Sample of cleaned company names:
  - GITHUB
  - SHOREBIRDTECH
  - DABEAZ, LLC
  - AQUATIC CAPITAL MANAGEMENT
  - GOOGLE

Processed data saved to processed_users.csv

Top 5 users by followers:
         login              name                     company  followers
0     cassidoo  Cassidy Williams                      GITHUB      13380
1     felangel     Felix Angelov               SHOREBIRDTECH       8672
2       dabeaz     David Beazley                 DABEAZ, LLC       5180
3  sstephenson    Sam Stephenson                        None       3761
4  mattgodbolt      Matt Godbolt  AQUATIC CAPITAL MANAGEMENT       3397

Most common companies:
company
UNIVERSITY OF CHICAGO          12
GOOGLE                          6
GITHUB                          5
NORTHWESTERN UNIVERSITY         4
ARGONNE NATIONAL LABORATORY     3
Name: count, dtype: int64

Users open to hiring:
           

  df['hireable'] = df['hireable'].fillna(False).astype(bool)


In [17]:
import pandas as pd
import requests
import time
from datetime import datetime
import os

def get_github_repos(username, headers):
    """Fetch repositories for a given GitHub user."""
    repos = []
    page = 1
    
    while len(repos) < 500:  # Limit to 500 repos per user
        url = f"https://api.github.com/users/{username}/repos"
        params = {
            'sort': 'pushed',
            'direction': 'desc',
            'per_page': 100,
            'page': page
        }
        
        response = requests.get(url, headers=headers, params=params)
        
        if response.status_code == 404:
            print(f"User {username} not found")
            break
        elif response.status_code != 200:
            print(f"Error fetching repos for {username}: {response.status_code}")
            print(f"Response: {response.text}")
            break
            
        page_repos = response.json()
        if not page_repos:  # No more repos
            break
            
        repos.extend(page_repos)
        page += 1
        
        # Respect rate limits
        if response.headers.get('X-RateLimit-Remaining') == '0':
            reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
            sleep_time = max(reset_time - time.time(), 0) + 1
            print(f"Rate limit reached. Sleeping for {sleep_time} seconds...")
            time.sleep(sleep_time)
    
    return repos[:500]  # Ensure we don't exceed 500 repos

def process_repos(repos, username):
    """Process repository data and return as list of dictionaries."""
    processed_repos = []
    
    for repo in repos:
        processed_repo = {
            'login': username,
            'full_name': repo.get('full_name', ''),
            'created_at': repo.get('created_at', ''),
            'stargazers_count': repo.get('stargazers_count', 0),
            'watchers_count': repo.get('watchers_count', 0),
            'language': repo.get('language', ''),
            'has_projects': repo.get('has_projects', False),
            'has_wiki': repo.get('has_wiki', False),
            'license_name': repo.get('license', {}).get('key', '')
        }
        processed_repos.append(processed_repo)
    
    return processed_repos

def main():
    # Direct token assignment (replace with your actual token)
    github_token = 'key'
    
    headers = {
        'Authorization': f'Bearer {github_token}',  # Changed to Bearer authentication
        'Accept': 'application/vnd.github.v3+json'
    }
    
    # Read users CSV
    users_df = pd.read_csv('/home/neil/Desktop/github_chicago_users_20241030_120752.csv')
    
    # Initialize list to store all repository data
    all_repos = []
    
    # Process each user
    total_users = len(users_df)
    for idx, user in enumerate(users_df['login'], 1):
        print(f"Processing user {idx}/{total_users}: {user}")
        
        try:
            # Fetch repos for user
            repos = get_github_repos(user, headers)
            
            # Process repos and add to main list
            processed_repos = process_repos(repos, user)
            all_repos.extend(processed_repos)
            
        except Exception as e:
            print(f"Error processing user {user}: {str(e)}")
            continue
        
        # Add small delay between users to be nice to GitHub API
        time.sleep(1)
    
    # Convert to DataFrame and save to CSV
    repos_df = pd.DataFrame(all_repos)
    repos_df.to_csv('repositories.csv', index=False)
    print(f"Saved {len(repos_df)} repositories to repositories.csv")

if __name__ == "__main__":
    main()

Processing user 1/380: cassidoo
Error processing user cassidoo: 'NoneType' object has no attribute 'get'
Processing user 2/380: felangel
Error processing user felangel: 'NoneType' object has no attribute 'get'
Processing user 3/380: dabeaz
Error processing user dabeaz: 'NoneType' object has no attribute 'get'
Processing user 4/380: sstephenson
Error processing user sstephenson: 'NoneType' object has no attribute 'get'
Processing user 5/380: mattgodbolt
Error processing user mattgodbolt: 'NoneType' object has no attribute 'get'
Processing user 6/380: logankilpatrick
Error processing user logankilpatrick: 'NoneType' object has no attribute 'get'
Processing user 7/380: khan4019
Error processing user khan4019: 'NoneType' object has no attribute 'get'
Processing user 8/380: adashofdata
Error processing user adashofdata: 'NoneType' object has no attribute 'get'
Processing user 9/380: djspiewak
Error processing user djspiewak: 'NoneType' object has no attribute 'get'
Processing user 10/380: e

KeyboardInterrupt: 

In [22]:
import pandas as pd
import requests
import time
from datetime import datetime
import os
import json

def get_github_repos(username, headers):
    """Fetch repositories for a given GitHub user."""
    repos = []
    page = 1
    
    while len(repos) < 500:  # Limit to 500 repos per user
        url = f"https://api.github.com/users/{username}/repos"
        params = {
            'sort': 'pushed',
            'direction': 'desc',
            'per_page': 100,
            'page': page
        }
        
        try:
            response = requests.get(url, headers=headers, params=params)
            
            # Debug information
            print(f"\nFetching repos for {username} (page {page}):")
            print(f"URL: {response.url}")
            print(f"Status Code: {response.status_code}")
            
            # Handle rate limiting
            remaining = int(response.headers.get('X-RateLimit-Remaining', 0))
            print(f"Rate limit remaining: {remaining}")
            
            if remaining == 0:
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                sleep_time = max(reset_time - time.time(), 0) + 1
                print(f"Rate limit reached. Sleeping for {sleep_time} seconds...")
                time.sleep(sleep_time)
                continue
            
            if response.status_code == 404:
                print(f"User {username} not found")
                return []
            elif response.status_code != 200:
                print(f"Error fetching repos for {username}: {response.status_code}")
                print(f"Response: {response.text}")
                return []
            
            page_repos = response.json()
            
            if not page_repos:  # No more repos
                break
                
            if isinstance(page_repos, list):
                repos.extend(page_repos)
                print(f"Retrieved {len(page_repos)} repos from page {page}")
                if len(page_repos) < 100:  # Last page
                    break
            else:
                print(f"Unexpected response format for {username}")
                print(f"Response: {page_repos}")
                break
                
            page += 1
            time.sleep(1)  # Be nice to the API
            
        except requests.exceptions.RequestException as e:
            print(f"Request error for {username}: {e}")
            return []
        except json.JSONDecodeError as e:
            print(f"JSON decode error for {username}: {e}")
            return []
            
    return repos[:500]  # Ensure we don't exceed 500 repos

def process_repos(repos, username):
    """Process repository data and return as list of dictionaries."""
    processed_repos = []
    
    if not repos:
        print(f"No repositories found for {username}")
        return processed_repos
        
    for repo in repos:
        try:
            # Debug info
            if repo is None:
                print(f"Found null repo object for {username}")
                continue
                
            processed_repo = {
                'login': username,
                'full_name': repo.get('full_name', ''),
                'created_at': repo.get('created_at', ''),
                'stargazers_count': repo.get('stargazers_count', 0),
                'watchers_count': repo.get('watchers_count', 0),
                'language': repo.get('language', ''),
                'has_projects': repo.get('has_projects', False),
                'has_wiki': repo.get('has_wiki', False),
                'license_name': repo.get('license', {}).get('key', '') if repo.get('license') else ''
            }
            processed_repos.append(processed_repo)
        except Exception as e:
            print(f"Error processing repo for {username}: {str(e)}")
            print(f"Problematic repo data: {repo}")
            continue
    
    return processed_repos

def main():
    # Set up authentication headers
    github_token = 'key'  # Should use environment variable in production
    
    headers = {
        'Authorization': f'Bearer {github_token}',
        'Accept': 'application/vnd.github.v3+json'
    }
    
    # Test API connection
    test_response = requests.get('https://api.github.com/user', headers=headers)
    if test_response.status_code != 200:
        print(f"API authentication failed: {test_response.status_code}")
        print(f"Response: {test_response.text}")
        return
    
    # Read users CSV
    try:
        users_df = pd.read_csv('github_chicago_users_20241030_120752.csv')
    except Exception as e:
        print(f"Error reading users CSV: {e}")
        return
    
    # Initialize list to store all repository data
    all_repos = []
    
    # Process each user
    total_users = len(users_df)
    for idx, user in enumerate(users_df['login'], 1):
        print(f"\nProcessing user {idx}/{total_users}: {user}")
        
        try:
            # Fetch repos for user
            repos = get_github_repos(user, headers)
            
            if repos:  # Only process if we got repos back
                print(f"Found {len(repos)} repos for {user}")
                processed_repos = process_repos(repos, user)
                if processed_repos:
                    print(f"Successfully processed {len(processed_repos)} repos for {user}")
                    all_repos.extend(processed_repos)
                else:
                    print(f"No repos were processed for {user}")
            
        except Exception as e:
            print(f"Error processing user {user}: {str(e)}")
            continue
        
        time.sleep(1)  # Be nice to the API
    
    # Convert to DataFrame and save to CSV
    if all_repos:
        repos_df = pd.DataFrame(all_repos)
        repos_df.to_csv('repositories.csv', index=False)
        print(f"\nSaved {len(repos_df)} repositories to repositories.csv")
    else:
        print("\nNo repositories were processed successfully.")

if __name__ == "__main__":
    main()


Processing user 1/380: cassidoo

Fetching repos for cassidoo (page 1):
URL: https://api.github.com/users/cassidoo/repos?sort=pushed&direction=desc&per_page=100&page=1
Status Code: 200
Rate limit remaining: 4928
Retrieved 100 repos from page 1

Fetching repos for cassidoo (page 2):
URL: https://api.github.com/users/cassidoo/repos?sort=pushed&direction=desc&per_page=100&page=2
Status Code: 200
Rate limit remaining: 4927
Retrieved 65 repos from page 2
Found 165 repos for cassidoo
Successfully processed 165 repos for cassidoo

Processing user 2/380: felangel

Fetching repos for felangel (page 1):
URL: https://api.github.com/users/felangel/repos?sort=pushed&direction=desc&per_page=100&page=1
Status Code: 200
Rate limit remaining: 4926
Retrieved 100 repos from page 1

Fetching repos for felangel (page 2):
URL: https://api.github.com/users/felangel/repos?sort=pushed&direction=desc&per_page=100&page=2
Status Code: 200
Rate limit remaining: 4925
Retrieved 25 repos from page 2
Found 125 repos f

In [24]:
# 1. Who are the top 5 users in Chicago with the highest number of followers? List their login in order, comma-separated.
top_users = df.nlargest(5, 'followers')['login'].tolist()
top_users

['cassidoo', 'felangel', 'dabeaz', 'sstephenson', 'mattgodbolt']

In [30]:
#2. Who are the 5 earliest registered GitHub users in Chicago? List their login in ascending order of created_at, comma-separated.
df['created_at'] = pd.to_datetime(df['created_at'])
earliest_users = df.nsmallest(5, 'created_at')['login'].tolist()
earliest_users

['ELLIOTTCABLE', 'trevorturk', 'lukehoersten', 'djspiewak', 'shanesveller']

In [32]:
#list all columns of df
df.columns

Index(['login', 'id', 'node_id', 'avatar_url', 'gravatar_id', 'url',
       'html_url', 'followers_url', 'following_url', 'gists_url',
       'starred_url', 'subscriptions_url', 'organizations_url', 'repos_url',
       'events_url', 'received_events_url', 'type', 'user_view_type',
       'site_admin', 'name', 'company', 'blog', 'location', 'email',
       'hireable', 'bio', 'twitter_username', 'public_repos', 'public_gists',
       'followers', 'following', 'created_at', 'updated_at'],
      dtype='object')

In [33]:
# read the repositories.csv file to dt
dt = pd.read_csv('repositories.csv')

In [38]:
#3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.
popular_licenses = dt['license_name'].value_counts().nlargest(3).index.tolist()
print(popular_licenses)

['mit', 'other', 'apache-2.0']


In [37]:
#4. Which company do the majority of these developers work at?
most_common_company = df['company'].value_counts().idxmax()
print(most_common_company)

University of Chicago


In [39]:
#5. Which programming language is most popular among these users?
most_popular_language = dt['language'].value_counts().idxmax()
print(most_popular_language)

JavaScript


In [40]:
#6. Which programming language is the second most popular among users who joined after 2020?
new_users = df[df['created_at'].dt.year > 2020]
new_repos = dt[dt['login'].isin(new_users['login'])]
second_most_popular_language = new_repos['language'].value_counts().index[1]    
print(second_most_popular_language)

JavaScript


In [41]:
#7. Which language has the highest average number of stars per repository?
stars_per_repo = dt.groupby('language')['stargazers_count'].mean()
highest_avg_stars_language = stars_per_repo.idxmax()
print(highest_avg_stars_language)

Vim Script


In [42]:
#Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.
df['leader_strength'] = df['followers'] / (1 + df['following'])
top_leader_strength = df.nlargest(5, 'leader_strength')['login'].tolist()
top_leader_strength

['dabeaz', 'sstephenson', 'khan4019', 'adashofdata', 'djspiewak']

In [43]:
#9. What is the correlation between the number of followers and the number of public repositories among users in Chicago? Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)
correlation = df['followers'].corr(df['public_repos'])
correlation

0.07508240565044703

In [44]:
#10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository. Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)
from sklearn.linear_model import LinearRegression

X = df['public_repos'].values.reshape(-1, 1)
y = df['followers'].values

model = LinearRegression()
model.fit(X, y)
slope = model.coef_[0]

slope

0.6014915352161939

In [45]:
#11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled? Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)
correlation_projects_wiki = dt['has_projects'].corr(dt['has_wiki'])
correlation_projects_wiki


0.2870694402794212

In [49]:
# First, let's look at what values are actually in the hireable column
print("Unique values in 'hireable' column:")
print(df['hireable'].unique())

# Let's also see the value counts
print("\nValue counts in 'hireable' column:")
print(df['hireable'].value_counts())

# Fix: The column might contain None/NaN values, so we need to handle those
# Convert the hireable column to boolean, treating NaN as False
df['hireable_bool'] = df['hireable'].fillna(False)

# Now calculate the difference
hireable_following = df.groupby('hireable_bool')['following'].mean()
hireable_diff = hireable_following[True] - hireable_following[False]

# Format to 3 decimal places
formatted_diff = f"{hireable_diff:.3f}"
print("\nDifference in average following (hireable - not hireable):")
print(formatted_diff)

Unique values in 'hireable' column:
[nan True]

Value counts in 'hireable' column:
hireable
True    97
Name: count, dtype: int64

Difference in average following (hireable - not hireable):
109.770


  df['hireable_bool'] = df['hireable'].fillna(False)


In [50]:
#13. Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios) Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)
# Calculate bio word count
df['bio_word_count'] = df['bio'].str.split().str.len()

# Filter out users without a bio
bio_users = df.dropna(subset=['bio'])

# Perform linear regression
X = bio_users['bio_word_count'].values.reshape(-1, 1)
y = bio_users['followers'].values

model = LinearRegression()

model.fit(X, y)
slope_bio = model.coef_[0]

# Format to 3 decimal places
formatted_slope_bio = f"{slope_bio:.3f}"
print("\nRegression slope of followers on bio word count:")
print(formatted_slope_bio)



Regression slope of followers on bio word count:
3.094


In [51]:
#14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated Users login
# Convert created_at to datetime
dt['created_at'] = pd.to_datetime(dt['created_at'])

# Extract day of the week (0=Monday, 6=Sunday)
dt['day_of_week'] = dt['created_at'].dt.dayofweek

# Filter for weekends (5=Saturday, 6=Sunday)
weekend_repos = dt[dt['day_of_week'] >= 5]

# Count repositories by user
top_weekend_users = weekend_repos['login'].value_counts().nlargest(5).index.tolist()
top_weekend_users


['marwahaha', 'eddelbuettel', 'sabre1041', 'erichilarysmithsr', 'yyolk']

In [52]:
#15. Do people who are hireable share their email addresses more often? [fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

# Calculate the difference in email sharing
email_hireable = df[df['hireable_bool']]['email'].notna().mean()
email_not_hireable = df[~df['hireable_bool']]['email'].notna().mean()
email_diff = email_hireable - email_not_hireable

# Format to 3 decimal places
formatted_email_diff = f"{email_diff:.3f}"
print("\nDifference in email sharing (hireable - not hireable):")
print(formatted_email_diff)



Difference in email sharing (hireable - not hireable):
0.076


In [53]:
#16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically) Most common surname(s)
# Extract last word as surname
df['surname'] = df['name'].str.split().str[-1]

# Drop missing surnames
surname_counts = df['surname'].value_counts().dropna()

# Find most common surname(s)
most_common_surnames = surname_counts[surname_counts == surname_counts.max()].index.tolist()
most_common_surnames.sort()
most_common_surnames



['Smith']