# Fetch users based on location

Due to the Github API's limit of returning only the first 1000 items, we must divide the query into intervals based on the account creation dates.

In [39]:
from datetime import datetime, timedelta


def generate_date_intervals(
    interval_days,
    start_year=2008,
    end_date=datetime.now(),
):
    dates = []
    end_year = end_date.year

    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            # Skip months beyond the end date in the final year
            if year == end_year and month > end_date.month:
                break

            # Calculate the number of days in the current month
            if month == 12:
                days_in_month = (
                    datetime(year + 1, 1, 1) - datetime(year, month, 1)
                ).days
            else:
                days_in_month = (
                    datetime(year, month + 1, 1) - datetime(year, month, 1)
                ).days

            # Adjust the days in the month if it's the current month and year
            if year == end_date.year and month == end_date.month:
                days_in_month = end_date.day

            # Loop through the month in intervals of `interval_days`
            for day in range(1, days_in_month + 1, interval_days):
                start_date = datetime(year, month, day)
                # Ensure the end date does not exceed the month or the specified end date
                end_interval_date = min(
                    start_date + timedelta(days=interval_days - 1),
                    datetime(year, month, days_in_month) + timedelta(days=1),
                    end_date
                    + timedelta(days=1),  # Ensure we don't go beyond the end_date
                )

                # Format the dates as strings and add them to the list
                dates.append(
                    f"{start_date.strftime('%Y-%m-%d')}..{end_interval_date.strftime('%Y-%m-%d')}"
                )
    return dates

In [40]:
from urllib.parse import urlparse, parse_qs

def get_page_num(url):
    query_string = urlparse(url).query
    params = parse_qs(query_string)
    return params.get("page", [None])[0]

In [41]:
import requests
import json
import time
import os
from dotenv import load_dotenv

load_dotenv()

token = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"Bearer {token}"}
base_url = "https://api.github.com/search/users?"
location = "montreal"

In [42]:
def fetch_users_from_github(location, dates, base_url, headers):
    total_users = []
    log = []

    for idx, date in enumerate(dates):
        # if idx > 0:
        #     print("DEBUG MODE: breaking after 1 iterations")
        #     print(
        #         f"Processed {idx -1} {dates[idx - 1]} out of {len(dates)} / Collected users: {len(users)} Total users: {len(total_users)}"
        #     )
        #     return users
        log_for_date = {"date": date, "messages": [], "overflow": False}
        next_url = "first_page"
        while True:
            if next_url is not None and next_url != "first_page":
                response = requests.get(next_url, headers=headers)
            elif next_url == "first_page":
                response = requests.get(
                    base_url,
                    headers=headers,
                    params={
                        "q": f"location:{location} created:{date}",
                        "page": 1,
                        "per_page": 100,
                        "sort": "joined",
                        "order": "desc",
                    },
                )
                last_url = response.links.get("last", {}).get("url")
                last_page_num = get_page_num(last_url)
                if last_page_num == 10:
                    log_for_date["overflow"] = True

            else:
                break

            if response.status_code == 200:
                users = response.json()["items"]
                if users:
                    total_users.extend(users)
                    log_for_date["messages"].append(f"{len(users)} total user added")
                else:
                    if next_url != "first_page":
                        log_for_date["messages"].append(f"NO USER in: {next_url}")
                next_url = response.links.get("next", {}).get("url")
            else:
                print(f"Failed to fetch repositories: {response.status_code}")
                print(response)
                print("Waiting for 60 seconds")
                time.sleep(60)

            log.extend(log_for_date)

            if response.headers.get("X-RateLimit-Remaining") == "0":
                # print("Rate limit reached. Waiting for 60 seconds")
                print(
                    f"Processed {idx + 1} out of {len(dates)} / Total users: {len(total_users)}"
                )

                # Save the list of users to a file
                with open(f"../data/users_in_{location}.json", "w") as file:
                    json.dump(total_users, file, indent=4)

                # Save the log to a file
                with open(f"../data/log_{location}.json", "w") as file:
                    json.dump(log, file, indent=4)

                time.sleep(60)

    return users

In [None]:
dates = generate_date_intervals(interval_days=30)
users = fetch_users_from_github(location, dates, base_url, headers)

### Deduplicate users

In [None]:
import json 
location = "montreal"

with open(f"../data/users_in_{location}.json") as file:
    users = json.load(file)

user_ids = [user["id"] for user in users]
print(f"Total users: {len(users)}")
print(f"Total unique users: {len(set(user_ids))}")

if len(users) != len(set(user_ids)):
    duplicated_user_ids = []
    for user_id in set(user_ids):
        if user_ids.count(user_id) > 1:
            duplicated_user_ids.append(user_id)
    print(f"Total duplicated users: {len(duplicated_user_ids)}")

    # remove duplicated users
    if len(duplicated_user_ids) > 0:
        deduplicated_users = []
        seen_user_ids = set()

        for user in users:
            # If the user's ID hasn't been seen, add the user to the deduplicated list and mark the ID as seen
            # This will keep the order of the users list
            if user["id"] not in seen_user_ids:
                deduplicated_users.append(user)
                seen_user_ids.add(user["id"])

        with open(f"../data/users_in_{location}_deduplicated222.json", "w") as file:
            json.dump(deduplicated_users, file, indent=4)

        print(f"Total unique users after deduplication: {len(deduplicated_users)}")

Total users: 4614
Total unique users: 4495
Total duplicated users: 119
Total unique users after deduplication: 4495
