In [1]:
import requests
import re
import json
from datetime import datetime
from typing import List, Dict, Optional

In [2]:
class GitHubDataFetcher:
    """Fetches and filters data files from GitHub repository."""
    
    def __init__(self, repo_owner: str = "owlmaps", repo_name: str = "map-data", data_path: str = "data"):
        """
        Initialize the fetcher.
        
        Args:
            repo_owner: GitHub repository owner
            repo_name: GitHub repository name
            data_path: Path to data directory in the repository
        """
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        self.data_path = data_path
        self.api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{data_path}"
        self.raw_base_url = f"https://raw.githubusercontent.com/{repo_owner}/{repo_name}/master/{data_path}/"
        self.date_pattern = re.compile(r'^(\d{8})\.json$')
    
    def check_accessibility(self) -> bool:
        """
        Check if the GitHub API is accessible.
        
        Returns:
            bool: True if accessible, False otherwise
        """
        print("\n" + "=" * 80)
        print("STEP 1: Checking GitHub API accessibility...")
        print("=" * 80)
        
        try:
            response = requests.get(self.api_url, timeout=10)
            if response.status_code == 200:
                print(f"GitHub API is accessible (Status: {response.status_code})")
                print(f"  URL: {self.api_url}")
                return True
            else:
                print(f"GitHub API returned status code: {response.status_code}")
                print(f"  Response: {response.text[:200]}")
                return False
        except requests.exceptions.ConnectionError as e:
            print("Connection error: Unable to reach GitHub API")
            print(f"  Error: {str(e)[:200]}")
            return False
        except requests.exceptions.Timeout:
            print("equest timed out after 10 seconds")
            return False
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            return False
    
    def fetch_directory_contents(self) -> Optional[List[Dict]]:
        """
        Fetch directory contents from GitHub API.
        
        Returns:
            List of file information dictionaries, or None if failed
        """
        print("\n" + "=" * 80)
        print("STEP 2: Fetching directory contents...")
        print("=" * 80)
        
        try:
            response = requests.get(self.api_url, timeout=10)
            response.raise_for_status()
            
            files_data = response.json()
            print("Successfully fetched directory contents")
            print(f"  Total items found: {len(files_data)}")
            
            return files_data
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch directory: {e}")
            return None
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON response: {e}")
            return None
    
    def filter_and_parse_files(self, files_data: List[Dict]) -> List[Dict]:
        """
        Filter JSON files with date pattern and parse their dates.
        
        Args:
            files_data: List of file information from GitHub API
            
        Returns:
            List of parsed file information with dates
        """
        print("\n" + "=" * 80)
        print("STEP 3: Filtering and parsing JSON files...")
        print("=" * 80)
        
        json_files = []
        skipped_files = []
        
        for item in files_data:
            if item.get('type') == 'file':
                filename = item.get('name', '')
                match = self.date_pattern.match(filename)
                
                if match:
                    date_str = match.group(1)
                    try:
                        # Parse date to ensure it's valid
                        file_date = datetime.strptime(date_str, '%Y%m%d')
                        json_files.append({
                            'filename': filename,
                            'date': file_date,
                            'date_str': date_str,
                            'download_url': item.get('download_url'),
                            'size': item.get('size', 0),
                            'sha': item.get('sha', '')
                        })
                    except ValueError:
                        skipped_files.append(filename)
        
        print(f"Found {len(json_files)} JSON files with valid date names")
        if skipped_files:
            print(f"Skipped {len(skipped_files)} files with invalid dates:")
            for fname in skipped_files[:5]:
                print(f"    - {fname}")
        
        return json_files
    
    def sort_by_date(self, json_files: List[Dict], descending: bool = True) -> List[Dict]:
        """
        Sort files by date.
        
        Args:
            json_files: List of file information with dates
            descending: If True, sort newest first; if False, oldest first
            
        Returns:
            Sorted list of files
        """
        print("\n" + "=" * 80)
        print("STEP 4: Sorting files by date...")
        print("=" * 80)
        
        sorted_files = sorted(json_files, key=lambda x: x['date'], reverse=descending)
        
        order = "newest to oldest" if descending else "oldest to newest"
        print(f"Sorted {len(sorted_files)} files ({order})")
        
        return sorted_files
    
    def get_recent_files(self, sorted_files: List[Dict], n: int = 5) -> List[Dict]:
        """
        Get the N most recent files.
        
        Args:
            sorted_files: List of sorted file information
            n: Number of recent files to retrieve
            
        Returns:
            List of N most recent files
        """
        print("\n" + "=" * 80)
        print(f"STEP 5: Selecting {n} most recent files...")
        print("=" * 80)
        
        recent_files = sorted_files[:n]
        
        print(f"✓ Selected {len(recent_files)} files:")
        print(f"\n{'#':<4} {'Date':<12} {'Filename':<20} {'Size (KB)':<12}")
        print("-" * 80)
        
        for idx, file_info in enumerate(recent_files, 1):
            size_kb = file_info['size'] / 1024
            print(f"{idx:<4} {file_info['date'].strftime('%Y-%m-%d'):<12} {file_info['filename']:<20} {size_kb:>10.1f}")
        
        return recent_files
    
    def display_summary(self, all_files: List[Dict]):
        """Display summary statistics."""
        print("\n" + "=" * 80)
        print("STEP 6: Summary Statistics")
        print("=" * 80)
        
        if not all_files:
            print("No files to summarize")
            return
        
        oldest = all_files[-1]
        newest = all_files[0]
        total_size_mb = sum(f['size'] for f in all_files) / (1024 * 1024)
        avg_size_mb = total_size_mb / len(all_files)
        
        print(f"Total JSON files: {len(all_files)}")
        print(f"Date range: {oldest['date'].strftime('%Y-%m-%d')} to {newest['date'].strftime('%Y-%m-%d')}")
        print(f"Days covered: {(newest['date'] - oldest['date']).days}")
        print(f"Total size: {total_size_mb:.2f} MB")
        print(f"Average file size: {avg_size_mb:.2f} MB")
    
    def fetch_and_filter(self, num_recent: int = 5) -> Optional[List[Dict]]:
        """
        Main method to fetch and filter recent files.
        
        Args:
            num_recent: Number of most recent files to return
            
        Returns:
            List of most recent files, or None if failed
        """
        print("\n" + "=" * 80)
        print("GITHUB DATA FETCHER - owlmaps/map-data")
        print("=" * 80)
        print(f"Target: {self.repo_owner}/{self.repo_name}/{self.data_path}")
        print(f"Fetching {num_recent} most recent files...")
        
        # Step 1: Check accessibility
        if not self.check_accessibility():
            return None
        
        # Step 2: Fetch directory contents
        files_data = self.fetch_directory_contents()
        if files_data is None:
            return None
        
        # Step 3: Filter and parse files
        json_files = self.filter_and_parse_files(files_data)
        if not json_files:
            print("\nNo valid JSON files found!")
            return None
        
        # Step 4: Sort by date
        sorted_files = self.sort_by_date(json_files, descending=True)
        
        # Step 5: Get recent files
        recent_files = self.get_recent_files(sorted_files, num_recent)
        
        # Step 6: Display summary
        self.display_summary(sorted_files)
        
        print("\n" + "=" * 80)
        print("FETCH COMPLETE - Files ready for processing")
        print("=" * 80)
        
        return recent_files

In [3]:
def download_file(file_info: Dict, save_path: str) -> bool:
    """
    Download a file from GitHub.
    
    Args:
        file_info: File information dictionary with download_url
        save_path: Path where to save the file
        
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        response = requests.get(file_info['download_url'], timeout=30)
        response.raise_for_status()
        
        with open(save_path, 'w') as f:
            f.write(response.text)
        
        print(f"Downloaded: {file_info['filename']} -> {save_path}")
        return True
    except Exception as e:
        print(f"Failed to download {file_info['filename']}: {e}")
        return False

In [4]:
def main():
    """Main execution function."""
    # Initialize fetcher
    fetcher = GitHubDataFetcher()
    
    # Fetch and filter recent files
    recent_files = fetcher.fetch_and_filter(num_recent=5)
    
    if recent_files:
        print("\n" + "=" * 80)
        print("RECENT FILES STORED IN VARIABLE")
        print("=" * 80)
        print("\nYou can now process these files:")
        print("Variable: recent_files")
        print(f"Type: list of {len(recent_files)} dictionaries")
        print("\nEach dictionary contains:")
        print("  - filename: str")
        print("  - date: datetime object")
        print("  - date_str: str (YYYYMMDD)")
        print("  - download_url: str")
        print("  - size: int (bytes)")
        print("  - sha: str (git hash)")
        
        # Example: Print download URLs
        print("\n" + "=" * 80)
        print("DOWNLOAD URLs for recent files:")
        print("=" * 80)
        for file_info in recent_files:
            print(f"{file_info['filename']}: {file_info['download_url']}")
        
        return recent_files
    else:
        print("\nFailed to fetch recent files")
        return None

In [5]:
if __name__ == "__main__":
    recent_files = main()



GITHUB DATA FETCHER - owlmaps/map-data
Target: owlmaps/map-data/data
Fetching 5 most recent files...

STEP 1: Checking GitHub API accessibility...
GitHub API is accessible (Status: 200)
  URL: https://api.github.com/repos/owlmaps/map-data/contents/data

STEP 2: Fetching directory contents...
Successfully fetched directory contents
  Total items found: 1000

STEP 3: Filtering and parsing JSON files...
Found 1000 JSON files with valid date names

STEP 4: Sorting files by date...
Sorted 1000 files (newest to oldest)

STEP 5: Selecting 5 most recent files...
✓ Selected 5 files:

#    Date         Filename             Size (KB)   
--------------------------------------------------------------------------------
1    2025-09-01   20250901.json             132.3
2    2025-08-31   20250831.json             126.1
3    2025-08-30   20250830.json             132.7
4    2025-08-29   20250829.json             126.3
5    2025-08-28   20250828.json             129.9

STEP 6: Summary Statistics
Total 