In [None]:
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [None]:
# --- Credentials ------------------------------------------------------------
# Read credentials from the environment so no secret is ever saved inside the
# notebook. When GITHUB_TOKEN is unset the token is an empty string, which the
# request code treats as "send no Authorization header".
username = os.environ.get('GITHUB_USERNAME', 'username')
access_token = os.environ.get('GITHUB_TOKEN', '')

base_url = 'https://api.github.com'

# --- Scrape parameters ------------------------------------------------------
# Number of repositories per day
repositories_per_day = 500

# Number of days to scrape
days_to_scrape = 60

# Scrape window: the `days_to_scrape` days that end yesterday. The day loop
# walks start_date, start_date + 1, ... so today itself is not included.
end_date = datetime.now().date()
start_date = end_date - timedelta(days=days_to_scrape)

# Accumulator for every scraped repository record. Re-created on each run of
# this cell so that re-running does not append to a previous run's results.
repositories = []

# --- HTTP session -----------------------------------------------------------
# Retry idempotent GETs up to 3 times, with exponential backoff, on throttling
# (429) and transient server errors (5xx).
retry_strategy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)

# --- Progress bookkeeping ---------------------------------------------------
# Running totals updated after each scraped day.
total_rows = 0
total_time = timedelta()

# Build the auth header once; an empty token means unauthenticated requests.
headers = {'Authorization': f'token {access_token}'} if access_token else {}

# Safety limits for the request loop.
request_timeout = 30       # seconds before a stalled request is abandoned
max_rate_limit_waits = 5   # rate-limit pauses allowed per day before giving up


def _rate_limit_wait_seconds(response):
    """Return how many seconds to pause before retrying a throttled response.

    Prefers the ``Retry-After`` header. Otherwise, when
    ``X-RateLimit-Remaining`` is ``0``, waits until the epoch second given in
    ``X-RateLimit-Reset`` (plus one second of slack). Returns ``None`` when the
    response carries neither hint, i.e. it is not a recognisable rate limit.
    """
    retry_after = response.headers.get('Retry-After', '')
    if retry_after.isdigit():
        return int(retry_after)
    reset_at = response.headers.get('X-RateLimit-Reset', '')
    if response.headers.get('X-RateLimit-Remaining') == '0' and reset_at.isdigit():
        return max(int(reset_at) - int(time.time()), 0) + 1
    return None


# Iterate over each day
for day in range(days_to_scrape):
    current_date = start_date + timedelta(days=day)
    formatted_date = current_date.strftime('%Y-%m-%d')

    # Fetch repositories using pagination
    page = 1
    rate_limit_waits = 0
    # Collect this day's rows separately so the quota is enforced per day; a
    # cumulative check would let a short day inflate the following days.
    day_repositories = []
    start_time = datetime.now()

    while len(day_repositories) < repositories_per_day:
        # Repositories created on the current day, most-starred first.
        response = http.get(
            f'{base_url}/search/repositories',
            params={
                'q': f'created:{formatted_date}',
                'sort': 'stars',
                'order': 'desc',
                'per_page': 100,
                'page': page,
            },
            headers=headers,
            timeout=request_timeout,
        )

        # Throttled: pause as instructed by the response headers, then retry
        # the same page (bounded, so a persistent limit cannot loop forever).
        if response.status_code in (403, 429) and rate_limit_waits < max_rate_limit_waits:
            wait_seconds = _rate_limit_wait_seconds(response)
            if wait_seconds is not None:
                rate_limit_waits += 1
                print(f"Day {day + 1}: rate limited, waiting {wait_seconds}s before retrying page {page}")
                time.sleep(wait_seconds)
                continue

        # Any other failure: report it instead of silently dropping the day,
        # keep what was already collected and move on to the next day.
        if not response.ok:
            print(f"Day {day + 1} ({formatted_date}): page {page} failed with HTTP "
                  f"{response.status_code}; skipping the rest of this day")
            break

        items = response.json().get('items', [])
        if not items:
            break

        sorted_items = sorted(items, key=lambda item: item['stargazers_count'], reverse=True)

        # Never take more than what is still missing from today's quota.
        slots_left = repositories_per_day - len(day_repositories)
        for item in sorted_items[:slots_left]:
            day_repositories.append({
                'id': item['id'],
                'full_name': item['full_name'],
                'url': item['html_url'],
                # The key can be present with a null value, so `.get(key, '')`
                # alone would still yield None.
                'language': item.get('language') or '',
                'license': item['license']['name'] if item['license'] else '',
                'topics': item.get('topics', []),
                'owner_type': item['owner']['type'],
                'description': item['description'],
                'stars_count': item['stargazers_count'],
                'forks': item['forks'],
                'issues_count': item['open_issues_count'],
                'year': current_date.year,
                'created_at': item['created_at'],
                'updated_at': item['updated_at'],
            })

        # The Link header advertises a "next" page only while more results exist.
        if 'next' not in response.links:
            break
        page += 1

    repositories.extend(day_repositories)

    end_time = datetime.now()
    time_taken = end_time - start_time
    total_time += time_taken

    rows_extracted = len(repositories) - total_rows
    total_rows = len(repositories)
    print(f"Day {day + 1}: Extracted {rows_extracted} rows in {time_taken}")

# Columns to export, in output order. Passing them to the constructor also
# guarantees they exist when nothing was scraped, so an empty run produces an
# empty export instead of a KeyError.
export_columns = ['id', 'full_name', 'url', 'language', 'license', 'topics',
                  'owner_type', 'description', 'stars_count', 'forks', 'issues_count',
                  'year', 'created_at', 'updated_at']

# Convert the repositories list to a DataFrame
df = pd.DataFrame(repositories, columns=export_columns)

# Rows are appended page by page with no uniqueness check, so drop repeated
# repository ids; this makes the "non-duplicated" count below truthful.
df = df.drop_duplicates(subset='id').reset_index(drop=True)

row_count = len(df)
print(f"Number of non-duplicated rows: {row_count}")

csv_file_path = 'repositories.csv'
excel_file_path = 'repositories.xlsx'

# utf-8-sig writes a BOM at the start of the CSV file.
df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
df.to_excel(excel_file_path, index=False)

print(f"Data exported to {csv_file_path} and {excel_file_path} successfully.")
print(f"Total rows extracted: {total_rows}")
print(f"Total time taken: {total_time}")

In [None]:
# Re-export the scraped records without re-running the scrape itself.

# Columns to export, in output order. Passing them to the constructor also
# guarantees they exist when `repositories` is empty, so the export does not
# raise a KeyError.
export_columns = ['id', 'full_name', 'url', 'language', 'license', 'topics',
                  'owner_type', 'description', 'stars_count', 'forks', 'issues_count',
                  'year', 'created_at', 'updated_at']

# Convert the repositories list to a DataFrame
df = pd.DataFrame(repositories, columns=export_columns)

# Drop repeated repository ids so the "non-duplicated" count below is truthful.
df = df.drop_duplicates(subset='id').reset_index(drop=True)

row_count = len(df)
print(f"Number of non-duplicated rows: {row_count}")

csv_file_path = 'repositories.csv'
excel_file_path = 'repositories.xlsx'

# utf-8-sig writes a BOM at the start of the CSV file.
df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
df.to_excel(excel_file_path, index=False)

In [None]:
# Reload the dataset from the CSV export and preview its first rows.
# Note: after the CSV round-trip, list-valued `topics` cells are plain strings.
df = pd.read_csv('repositories.csv')
df.head()

In [None]:
# Summarise the reloaded frame: column dtypes, non-null counts and memory use.
df.info()

In [None]:
# List the column labels of the reloaded frame.
df.columns