In [None]:
import os
from datetime import datetime, timedelta

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [2]:


# Scrape the most-starred GitHub repositories created on each of the last
# `days_to_scrape` days through the repository Search API, then export the
# collected records to CSV and Excel.

username = 'username'  # not used anywhere in this cell

# Read the token from the environment so it is never stored in the notebook.
# An empty value falls back to unauthenticated requests (no Authorization header).
access_token = os.environ.get('GITHUB_TOKEN', '')

base_url = 'https://api.github.com'

# Number of repositories per day
repositories_per_day = 500

# Number of days to scrape
days_to_scrape = 60

# Number of results requested per page
results_per_page = 100

# Seconds to wait for a response before giving up on a request
request_timeout = 30

# Columns written to the export files, in output order
export_columns = ['id', 'full_name', 'url', 'language', 'license', 'topics',
                  'owner_type', 'description', 'stars_count', 'forks',
                  'issues_count', 'year', 'created_at', 'updated_at']

# Calculate the start and end dates for scraping
end_date = datetime.now().date()
start_date = end_date - timedelta(days=days_to_scrape)

# Initialize the repositories list
repositories = []

# Retry mechanism for API requests
retry_strategy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)

# Request pieces that do not change between iterations
search_url = f'{base_url}/search/repositories'
headers = {'Authorization': f'token {access_token}'} if access_token else {}

# Variables for tracking total rows and time taken
total_rows = 0
total_time = timedelta()

# Iterate over each day
for day in range(days_to_scrape):
    current_date = start_date + timedelta(days=day)
    formatted_date = current_date.strftime('%Y-%m-%d')

    page = 1
    rows_extracted = 0  # per-day counter, so a short day never inflates a later one
    start_time = datetime.now()

    # Fetch repositories using pagination until this day's quota is filled
    while rows_extracted < repositories_per_day:
        params = {
            'q': f'created:{formatted_date}',
            'sort': 'stars',
            'order': 'desc',
            'per_page': results_per_page,
            'page': page,
        }
        response = http.get(search_url, headers=headers, params=params,
                            timeout=request_timeout)

        # Report failures (e.g. rate limiting) instead of silently skipping them.
        # Rows collected so far are kept; only the rest of this day is abandoned.
        if response.status_code != 200:
            print(f"Day {day + 1}: page {page} failed with HTTP "
                  f"{response.status_code}; skipping the rest of this day")
            break

        items = response.json().get('items', [])
        sorted_items = sorted(items, key=lambda item: item['stargazers_count'], reverse=True)

        # Never take more than what is still missing from this day's quota
        remaining = repositories_per_day - rows_extracted
        for item in sorted_items[:remaining]:
            repositories.append({
                'id': item['id'],
                'full_name': item['full_name'],
                'url': item['html_url'],
                # `or ''` also covers a key that is present but null
                'language': item.get('language') or '',
                'license': item['license']['name'] if item['license'] else '',
                'topics': item.get('topics', []),
                'owner_type': item['owner']['type'],
                'description': item['description'],
                'stars_count': item['stargazers_count'],
                'forks': item['forks'],
                'issues_count': item['open_issues_count'],
                'year': current_date.year,
                'created_at': item['created_at'],
                'updated_at': item['updated_at'],
            })
            rows_extracted += 1

        # Stop when the API advertises no further page
        if 'next' not in response.links:
            break
        page += 1

    time_taken = datetime.now() - start_time
    total_time += time_taken
    total_rows += rows_extracted
    print(f"Day {day + 1}: Extracted {rows_extracted} rows in {time_taken}")

# Convert the repositories list to a DataFrame. Passing `columns=` fixes the
# column order and keeps this working when nothing was scraped. Rows are
# de-duplicated on the repository id (the same repository may presumably be
# returned on two pages of one query) so the count printed below is accurate.
df = (
    pd.DataFrame(repositories, columns=export_columns)
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)

row_count = len(df)
print(f"Number of non-duplicated rows: {row_count}")

csv_file_path = 'repositories.csv'
excel_file_path = 'repositories.xlsx'

df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
df.to_excel(excel_file_path, index=False)

print(f"Data exported to {csv_file_path} and {excel_file_path} successfully.")
print(f"Total rows extracted: {total_rows}")
print(f"Total time taken: {total_time}")


Day 1: Extracted 500 rows in 0:00:14.105064
Day 2: Extracted 500 rows in 0:00:13.397390
Day 3: Extracted 500 rows in 0:00:13.933405
Day 4: Extracted 500 rows in 0:00:12.632396
Day 5: Extracted 500 rows in 0:00:14.297353
Day 6: Extracted 500 rows in 0:00:13.110885
Day 7: Extracted 500 rows in 0:00:12.589807
Day 8: Extracted 500 rows in 0:00:12.898556
Day 9: Extracted 500 rows in 0:00:13.432094
Day 10: Extracted 500 rows in 0:00:13.386045
Day 11: Extracted 500 rows in 0:00:13.330287
Day 12: Extracted 500 rows in 0:00:12.997119
Day 13: Extracted 500 rows in 0:00:14.130611
Day 14: Extracted 500 rows in 0:00:13.523710
Day 15: Extracted 500 rows in 0:00:13.015229
Day 16: Extracted 500 rows in 0:00:15.654647
Day 17: Extracted 500 rows in 0:00:13.247616
Day 18: Extracted 500 rows in 0:00:14.204691
Day 19: Extracted 500 rows in 0:00:13.931131
Day 20: Extracted 500 rows in 0:00:13.639479
Day 21: Extracted 500 rows in 0:00:13.086155
Day 22: Extracted 500 rows in 0:00:15.970818
Day 23: Extracted 5

In [12]:
# Build the export table from the scraped `repositories` records and write it
# to CSV and Excel.

# Columns written to the export files, in output order
export_columns = ['id', 'full_name', 'url', 'language', 'license', 'topics',
                  'owner_type', 'description', 'stars_count', 'forks',
                  'issues_count', 'year', 'created_at', 'updated_at']

# Passing `columns=` selects and orders the fields and also works when
# `repositories` is empty (indexing an empty frame by name would raise KeyError).
# Rows are de-duplicated on the repository id so the count printed below really
# is the number of non-duplicated rows.
df = (
    pd.DataFrame(repositories, columns=export_columns)
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)

row_count = len(df)
print(f"Number of non-duplicated rows: {row_count}")

csv_file_path = 'repositories.csv'
excel_file_path = 'repositories.xlsx'

# The CSV is written with the 'utf-8-sig' encoding; neither file includes the index
df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
df.to_excel(excel_file_path, index=False)

Number of non-duplicated rows: 75096


In [5]:
# Load the exported CSV back into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='repositories.csv')
df.head(n=5)

Unnamed: 0,id,full_name,url,language,license,topics,owner_type,description,stars_count,forks,issues_count,year,created_at,updated_at
0,592489166,openai/evals,https://github.com/openai/evals,Python,MIT License,[],Organization,Evals is a framework for evaluating LLMs and L...,10751,2061,144,2023,2023-01-23T20:51:04Z,2023-06-22T06:58:56Z
1,592351704,3cqs-coder/SymBot,https://github.com/3cqs-coder/SymBot,JavaScript,Other,"['cryptocurrencies', 'cryptocurrency', 'dca', ...",User,"SymBot is a user friendly, self-hosted and aut...",810,80,1,2023,2023-01-23T14:49:56Z,2023-06-22T07:39:55Z
2,592373929,sidpalas/devops-directive-docker-course,https://github.com/sidpalas/devops-directive-d...,Roff,,[],User,Companion repo for complete Docker course,665,169,2,2023,2023-01-23T15:42:41Z,2023-06-20T14:47:25Z
3,592283952,daeuniverse/dae,https://github.com/daeuniverse/dae,Go,GNU Affero General Public License v3.0,"['dae', 'ebpf', 'proxy', 'transparent-proxy']",Organization,A Linux high-performance transparent proxy sol...,609,36,19,2023,2023-01-23T11:53:16Z,2023-06-19T16:22:35Z
4,592129720,Octoberfest7/Inline-Execute-PE,https://github.com/Octoberfest7/Inline-Execute-PE,C,Apache License 2.0,[],User,Execute unmanaged Windows executables in Cobal...,462,81,0,2023,2023-01-23T01:29:01Z,2023-06-22T03:56:31Z


In [5]:
# Print a summary of `df`: column names, non-null counts, dtypes and memory use.
# NOTE(review): the saved output below lists `stargazers_count` and
# `watchers_count`, which are not among the columns the export cell writes —
# presumably stale output from an earlier run; confirm with Restart & Run All.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75096 entries, 0 to 75095
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                75096 non-null  int64 
 1   full_name         75096 non-null  object
 2   url               75096 non-null  object
 3   stargazers_count  75096 non-null  int64 
 4   language          64274 non-null  object
 5   license           36970 non-null  object
 6   topics            75096 non-null  object
 7   forks             75096 non-null  int64 
 8   issues_count      75096 non-null  int64 
 9   year              75096 non-null  int64 
 10  watchers_count    75096 non-null  int64 
 11  created_at        75096 non-null  object
dtypes: int64(6), object(6)
memory usage: 6.9+ MB


In [4]:
# Display the column labels of the current `df`.
df.columns

Index(['id', 'full_name', 'url', 'stargazers_count', 'language', 'license',
       'topics', 'forks', 'issues_count', 'year', 'watchers_count',
       'created_at'],
      dtype='object')