In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [6]:


username = 'user'
access_token = 'token'

base_url = 'https://api.github.com'

# Number of repositories per day
repositories_per_day = 500

# Number of days to scrape  
days_to_scrape = 150

# Calculate the start and end dates for scraping
end_date = datetime.now().date()
start_date = end_date - timedelta(days=days_to_scrape)

# Initialize the repositories list
repositories = []

# Retry mechanism for API requests
retry_strategy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)

# Variables for tracking total rows and time taken
total_rows = 0
total_time = timedelta()

# Iterate over each day
for day in range(days_to_scrape):
    current_date = start_date + timedelta(days=day)
    formatted_date = current_date.strftime('%Y-%m-%d')

    # Fetch repositories using pagination
    page = 1
    start_time = datetime.now()

    while len(repositories) < repositories_per_day * (day + 1):
        # Create the API URL to fetch repositories created on the current day and specific page
        url = f'{base_url}/search/repositories?q=created:{formatted_date}&sort=stars&order=desc&per_page=100&page={page}'

        # Make the API request with retry logic
        headers = {'Authorization': f'token {access_token}'} if access_token else {}
        response = http.get(url, headers=headers)
        data = response.json()

        if 'items' in data:
            # Extract repository information from the response
            items = data['items']
            sorted_items = sorted(items, key=lambda item: item['stargazers_count'], reverse=True)

            for item in sorted_items[:repositories_per_day]:
                repository = {
                    'id': item['id'],
                    'full_name': item['full_name'],
                    'url': item['html_url'],
                    'stargazers_count': item['stargazers_count'],
                    'language': item.get('language', ''),
                    'license': item['license']['name'] if item['license'] else '',
                    'topics': item.get('topics', []),
                    'forks': item['forks'],
                    'issues_count': item['open_issues_count'],
                    'year': current_date.year,
                    'watchers_count': item['watchers_count'],  
                    'created_at': item['created_at'],  
                }

                repositories.append(repository)

        page += 1

        if 'next' not in response.links:
            break

    end_time = datetime.now()
    time_taken = end_time - start_time
    total_time += time_taken

    rows_extracted = len(repositories) - total_rows
    total_rows = len(repositories)
    print(f"Day {day + 1}: Extracted {rows_extracted} rows in {time_taken}")

# Convert the repositories list to a DataFrame
df = pd.DataFrame(repositories)

# Select the desired columns
df = df[['id', 'full_name', 'url', 'stargazers_count', 'language', 'license', 'topics',
         'forks', 'issues_count', 'year', 'watchers_count', 'created_at']]

row_count = len(df)
print(f"Number of non-duplicated rows: {row_count}")

csv_file_path = 'repositories.csv'
excel_file_path = 'repositories.xlsx'

df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
df.to_excel(excel_file_path, index=False)

print(f"Data exported to {csv_file_path} and {excel_file_path} successfully.")
print(f"Total rows extracted: {total_rows}")
print(f"Total time taken: {total_time}")


Day 1: Extracted 500 rows in 0:00:14.436937
Day 2: Extracted 500 rows in 0:00:14.318417
Day 3: Extracted 500 rows in 0:00:15.169040
Day 4: Extracted 500 rows in 0:00:14.015947
Day 5: Extracted 500 rows in 0:00:14.034839
Day 6: Extracted 500 rows in 0:00:14.872829
Day 7: Extracted 500 rows in 0:00:15.454762
Day 8: Extracted 500 rows in 0:00:15.248799
Day 9: Extracted 500 rows in 0:00:14.757725
Day 10: Extracted 500 rows in 0:00:14.009634
Day 11: Extracted 500 rows in 0:00:15.144564
Day 12: Extracted 500 rows in 0:00:14.444195
Day 13: Extracted 500 rows in 0:00:14.236996
Day 14: Extracted 500 rows in 0:00:14.456851
Day 15: Extracted 500 rows in 0:00:14.116011
Day 16: Extracted 500 rows in 0:00:17.512918
Day 17: Extracted 500 rows in 0:00:15.352903
Day 18: Extracted 500 rows in 0:00:15.769559
Day 19: Extracted 500 rows in 0:00:13.519005
Day 20: Extracted 500 rows in 0:00:14.752320
Day 21: Extracted 500 rows in 0:00:15.574189
Day 22: Extracted 500 rows in 0:00:14.529985
Day 23: Extracted 5

In [3]:
df= pd.read_csv('repositories.csv')
df.head()

Unnamed: 0,id,full_name,url,stargazers_count,language,license,topics,forks,issues_count,year,watchers_count,created_at
0,589831718,comfyanonymous/ComfyUI,https://github.com/comfyanonymous/ComfyUI,6157,Python,GNU General Public License v3.0,['stable-diffusion'],553,271,2023,6157,2023-01-17T03:15:56Z
1,590069860,Mokshit06/typewind,https://github.com/Mokshit06/typewind,2070,TypeScript,MIT License,[],24,12,2023,2070,2023-01-17T15:38:59Z
2,589860087,karpathy/ng-video-lecture,https://github.com/karpathy/ng-video-lecture,1999,Python,,[],468,15,2023,1999,2023-01-17T05:27:03Z
3,590112111,index-labs/readpilot,https://github.com/index-labs/readpilot,1081,TypeScript,MIT License,"['gpt3', 'nextjs', 'openai', 'react', 'tailwin...",67,5,2023,1081,2023-01-17T17:24:08Z
4,589830814,easychen/CookieCloud,https://github.com/easychen/CookieCloud,719,JavaScript,GNU General Public License v3.0,[],69,12,2023,719,2023-01-17T03:11:37Z


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75096 entries, 0 to 75095
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                75096 non-null  int64 
 1   full_name         75096 non-null  object
 2   url               75096 non-null  object
 3   stargazers_count  75096 non-null  int64 
 4   language          64274 non-null  object
 5   license           36970 non-null  object
 6   topics            75096 non-null  object
 7   forks             75096 non-null  int64 
 8   issues_count      75096 non-null  int64 
 9   year              75096 non-null  int64 
 10  watchers_count    75096 non-null  int64 
 11  created_at        75096 non-null  object
dtypes: int64(6), object(6)
memory usage: 6.9+ MB


In [4]:
df.columns

Index(['id', 'full_name', 'url', 'stargazers_count', 'language', 'license',
       'topics', 'forks', 'issues_count', 'year', 'watchers_count',
       'created_at'],
      dtype='object')