# Get a list of the last 1M repos from Github, load data into a warehouse, and run an exploratory data analysis 


## IMPORTING USEFUL LIBRARIES

In [1]:
import requests
import psycopg2
import json
import utils
from datetime import datetime

## CREATING DATABASE AND REQUIRED TABLES

In [2]:
## this is needed only when the database needs to be created 
try:
    fp= open("config.json")
    params  = json.load(fp)

    host = params['HOST']
    port = params['PORT']
    dbname = params['DB_NAME']
    username = params['USER']
    password = params['PASSWORD']
    print("Connecting to server")

    #establishing the connection
    conn = psycopg2.connect(
       database="postgres", user=username, password=password, host=host, port=port
    )
    conn.autocommit = True
    print("Connection to server established")
    
except:
    print("Failed to connect to server")
    
try:
    print("Creating database git repos")
    #Creating a cursor object using the cursor() method
    cursor = conn.cursor()

    #Preparing query to create a database
    sql = '''CREATE database '''+dbname;

    #Creating a database
    cursor.execute(sql)
    print("Database created successfully........")

    #Closing the connection
    conn.close()
except Exception as e:
    print(e)

Connecting to server
Connection to server established
Creating database git repos
database "gitrepos" already exists



In [3]:
## creating user table
conn = utils.connection()

#Creating a cursor object using the cursor() method
cursor = conn.cursor()

#Doping EMPLOYEE table if already exists.
cursor.execute("DROP TABLE IF EXISTS GITUSER")

#Creating table as per requirement
sql ='''CREATE TABLE GITUSER(
   USER_ID INT PRIMARY KEY,
   USER_NAME VARCHAR (20)
)'''
cursor.execute(sql)
print("Table created successfully........")
conn.commit()
#Closing the connection
conn.close()

Connecting to database
Successfully connected to database
Table created successfully........


In [4]:
## creating user table
conn = utils.connection()

#Creating a cursor object using the cursor() method
cursor = conn.cursor()

#Doping EMPLOYEE table if already exists.
cursor.execute("DROP TABLE IF EXISTS REPOS")

#Creating table as per requirement
sql ='''CREATE TABLE REPOS(
   REPO_ID INT PRIMARY KEY,
   USER_ID INT NOT NULL,
   REPONAME VARCHAR(200) NOT NULL,
   URL VARCHAR(200) NOT NULL,
   CREATED_AT TIMESTAMP,
   UPDATED_AT TIMESTAMP,
   PUSHED_AT TIMESTAMP,
   HAS_ISSUES BOOLEAN,
   HAS_DOWNLOADS BOOLEAN,
   OPEN_ISSUE_COUNT INT,
   SIZE INT,
   FORK BOOLEAN,
   FORK_COUNT INT,
   VISIBILITY CHAR(20),
   WATCHERS INT,
   LANGUAGE VARCHAR(20),
   DESCRIPTION VARCHAR(200)
)'''
cursor.execute(sql)
print("Table created successfully........")
conn.commit()
#Closing the connection
conn.close()

Connecting to database
Successfully connected to database
Table created successfully........


## FETCHING DATA FROM GIT REPOS

In [5]:
## at the time of writing this code, the number of github repos was found to be approx equal to 537843000
username = 'nerdykamil'
token = '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'

total_repos = 537843000
fetched_repos= 0


In [None]:
## fetching data of 1m repos
repo_count = total_repos
try:
    while fetched_repos < 1000000:
        res = requests.get('https://api.github.com/repositories?since='+str(repo_count), auth=(username,token))
        print("Fetched 100 records after id ",repo_count)
        if res.status_code == 200:
            for each in res.json():
                repo_name = each['full_name']
                repo_stats = requests.get('https://api.github.com/repos/'+repo_name, auth=(username,token))
                if repo_stats.status_code == 200:
                    utils.populate(repo_stats)
                    print("populated db with repo stats from ",repo_name)
                else:
                    print("Unable to execute, Request Code: ",repo_stats.status_code)

        else:
            print("Unable to execute, Request Code: ",res.status_code)
            print("Repo Count: ",repo_count)
            print("number of fetched repos: ",fetched_repos)
            break
        repo_count-=200
        fetched_repos+=len(res.json())
        print("Total fetched repos: ",fetched_repos)
except Exception as e:
    print(e)

Fetched 100 records after id  537843000
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  YSChoi123/antjump-test
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  zhangxq8692/opencv
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Sanyi81/hw2
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  bisherbh/healh-insurance-company
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  felipeLeao18/discord-pomodoro
Unable to execute, Request Code:  404
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  devhub-blue-sea-ap-so

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  zahraamirinezhad/Task-Manager
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Izytube/loan_app
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  roshanthink/AlphaZero_Gomoku
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  GargantuaX/flutter_distributor
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  radss5et/navegar
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ccakiroglu/SCC
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  anylee2021/TiledLighting11
Connecti

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  esteimanuel/learn-godot
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  RoiMeshulam/OOP_Ex1
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  joimar755/TallerMecanica.App
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  PrismarinMC/.github
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  iiccttff/build_xlua_with_libs
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Shizaaaaa/CPP_is_favorite_language
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  BrendanShelton/w

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  otianelson/testing-repo
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  MaceMclain/lp-token-staking
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  faisalmahmudsojib/UMS
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  MajkiIT/Malware-IOCs
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  kebab03/spring
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Outsiders-AW-SI730/Web-Services-Tasty-Point
Connecting to database
Successfully connected to database
Records inserted........
populated db with

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Pamella30/projeto-barbearia
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  VBorisof/simple-debugger
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  01WarpDrive/marktext_try
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  christiantorres98/2022SEM2S3-flutter-1
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  cuong1310/aoquan
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  iamzehan/Mastering-OpenCV-with-python
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ja

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Myfithub12/dasveenerschnitzel
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  shailiawasthi/ML-ExerciseProject
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  VoThiKimTrang06101997/TrelloClone
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  jefersonPMatos/vivahq
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  AKuchevskaya/frontend-robopractice-test-task
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  TahiruMustapha/finalProject
Connecting to database
Successfully connected to database
Reco

Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Random-World-Studio/Metaverse
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  MarianaVama/Tesis
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  DreVinciCode/PythonWithUnity
Unable to execute, Request Code:  404
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  YoussefGamall/clothes
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  hweij/webxr
Unable to execute, Request Code:  404
Unable to execute, Request Code:  404
Unable to execute, Request Code:  404
Connecting to database
Successful

Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  arsh360/Cyclic-Sort
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  stcchain/agent-cli
Connecting to database
Successfully connected to database
Unable to insert record 
value too long for type character varying(200)

populated db with repo stats from  DDX-BiswaRanjan/Voice-Assistant
Connecting to database
Successfully connected to database
Unable to insert record 
duplicate key value violates unique constraint "repos_pkey"
DETAIL:  Key (repo_id)=(537842924) already exists.

populated db with repo stats from  ziaualhassain/object_detection_motion_tracker
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  bmverse/Bm-verseFinal
Unable to execute, Request Code:  404
Connecting to database
Successfully

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  AngelinaAlorzuke/Autorization
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  OsheeSan/git
Total fetched repos:  499
Fetched 100 records after id  537842000
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  RicardoD92/Prep-Course
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  s1lencef/EVM
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  zerlyns/project-07-CRUD
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  FabioLoVetro/Mars-Rover-Kata
Connecting to database
Successfully connected to databas

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Rojhal/Roj
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  MTufekcioglu/Computational-Bio
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  khersi17/khersi17
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  NooodleFine/sw
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  danieldev23/huy
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  locus-amoenus/locus-amoenus.github.io
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  pm-pl/ItemSender
Unable to execute, Request C

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  haroldCoder/windowPY
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  BirdGurhl/Auction
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  dflomo1/my_DanieleFlomo_new_remote_repo
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  17-seventeen-13/utsuroi-t
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  sudarshan20222/final-year-project
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  yunjie-li/nas-tools
Unable to execute, Request Code:  404
Unable to execute, Request Code:  404
Unable to execute, Request Code:  404
Connecting to datab

Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  marcmaralou/leetcode
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ashutoshtak/afk-bot-for-aternos
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  q1ly/q1ly.github.io
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  anylee2021/glTFSample
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Aaryash-Shakya/Calculator-BW
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  630511830/vyshu1
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats f