# Get a list of the last 1M repos from Github, load data into a warehouse, and run an exploratory data analysis 


## IMPORTING USEFUL LIBRARIES

In [1]:
import requests
import psycopg2
import json
import utils
from datetime import datetime
import utils
import pandas as pd

## CREATING DATABASE AND REQUIRED TABLES

In [2]:
## this is needed only when the database needs to be created 
try:
    fp= open("config.json")
    params  = json.load(fp)

    host = params['HOST']
    port = params['PORT']
    dbname = params['DB_NAME']
    username = params['USER']
    password = params['PASSWORD']
    print("Connecting to server")

    #establishing the connection
    conn = psycopg2.connect(
       database="postgres", user=username, password=password, host=host, port=port
    )
    conn.autocommit = True
    print("Connection to server established")
    
except:
    print("Failed to connect to server")
    
try:
    print("Creating database git repos")
    #Creating a cursor object using the cursor() method
    cursor = conn.cursor()

    #Preparing query to create a database
    sql = '''CREATE database '''+dbname;

    #Creating a database
    cursor.execute(sql)
    print("Database created successfully........")

    #Closing the connection
    conn.close()
except Exception as e:
    print(e)

Connecting to server
Connection to server established
Creating database git repos
database "gitrepos" already exists



In [3]:
## creating user table
conn = utils.connection()

#Creating a cursor object using the cursor() method
cursor = conn.cursor()

#Doping EMPLOYEE table if already exists.
cursor.execute("DROP TABLE IF EXISTS GITUSER")

#Creating table as per requirement
sql ='''CREATE TABLE GITUSER(
   USER_ID INT PRIMARY KEY,
   USER_NAME VARCHAR (20)
)'''
cursor.execute(sql)
print("Table created successfully........")
conn.commit()
#Closing the connection
conn.close()

Connecting to database
Successfully connected to database
Table created successfully........


In [4]:
## creating repos table
conn = utils.connection()

#Creating a cursor object using the cursor() method
cursor = conn.cursor()

#Doping EMPLOYEE table if already exists.
cursor.execute("DROP TABLE IF EXISTS REPOS")

#Creating table as per requirement
sql ='''CREATE TABLE REPOS(
   REPO_ID INT PRIMARY KEY,
   USER_ID INT NOT NULL,
   REPONAME VARCHAR(200) NOT NULL,
   URL VARCHAR(200) NOT NULL,
   CREATED_AT TIMESTAMP,
   UPDATED_AT TIMESTAMP,
   PUSHED_AT TIMESTAMP,
   HAS_ISSUES BOOLEAN,
   HAS_DOWNLOADS BOOLEAN,
   OPEN_ISSUE_COUNT INT,
   SIZE INT,
   FORK BOOLEAN,
   FORK_COUNT INT,
   VISIBILITY CHAR(20),
   WATCHERS INT,
   LANGUAGE VARCHAR(20),
   DESCRIPTION VARCHAR(200)
)'''
cursor.execute(sql)
print("Table created successfully........")
conn.commit()
#Closing the connection
conn.close()

Connecting to database
Successfully connected to database
Table created successfully........


## FETCHING DATA FROM GIT REPOS

In [6]:
## at the time of writing this code, the number of github repos was found to be approx equal to 537843000
username = 'nerdykamil'
token = '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'

total_repos = 537820000
fetched_repos= 0


In [7]:
## fetching data of 1m repos
repo_count = total_repos
try:
    while fetched_repos < 1000000:
        res = requests.get('https://api.github.com/repositories?since='+str(repo_count), auth=(username,token))
        print("Fetched 100 records after id ",repo_count)
        if res.status_code == 200:
            for each in res.json():
                repo_name = each['full_name']
                repo_stats = requests.get('https://api.github.com/repos/'+repo_name, auth=(username,token))
                if repo_stats.status_code == 200:
                    utils.populate(repo_stats)
                    print("populated db with repo stats from ",repo_name)
                else:
                    print("Unable to execute, Request Code: ",repo_stats.status_code)

        else:
            print("Unable to execute, Request Code: ",res.status_code)
            print("Repo Count: ",repo_count)
            print("number of fetched repos: ",fetched_repos)
            break
        repo_count-=200
        fetched_repos+=len(res.json())
        print("Total fetched repos: ",fetched_repos)
except Exception as e:
    print(e)

Fetched 100 records after id  537820000
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Lesley-w19/AirBnB_clone
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  NicolasBugari/PreEntrega-2-Bugari
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  adiket-dev/Kirshanal
Unable to execute, Request Code:  404
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  netson99/powerdevil
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  sesunda/cicd-pipeline-train-schedule-dockerdeploy
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records ins

Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  swws16/repo-exercise
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Larrygold17/whatsapp-bot-md
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  mkosi/redmine_issue_templates
Unable to execute, Request Code:  404
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  rolmos14/ReadQualityControl
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  kniranjan0/AkshayaAnnadurai
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  alexiusbond/gropr_22
Connecting to database

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Oluwapelumi14/Techrity-calculator
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  AlexFeiFuni/PaddleHub
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  GitMoreDeMore/Packer-vSphere-Jenkins
Connecting to database
Successfully connected to database
Unable to insert record 
syntax error at or near "s"
LINE 2: ...PURPOSES!!!! - Allows you to troll computer, that's running ...
                                                             ^

populated db with repo stats from  Olafcio1/FastCommands
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  nehamahey/Personal-assistant
Unable to execute, Request Code:  404
Connecting to database


Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  NihalManiyar/Quiz-Game
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  magic-akshay/first-repo-from-terraform
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  CibelleSena/on19-tet-s6-api-I
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  CaioDetore/nlw-esports
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Asoaresdev/NLW2022_Ignite
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  JibsonJoy/FakeNews_classificstion
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from 

Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Sanggoe/udemy-react
Connecting to database
Successfully connected to database
Unable to insert record 
duplicate key value violates unique constraint "repos_pkey"
DETAIL:  Key (repo_id)=(537837145) already exists.

populated db with repo stats from  ValerkaMalinka1998/REVOOO
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  U1song/C_2022
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Damus765/harbour-meteoswiss
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ROHIT922PAL/mistore-clone.github.io
Connecting to database
Successfully connected to database
Records inserted........
populated db wi

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  samerhassan95/weather-app
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  toxicsphynx/jkorasite
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  devhub-blue-sea-ap-northeast-1/org-public-empty-csc-repo-ap-northeast-1
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  HeyIamMarasiniAmrit/Student-Management-System.c
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  C-SIN-IN2028-001-2213/ej-base
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  codestar-github-bot-ap-northeast-1/user-public-seeded-csc-repo-ap-northeast-1
Connecting to da

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  xpsun68/PyramidStore
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Sai-Thet-Naung-Oo/Easy-POS---Point-of-Sale
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  fknol309/ic210_daily
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Swathikrishnatu/week15-activity-on-titanic-data
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ogodeverest/password-generator
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  nehamahey/nehamahey
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo st

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  VostanieKotov/DE101-mod2-SQL
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  yayaa2140/chang
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  anotherAsad/Miscellaneous
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Mellurboo/imanidiot.online
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  sharifhsn/hexyl
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  17SenW/gitskills
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  1010as/new_work_c-
Connecting to database
S

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  njgccx/Yunzai-Bot
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  VeronikaG97/the-git-rocks
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  pastaboy2131/hTx-Report-Bot
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  RodrigoSoca55/tienda-partes-auto-soca
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ferozahd/queen
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  hostfilter/hostfilter.github.io
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  daicarreno11/PreE

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  tcut-sci-tech/aigirisai-reserve
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  kiet-kun/DoAnPhanMemHuongDoiTuong
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  4lmeida/devhello
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  CmderQ/learning_tools
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  AvinashChowdar/AvinashChowdar
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Luis2088/diveIntoDeepLearning
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  dts-komin

Connecting to database
Successfully connected to database
Unable to insert record 
duplicate key value violates unique constraint "repos_pkey"
DETAIL:  Key (repo_id)=(537819214) already exists.

populated db with repo stats from  kosohae/KOAS
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  rakesh291/webcode-01
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Thirunthaiyn/upload-file-with-spring-boot
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  yrnmsk/pinoy-made
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  selv0/Microsoft---Beginners-problems
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  vusal5555/a

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  abitsalihu/quizafriend
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  pynix890/git-web-111-
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  belalkhandev/app-firebase-push-notification
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ramaherdianto/quran-app
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  RajSuriyan/IBM-Project-13132-1659511681
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ali-allaw/Responsive-design
Connecting to database
Successfully

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Yerardinego/2022SEM2S3-flutter-1
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  XT-Lab/BlazorAppTest
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ianyeahhhh/trabahunt-deployment-api
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  iasrarov/ecommerceproject
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  OliverRietmann/eigenfaces_latex
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  tmalikJK14/mern-app
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to

Fetched 100 records after id  537818400
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  OpkSharma/HousePricePrediction-
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  PHANIBABU1576/gittags
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Waso-python/PiscinePythonDataScience
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Ganeshpatro7799/MyDevops
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Getzman450/Getzman
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Anggellina/2226833-kekstagram-1
Connecting to database
Successfully connected to database
Records inserted........
po

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  viditpushpanjay/python-sample-vscode-flask-tutorial
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  adrogin/ALAppExtensions
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  rojhanzyy/roziiideneme123
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  AEtherstrata/MultiMC
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  cg20221d/tugas-webgl-angelaoryza
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Viennablue/Loungewell
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats fr

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  aurum447/kata_p-p_1.1.4
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  devhub-blue-sea-ap-southeast-1/org-public-empty-csc-repo-ap-southeast-1
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Msar223/devpack
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ItsSwaggy/docs
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  devhub-blue-sea-ca-central-1/org-public-empty-csc-repo-ca-central-1
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  khalidkhalil4254/simple-express
Connecting to database
Succ

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  akshay-bhosale1/TestProjectUser
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Creditedking360/creditedking360.github.io
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  BeLinKang/django_movie_drf
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Dovalight/P7_Esquinazi_Hugo_v2
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Skorpiyo/stage0-module3-task4
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Guillo987/desafioComplementarioClaseOcho
Connecting to database
Successfully connected to database
Records inserted........
popula

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Selvanayagi/Online-Shopping
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  verifyreportlabreportsmbmlabcomresult/GetByBarcode-clientCode-MBMLAB-resultId-1000019289670U6MPC55AWBQHRDP7L8YC5G3N14MM5FWR67UJCP9J3LSP18P
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  MikeTheSapien/markdown-diff
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  tvthanhcod/qt-shop-front
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  VitorRomanelli/ChatFront
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
popula

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  CHOIisaac/CHOIisaac
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  KYJKY/VueTest
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  tao-seu-309/WebServer-cat
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  danieldevries/pokemon-go-gbl-schedule
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Harikaraja/Tribute-Page
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  RaoHamza6/DogeBot
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats fr

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ntkien0502/mshop
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  anx6uem/QuantumultX-jiumeiquanX
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  omrctnr/tubitak2209-project
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  EliskaPokorna/Dobrovolny_ukol_margin_padding
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  scudscud/TP6-OC-lapiiquante
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Wisula/ForceSub_Bot
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  AVam

Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  ahmetarifakozz/benim-mavi-tiklerim
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  areyutyk/areyutyk
Unable to execute, Request Code:  404
Unable to execute, Request Code:  404
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  Durdona0501/movies-api-exam
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  nageabhii2022/Fly-Learn
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  akjandl/testing-flask-sqlalchemy
Connecting to database
Successfully connected to database
Records inserted........
populated db with repo stats from  brenodsb/extensao_iot_doorbell
Connecting to database
Successfully connected t