In [None]:
# Cell 1: imports & config
import re
import time
import requests
from urllib.parse import quote_plus
import pandas as pd

# ⚠️ Hardcoded GitHub Token (for testing only)
GITHUB_TOKEN = ""  # replace with your token
HEADERS = {"Accept": "application/vnd.github+json"}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"


In [6]:
# Cell 2: skill extraction and dork generation (very small vocabulary, extend as needed)
SKILLS_VOCAB = [
    "python","django","flask","fastapi","docker","kubernetes","aws","postgres","mongodb",
    "golang","java","node","react","graphql","rest","microservices"
]

def extract_skills_from_jd(jd_text, vocab=SKILLS_VOCAB, top_k=4):
    text = jd_text.lower()
    found = []
    for token in vocab:
        if re.search(r"\b" + re.escape(token) + r"\b", text):
            normalized = "golang" if token == "go" else token
            if normalized not in found:
                found.append(normalized)
    # fallback: pick most frequent words longer than 3 chars if none found
    if not found:
        tokens = re.findall(r"\w+", jd_text.lower())
        freq = {}
        for t in tokens:
            if len(t) > 3:
                freq[t] = freq.get(t, 0) + 1
        found = sorted(freq, key=lambda k: -freq[k])[:top_k]
    return found[:top_k]

def generate_google_dorks(skills, n=3):
    base = 'site:github.com "README"'
    dorks = []
    for i in range(min(n, max(1, len(skills)))):
        terms = skills[i:i+2] if i+2 <= len(skills) else skills[i:i+1]
        d = base + " " + " ".join(f'"{t}"' for t in terms)
        dorks.append(d)
    if len(skills) > 1 and len(dorks) < n:
        dorks.append(base + " " + " ".join(f'"{t}"' for t in skills[:3]))
    return dorks


In [7]:
# Cell 3: GitHub search (repos) and extract owners (usernames)
def github_search_repositories(query, per_page=30, page=1):
    """
    Uses GitHub Search Repositories API.
    query: plain text string (we will quote/encode it)
    returns parsed JSON response or raises requests.HTTPError
    """
    url = f"https://api.github.com/search/repositories?q={quote_plus(query)}&per_page={per_page}&page={page}"
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return resp.json()

def find_candidate_usernames_from_skills(skills, per_skill_results=20, max_users=50):
    """
    For each skill, run a conservative GitHub repo search (search README/description)
    and collect repo owners (usernames). Returns a unique list of usernames and example repo urls.
    """
    found = {}
    for skill in skills:
        # conservative GitHub query: look in readme/description for the skill term
        q = f'{skill} in:readme,description "{skill}"'
        try:
            js = github_search_repositories(q, per_page=per_skill_results, page=1)
            items = js.get("items", [])
            for it in items:
                owner = it.get("owner", {}).get("login")
                html_url = it.get("html_url")
                if owner and owner not in found:
                    found[owner] = {"example_repo": html_url}
                if len(found) >= max_users:
                    break
            # polite pause to avoid rate limits
            time.sleep(0.8)
            if len(found) >= max_users:
                break
        except requests.HTTPError as e:
            print("GitHub API error (HTTP):", e)
            print("Response:", getattr(e.response, "text", None))
            break
        except requests.RequestException as e:
            print("GitHub network error:", e)
            break
    return found


In [8]:
# Cell 4: sample run
sample_jd = """
We are hiring a Senior Backend Engineer with strong experience in Python, FastAPI, Docker, and Kubernetes.
Must have experience building microservices, REST APIs, and working knowledge of AWS. Open-source contributions are a plus.
"""

# 1) extract skills
skills = extract_skills_from_jd(sample_jd)
print("Extracted skills:", skills)

# 2) generate dork queries for display (these are Google dorks -- for user display only)
dorks = generate_google_dorks(skills, n=3)
print("\nGenerated Google dork queries (for display):")
for d in dorks:
    print(" -", d)

# 3) search GitHub and collect candidate usernames
print("\nSearching GitHub (repo search) to extract candidate usernames...")
users = find_candidate_usernames_from_skills(skills, per_skill_results=15, max_users=40)

print(f"\nFound {len(users)} unique usernames (example repos):")
df = pd.DataFrame([{"username":u, "example_repo":v["example_repo"], "profile_url":f"https://github.com/{u}"} for u,v in users.items()])
display(df.head(30))


Extracted skills: ['python', 'fastapi', 'docker', 'kubernetes']

Generated Google dork queries (for display):
 - site:github.com "README" "python" "fastapi"
 - site:github.com "README" "fastapi" "docker"
 - site:github.com "README" "docker" "kubernetes"

Searching GitHub (repo search) to extract candidate usernames...

Found 40 unique usernames (example repos):


Unnamed: 0,username,example_repo,profile_url
0,codecrafters-io,https://github.com/codecrafters-io/build-your-...,https://github.com/codecrafters-io
1,vinta,https://github.com/vinta/awesome-python,https://github.com/vinta
2,TheAlgorithms,https://github.com/TheAlgorithms/Python,https://github.com/TheAlgorithms
3,python,https://github.com/python/cpython,https://github.com/python
4,geekcomputers,https://github.com/geekcomputers/Python,https://github.com/geekcomputers
5,codebasics,https://github.com/codebasics/py,https://github.com/codebasics
6,Pierian-Data,https://github.com/Pierian-Data/Complete-Pytho...,https://github.com/Pierian-Data
7,gto76,https://github.com/gto76/python-cheatsheet,https://github.com/gto76
8,jakevdp,https://github.com/jakevdp/PythonDataScienceHa...,https://github.com/jakevdp
9,Asabeneh,https://github.com/Asabeneh/30-Days-Of-Python,https://github.com/Asabeneh


In [None]:
# -------------------------
# Install dependencies (run once)
# -------------------------
# !pip install requests pandas beautifulsoup4

# -------------------------
# Imports
# -------------------------
import requests
import pandas as pd

# -------------------------
# CONFIG
# -------------------------
SERPAPI_KEY = ""  # replace with your key
GITHUB_TOKEN = ""  # replace with your token

JOB_DESCRIPTION = """
Looking for a Python developer with experience in web scraping, APIs, and data processing.
Familiarity with Git, REST APIs, and cloud deployment is a plus.
"""

# -------------------------
# 1️⃣ Generate simple GitHub Dork Queries
# -------------------------
keywords = ["Python", "web scraping", "APIs", "data processing"]
dork_queries = [f'site:github.com "{kw}" "{JOB_DESCRIPTION.split()[0]}"' for kw in keywords[:3]]

print("Generated Dork Queries:")
for i, dq in enumerate(dork_queries, 1):
    print(f"{i}. {dq}")

# -------------------------
# 2️⃣ Search Google via SerpAPI and extract usernames
# -------------------------
def search_google_profiles(query, num_results=10):
    params = {
        "engine": "google",
        "q": query,
        "api_key": SERPAPI_KEY,
        "num": num_results
    }
    response = requests.get("https://serpapi.com/search", params=params)
    data = response.json()
    profiles = []
    if "organic_results" in data:
        for result in data["organic_results"]:
            link = result.get("link")
            if link and "github.com" in link:
                parts = link.replace("https://github.com/", "").split("/")
                username = parts[0].strip()
                if username and username != "topics" and username != "search":
                    profiles.append(f"https://github.com/{username}")
    return profiles


profile_links = []
for dq in dork_queries:
    profile_links.extend(search_google_profiles(dq))

profile_links = list(set(profile_links))  # remove duplicates

print("\nExtracted GitHub User Profiles:")
for link in profile_links:
    print(link)

# -------------------------
# 3️⃣ Fetch GitHub Profile Data via API
# -------------------------
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

def fetch_github_profile(url):
    username = url.replace("https://github.com/", "")
    api_url = f"https://api.github.com/users/{username}"
    resp = requests.get(api_url, headers=headers)
    if resp.status_code == 200:
        data = resp.json()
        return {
            "Username": username,
            "Name": data.get("name"),
            "Bio": data.get("bio"),
            "Followers": data.get("followers"),
            "Public Repos": data.get("public_repos"),
            "Profile URL": url
        }
    else:
        return {"Username": username, "Profile URL": url, "Error": resp.status_code}

candidates = [fetch_github_profile(link) for link in profile_links]

# -------------------------
# 4️⃣ Display Candidates in a Table
# -------------------------
df = pd.DataFrame(candidates)
df


Generated Dork Queries:
1. site:github.com "Python" "Looking"
2. site:github.com "web scraping" "Looking"
3. site:github.com "APIs" "Looking"

Extracted GitHub User Profiles:
https://github.com/ScrapeGraphAI
https://github.com/cubiclesoft
https://github.com/trntv
https://github.com/microbiomedata
https://github.com/https:
https://github.com/chrismaddalena
https://github.com/trimble-oss
https://github.com/oxylabs
https://github.com/GkAntonius
https://github.com/vita-epfl
https://github.com/postman-open-technologies
https://github.com/KOrfanakis
https://github.com/cassidoo
https://github.com/basecamp
https://github.com/public-apis
https://github.com/miyagawa
https://github.com/w3ctag
https://github.com/Kitware
https://github.com/resources
https://github.com/je-suis-tm
https://github.com/vraulsan
https://github.com/TheWebScrapingClub
https://github.com/Andy-Pham-72
https://github.com/aaronpk
https://github.com/Looking-Glass
https://github.com/jdx
https://github.com/deschler


Unnamed: 0,Username,Name,Bio,Followers,Public Repos,Profile URL,Error
0,ScrapeGraphAI,ScrapeGraphAI,,282.0,29.0,https://github.com/ScrapeGraphAI,
1,cubiclesoft,CubicleSoft,,224.0,100.0,https://github.com/cubiclesoft,
2,trntv,Yevhen Terentiev,,222.0,51.0,https://github.com/trntv,
3,microbiomedata,National Microbiome Data Collaborative,National Microbiome Data Collaborative,55.0,82.0,https://github.com/microbiomedata,
4,https:,,,,,https://github.com/https:,404.0
5,chrismaddalena,Christopher Maddalena,,306.0,70.0,https://github.com/chrismaddalena,
6,trimble-oss,Trimble,Trimble Online Source Store,306.0,72.0,https://github.com/trimble-oss,
7,oxylabs,Oxylabs.io,Innovative Proxy Service to Gather Data at Scale,586.0,175.0,https://github.com/oxylabs,
8,GkAntonius,Gabriel Antonius,,29.0,13.0,https://github.com/GkAntonius,
9,vita-epfl,VITA lab at EPFL,Visual Intelligence for Transportation,150.0,87.0,https://github.com/vita-epfl,
