In [None]:
!pip install requests pandas matplotlib seaborn


## Extracting User Information

### Profile Information

In [None]:
#Profile Information

import requests

def get_profile_info(username, token):
    """Fetch basic public profile fields for a GitHub user.

    Args:
        username: GitHub login to look up.
        token: Personal access token sent in the ``Authorization`` header.

    Returns:
        dict with the keys ``profile_name``, ``profile_id`` and ``followers``.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users/{username}'
    response = requests.get(url, headers=headers, timeout=30)
    # Fail with the HTTP error instead of a KeyError on the error payload below.
    response.raise_for_status()
    profile_data = response.json()
    return {
        'profile_name': profile_data['name'],
        'profile_id': profile_data['id'],
        'followers': profile_data['followers']
    }
# Example usage
# The token is read from the GITHUB_TOKEN environment variable so a real credential never
# has to be saved in the notebook; the placeholder remains as a fallback for inline edits.
import os

token = os.environ.get('GITHUB_TOKEN', 'put_own_token_here')
username = 'officialamit558'
profile_info = get_profile_info(username, token)
print(profile_info)


{'profile_name': 'Amit Kumar', 'profile_id': 138309651, 'followers': 1}


### Repository Information and Most-Used Tech Stack

In [None]:

# Repository Information and most tech stack used


import requests
from collections import Counter
import base64
import json
from xml.etree import ElementTree as ET

def fetch_file_content(url, headers):
    """Download one file through the GitHub contents API and return its text.

    Args:
        url: Full ``/repos/{owner}/{repo}/contents/{path}`` URL.
        headers: Request headers (carry the ``Authorization`` token).

    Returns:
        The decoded file text, or ``None`` when the response status is not 200,
        the payload is not an object with a ``content`` field, or the content
        cannot be base64-decoded.
    """
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        return None
    file_info = response.json()
    # Only an object carrying `content` describes a single file; anything else is skipped.
    if not isinstance(file_info, dict) or 'content' not in file_info:
        return None
    try:
        raw = base64.b64decode(file_info['content'])
    except (ValueError, TypeError):
        return None
    # Manifests saved as UTF-16 start with a byte-order mark; decode them as such
    # instead of raising UnicodeDecodeError.
    if raw.startswith((b'\xff\xfe', b'\xfe\xff')):
        return raw.decode('utf-16', errors='replace')
    # utf-8-sig drops a leading UTF-8 BOM, which would otherwise break json.loads downstream.
    return raw.decode('utf-8-sig', errors='replace')

def _parse_json_dependency_keys(text, sections):
    """Return the keys of the named object-valued sections of a JSON manifest."""
    try:
        manifest = json.loads(text)
    except ValueError:
        # A malformed manifest should not abort the scan of the remaining files.
        return []
    if not isinstance(manifest, dict):
        return []
    names = []
    for section in sections:
        entries = manifest.get(section)
        if isinstance(entries, dict):
            names.extend(entries.keys())
    return names


def _parse_requirements(text):
    """Return requirement specifiers, skipping blanks, comments and pip option lines."""
    specs = []
    for line in text.splitlines():
        spec = line.split(' #', 1)[0].strip()
        if not spec or spec.startswith('#') or spec.startswith('-'):
            continue
        specs.append(spec)
    return specs


def _parse_gemfile(text):
    """Return gem names from ``gem '<name>'`` lines, including indented ones."""
    gems = []
    for line in text.splitlines():
        parts = line.strip().split()
        if len(parts) >= 2 and parts[0] == 'gem':
            # Drop either quote style and the comma that precedes a version constraint.
            gems.append(parts[1].strip('\'",'))
    return gems


def _parse_pom(text):
    """Return ``groupId:artifactId`` for every dependency element, ignoring XML namespaces."""
    try:
        root = ET.fromstring(text)
    except ET.ParseError:
        return []

    def local_name(tag):
        # Namespaced tags look like '{uri}name'; keep only the part after the brace.
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''

    coordinates = []
    for element in root.iter():
        if local_name(element.tag) != 'dependency':
            continue
        fields = {local_name(child.tag): (child.text or '').strip() for child in element}
        coordinates.append(f"{fields.get('groupId', '')}:{fields.get('artifactId', '')}")
    return coordinates


def _parse_go_mod(text):
    """Return module paths from single-line ``require`` directives and ``require ( ... )`` blocks."""
    modules = []
    inside_block = False
    for line in text.splitlines():
        parts = line.split('//', 1)[0].split()  # strip trailing comments such as "// indirect"
        if not parts:
            continue
        if inside_block:
            if parts[0] == ')':
                inside_block = False
            else:
                modules.append(parts[0])
        elif parts[0] == 'require' and len(parts) >= 2:
            if parts[1] == '(':
                inside_block = True
            else:
                modules.append(parts[1])
    return modules


def extract_dependencies(repo, headers):
    """Collect dependency names declared in a repository's manifest files.

    The root of the repository is probed, in order, for package.json, requirements.txt,
    Gemfile, pom.xml, composer.json and go.mod via ``fetch_file_content``.

    Args:
        repo: dict with ``repo_name`` and ``owner`` (itself a dict holding ``login``).
        headers: Request headers forwarded to ``fetch_file_content``.

    Returns:
        list of dependency identifiers in discovery order. requirements.txt entries keep
        their version specifier; pom.xml entries are ``groupId:artifactId``.
    """
    owner = repo['owner']['login']
    repo_name = repo['repo_name']
    contents_url = f'https://api.github.com/repos/{owner}/{repo_name}/contents'

    manifest_parsers = (
        ('package.json', lambda text: _parse_json_dependency_keys(text, ('dependencies', 'devDependencies'))),
        ('requirements.txt', _parse_requirements),
        ('Gemfile', _parse_gemfile),
        ('pom.xml', _parse_pom),
        ('composer.json', lambda text: _parse_json_dependency_keys(text, ('require',))),
        ('go.mod', _parse_go_mod),
    )

    dependencies = []
    for filename, parse in manifest_parsers:
        content = fetch_file_content(f'{contents_url}/{filename}', headers)
        if content:
            dependencies.extend(parse(content))
    return dependencies

def analyze_tech_stack(repos_info, token):
    """Count primary languages and manifest dependencies across repositories.

    Args:
        repos_info: Iterable of repo dicts (``language``, ``repo_name``, ``owner``).
        token: GitHub token used for the manifest lookups done by ``extract_dependencies``.

    Returns:
        ``(most_common_tech, sorted_tech_stack)`` where ``sorted_tech_stack`` is a list of
        ``(name, count)`` pairs in descending count order and ``most_common_tech`` is the
        first name, or ``None`` when nothing was found.
    """
    headers = {'Authorization': f'token {token}'}
    tech_counter = Counter()

    for repo in repos_info:
        # Primary language (None for repositories GitHub could not classify).
        if repo['language']:
            tech_counter[repo['language']] += 1
        # Dependencies from configuration files.
        tech_counter.update(extract_dependencies(repo, headers))

    sorted_tech_stack = tech_counter.most_common()
    # An account with no repositories or no detectable tech used to raise IndexError here.
    most_common_tech = sorted_tech_stack[0][0] if sorted_tech_stack else None

    return most_common_tech, sorted_tech_stack

def get_repositories_info(username, token):
    """List a user's repositories with the fields needed for tech-stack analysis.

    Args:
        username: GitHub login whose repositories are listed.
        token: Personal access token sent in the ``Authorization`` header.

    Returns:
        list of dicts with ``repo_name``, ``language`` and ``owner``.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users/{username}/repos'
    page_size = 100  # largest page size; the API default would truncate bigger accounts
    repos_info = []
    page = 1
    while True:
        response = requests.get(
            url, headers=headers, params={'per_page': page_size, 'page': page}, timeout=30
        )
        # An error payload is a dict; iterating it used to fail with
        # "string indices must be integers".
        response.raise_for_status()
        repos_data = response.json()
        for repo in repos_data:
            repos_info.append({
                'repo_name': repo['name'],
                'language': repo['language'],
                'owner': repo['owner']  # Owner info is needed for dependency extraction
            })
        if len(repos_data) < page_size:
            return repos_info
        page += 1

# Example usage: fetch the repositories, rank the technologies, then print the results.
repos_info = get_repositories_info(username, token)
tech_summary = analyze_tech_stack(repos_info, token)
most_common_tech, sorted_tech_stack = tech_summary

# Names of the five most frequent entries.
for tech_name, _ in sorted_tech_stack[:5]:
    print(tech_name)

print(f"Most common tech: {most_common_tech}")
print("Full tech stack analysis in descending order:")
for tech_name, occurrences in sorted_tech_stack:
    print(f"{tech_name}: {occurrences}")

# Raw repository records that fed the analysis.
for repo_record in repos_info:
    print(repo_record)

Jupyter Notebook
Python
streamlit
numpy
Flask==1.1.1
Most common tech: Jupyter Notebook
Full tech stack analysis in descending order:
Jupyter Notebook: 13
Python: 8
streamlit: 3
numpy: 3
Flask==1.1.1: 2
gunicorn==19.9.0: 2
itsdangerous==1.1.0: 2
Jinja2==2.10.1: 2
MarkupSafe==1.1.1: 2
Werkzeug==0.15.5: 2
numpy>=1.9.2: 2
scipy>=0.15.1: 2
matplotlib>=1.4.3: 2
pandas>=0.19: 2
pandas: 2
matplotlib: 2
python-docx : 1
PyPDF2: 1
pdfplumber: 1
HTML: 1
scikit-learn== 1.2.2: 1
flasgger==0.9.4: 1
EJS: 1
bcrypt: 1
dotenv: 1
ejs: 1
express: 1
express-session: 1
mongoose: 1
passport: 1
passport-google-oauth20: 1
passport-local: 1
passport-local-mongoose: 1
setuptools==57.5.0: 1
pip==21.1.2: 1
scikit-learn>=0.18: 1
distutils: 1
Flask: 1
gunicorn: 1
seaborn: 1
sklearn: 1
fastapi==0.111.0: 1
gensim==4.3.2: 1
joblib==1.4.2: 1
numpy==1.26.4: 1
pandas==2.2.2: 1
pydantic==2.7.2: 1
scikit_learn==1.5.0: 1
tensorflow: 1
cvzone: 1
ultralytics: 1
hydra-core: 1
opencv-python: 1
Pillow: 1
PyYAML: 1
requests: 1
sci

In [None]:
# Keep only the names of the five most frequent technologies.
top_5_tech_stacks = [name for name, _ in sorted_tech_stack[:5]]
print(top_5_tech_stacks)

['Jupyter Notebook', 'Python', 'streamlit', 'numpy', 'Flask==1.1.1']


### Analyzing the Tech Stack and Recommending Companies

In [None]:
def recommend_companies(top_tech_stacks):
    """Look up employers for each technology in ``top_tech_stacks``.

    Returns a dict keyed by technology. Known technologies map to a
    ``{role: [companies]}`` dict; anything else maps to the single-item list
    ``['No recommendations available']``.
    """
    employers_by_tech = {
        'Jupyter Notebook': {
            'Data Scientist': ['Google', 'Dropbox', 'IBM', 'Coursera'],
            'Software Engineer': ['Instagram', 'IBM'],
        },
        'JavaScript': {
            'Frontend Developer': ['Facebook', 'Netflix', 'Airbnb'],
            'Full Stack Developer': ['Google', 'Microsoft', 'Amazon'],
            'Backend Developer': ['Microsoft', 'Amazon'],
        },
        'Java': {
            'Software Engineer': ['Amazon', 'LinkedIn', 'Uber', 'Oracle', 'IBM', 'Salesforce'],
        },
        'C++': {
            'Software Engineer': ['Microsoft', 'Adobe', 'NVIDIA', 'Qualcomm', 'Apple', 'Intel'],
        },
        'Python': {
            'Data Scientist': ['Google', 'Spotify', 'Quora', 'Reddit', 'Dropbox'],
            'Software Engineer': ['Pinterest'],
        },
        'Ruby': {
            'Software Engineer': ['Shopify', 'Stripe', 'GitHub', 'Twitch', 'Basecamp'],
        },
        'PHP': {
            'Software Engineer': ['Facebook', 'WordPress', 'Wikipedia', 'Flickr'],
        },
        'Swift': {
            'iOS Developer': ['Apple', '9GAG', 'Spotify'],
            'Software Engineer': ['LinkedIn', 'Slack'],
        },
        'Go': {
            'Software Engineer': ['Google', 'Twitch', 'Uber', 'Dropbox', 'SoundCloud'],
        },
        'Rust': {
            'Software Engineer': ['Mozilla', 'Dropbox', 'Cloudflare', 'Discord'],
        },
        'TypeScript': {
            'Software Engineer': ['Microsoft', 'Slack', 'Airbnb', 'Asana', 'Uber'],
        },
        'Kotlin': {
            'Software Engineer': ['Google', 'Pinterest', 'Square', 'Trello', 'Corda'],
        },
        # Add more mappings as needed
    }

    # One entry per requested technology, in request order.
    return {
        tech: employers_by_tech.get(tech, ['No recommendations available'])
        for tech in top_tech_stacks
    }

# Example usage: recommend companies for the five most frequent technologies.
top_5_tech_stacks = [name for name, _ in sorted_tech_stack[:5]]
company_recommendations = recommend_companies(top_5_tech_stacks)

print("Company recommendations based on top 5 tech stacks:")
for tech_name, suggested in company_recommendations.items():
    print(f"\nTech: {tech_name} \nCompany: {suggested}")


Company recommendations based on top 5 tech stacks:

Tech: Jupyter Notebook 
Company: {'Data Scientist': ['Google', 'Dropbox', 'IBM', 'Coursera'], 'Software Engineer': ['Instagram', 'IBM']}

Tech: Python 
Company: {'Data Scientist': ['Google', 'Spotify', 'Quora', 'Reddit', 'Dropbox'], 'Software Engineer': ['Pinterest']}

Tech: streamlit 
Company: ['No recommendations available']

Tech: numpy 
Company: ['No recommendations available']

Tech: Flask==1.1.1 
Company: ['No recommendations available']


In [None]:
import requests

def get_company_recommendations_from_gemini(tech_stack, api_key):
    """POST a tech stack to the recommendation endpoint and return the parsed reply.

    NOTE(review): the URL below is a placeholder (the original author marked it
    "hypothetical"), and the recorded run of this cell returned 404
    ``EndpointNotFound``. Substitute the real service endpoint and payload format
    before relying on this function.

    Args:
        tech_stack: List of technology names sent as the ``tech_stack`` payload field.
        api_key: Bearer token placed in the ``Authorization`` header.

    Returns:
        The decoded JSON body on HTTP 200. Otherwise a dict with ``result`` set to
        ``"error"`` plus ``reason`` and ``message``; ``reason`` is the HTTP status
        code, or ``"request_failed"`` when no response was received.
    """
    # Hypothetical endpoint
    url = "https://api.gemini.com/v1/techstack/recommendations"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "tech_stack": tech_stack
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported in the same shape as HTTP errors
        # so callers have a single error path.
        print(f"Error: request failed, {exc}")
        return {"result": "error", "reason": "request_failed", "message": str(exc)}

    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}, {response.text}")
    return {"result": "error", "reason": response.status_code, "message": response.text}

# Example usage
import os

# Read the key from the environment so it is never saved with the notebook.
api_key = os.environ.get('GEMINI_API_KEY', "your_actual_gemini_api_key")
# The demo data gets its own name so it does not overwrite `sorted_tech_stack`, which
# holds the ranking computed from the real profile and is read again by the next cell.
example_sorted_tech_stack = [('Python', 50), ('JavaScript', 40), ('Java', 30), ('C++', 20), ('Ruby', 10)]  # Example data
tech_stack = [tech for tech, count in example_sorted_tech_stack[:5]]
recommendations = get_company_recommendations_from_gemini(tech_stack, api_key)
print(recommendations)


Error: 404, {"result":"error","reason":"EndpointNotFound","message":"API entry point `/v1/techstack/recommendations` not found"}
{'result': 'error', 'reason': 404, 'message': '{"result":"error","reason":"EndpointNotFound","message":"API entry point `/v1/techstack/recommendations` not found"}'}


In [None]:
# Names of the five highest-ranked technologies.
top_5_tech_stacks = [name for name, _ in sorted_tech_stack[:5]]
print(top_5_tech_stacks)

['Jupyter Notebook', 'Python', 'streamlit', 'numpy', 'Flask==1.1.1']


### Daily Contributions



In [None]:
# Daily Contributions


import requests
from datetime import datetime, timedelta

def get_daily_contributions(username, token):
    """Count the user's push events for today and yesterday (UTC dates).

    Args:
        username: GitHub login whose events are read.
        token: Personal access token sent in the ``Authorization`` header.

    Returns:
        dict mapping ``'YYYY-MM-DD'`` to the number of ``PushEvent`` entries created on
        that date. Only today's and yesterday's dates can appear as keys.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    from datetime import timezone  # local import: the cell only imports datetime/timedelta

    headers = {
        'Authorization': f'token {token}'
    }
    url = f'https://api.github.com/users/{username}/events'
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    events = response.json()

    # GitHub reports `created_at` in UTC, so the two-day window is computed in UTC as
    # well; the machine's local date can differ from it by a day.
    now_utc = datetime.now(timezone.utc)
    recent_days = {
        now_utc.strftime('%Y-%m-%d'),
        (now_utc - timedelta(days=1)).strftime('%Y-%m-%d'),
    }

    contributions = {}
    for event in events:
        if event['type'] != 'PushEvent':
            continue
        event_date = event['created_at'][:10]  # Date part of the ISO timestamp
        if event_date in recent_days:
            contributions[event_date] = contributions.get(event_date, 0) + 1

    return contributions

def maintain_streak(contributions):
    """Report whether ``contributions`` has entries for both today and yesterday.

    Args:
        contributions: dict keyed by ``'YYYY-MM-DD'`` event dates, as returned by
            ``get_daily_contributions``.

    Returns:
        ``True`` when both dates are present (streak maintained), else ``False``.
    """
    from datetime import timezone  # local import: the cell only imports datetime/timedelta

    # The keys come from GitHub's UTC `created_at` timestamps, so compare against UTC dates.
    now_utc = datetime.now(timezone.utc)
    today = now_utc.strftime('%Y-%m-%d')
    yesterday = (now_utc - timedelta(days=1)).strftime('%Y-%m-%d')

    return today in contributions and yesterday in contributions

# Example usage: fetch the recent push counts, then derive the streak flag from them.
contributions = get_daily_contributions(username, token)
streak_maintained = maintain_streak(contributions)

print("Daily contributions:", contributions)
print("Streak maintained:", streak_maintained)


Daily contributions: {'2024-05-29': 3}
Streak maintained: False


### Profile Improvement Tips

In [None]:

#Profile Improvement Tips

import requests
from datetime import datetime

def suggest_profile_improvements(username, token):
    """Build a list of profile-improvement tips from a user's public GitHub data.

    The profile, the repository list, the public events, and per-repository commits,
    README, topics and license are queried through the REST API. Each heuristic that
    fails appends one tip; the coding-challenge tip is always appended last.

    Args:
        username: GitHub login to analyse.
        token: Personal access token sent in the ``Authorization`` header.

    Returns:
        list of tip strings in a fixed order.

    Raises:
        requests.HTTPError: If the profile or repository-list request fails.
    """
    from datetime import timezone  # local import: the cell only imports `datetime`

    headers = {'Authorization': f'token {token}'}
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the cell

    def fetch(url):
        return requests.get(url, headers=headers, timeout=request_timeout)

    profile_response = fetch(f'https://api.github.com/users/{username}')
    profile_response.raise_for_status()
    profile_data = profile_response.json()

    tips = []

    # Check for profile picture
    if not profile_data.get('avatar_url'):
        tips.append('Add a professional profile picture.')

    # Check for bio
    if not profile_data.get('bio'):
        tips.append('Add a professional bio in the overview section.')

    # Check for blog or portfolio link
    if not profile_data.get('blog'):
        tips.append('Include links to personal projects and portfolio in your profile.')

    # Fetch repositories
    repos_response = fetch(f'https://api.github.com/users/{username}/repos')
    repos_response.raise_for_status()
    repos_data = repos_response.json()

    # Check for repository updates. `updated_at` carries a "Z" (UTC) suffix, so it is
    # compared with a naive UTC "now" rather than the machine's local time.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    recent_update = any(
        (now - datetime.strptime(repo['updated_at'], '%Y-%m-%dT%H:%M:%SZ')).days < 30
        for repo in repos_data
    )
    if not recent_update:
        tips.append('Regularly update repositories with recent work.')

    # Check for commit messages
    commit_messages_descriptive = True
    for repo in repos_data:
        commits_data = fetch(f'https://api.github.com/repos/{username}/{repo["name"]}/commits').json()
        # An error payload (e.g. for a repository without commits) is an object, not a
        # list of commits; indexing into it used to raise TypeError.
        if not isinstance(commits_data, list):
            continue
        # Simplistic check: a message with fewer than three words is not descriptive.
        if any(len(commit['commit']['message'].split()) < 3 for commit in commits_data):
            commit_messages_descriptive = False
            break
    if not commit_messages_descriptive:
        tips.append('Use descriptive commit messages and maintain a consistent commit history.')

    # Check for open-source contributions
    events_data = fetch(f'https://api.github.com/users/{username}/events').json()
    contributed_to_open_source = isinstance(events_data, list) and any(
        event['type'] == 'PullRequestEvent' for event in events_data
    )
    if not contributed_to_open_source:
        tips.append('Participate in open-source projects and contribute to community discussions.')

    # Check for pinned repositories
    # NOTE(review): this URL lists the repositories the user has *starred*, not the ones
    # pinned on the profile. The behaviour is kept as-is; confirm which API exposes
    # pinned items before trusting this tip.
    pinned_repos_data = fetch(f'https://api.github.com/users/{username}/starred').json()
    if not pinned_repos_data:
        tips.append('Pin key projects to showcase your skills at the top of your profile.')

    # Check for README files (stops at the first repository without one)
    readme_missing = any(
        fetch(f'https://api.github.com/repos/{username}/{repo["name"]}/readme').status_code != 200
        for repo in repos_data
    )
    if readme_missing:
        tips.append('Ensure all repositories have well-documented README files.')

    # Check for educational information (approximated by the company/location fields)
    if not profile_data.get('company') and not profile_data.get('location'):
        tips.append('Add your educational background and relevant certifications to your profile.')

    # Check for stars and forks on repositories
    popular_repo = any(
        repo['stargazers_count'] > 50 or repo['forks_count'] > 6 for repo in repos_data
    )
    if not popular_repo:
        tips.append('Highlight your most popular repositories that have significant stars or forks.')

    # Check for repository topics
    missing_topics = False
    for repo in repos_data:
        topics_data = fetch(f'https://api.github.com/repos/{username}/{repo["name"]}/topics').json()
        if not isinstance(topics_data, dict) or not topics_data.get('names'):
            missing_topics = True
            break
    if missing_topics:
        tips.append('Add relevant topics to your repositories for better discoverability.')

    # Check for project descriptions
    if any(not repo['description'] for repo in repos_data):
        tips.append('Add detailed descriptions for each project to explain their purpose and functionality.')

    # Check for license information (stops at the first repository without one)
    missing_license = any(
        fetch(f'https://api.github.com/repos/{username}/{repo["name"]}/license').status_code != 200
        for repo in repos_data
    )
    if missing_license:
        tips.append('Ensure your repositories have appropriate licensing information.')

    # Coding-challenge participation cannot be checked through the GitHub API, so this
    # tip is always included.
    tips.append('Participate in coding challenges on platforms like LeetCode, HackerRank, etc. and add links to your profiles.')

    return tips

# Example usage: print each suggested improvement on its own line.
improvement_tips = suggest_profile_improvements(username, token)
for tip in improvement_tips:
    print(tip)


Add a professional bio in the overview section.
Use descriptive commit messages and maintain a consistent commit history.
Participate in open-source projects and contribute to community discussions.
Ensure all repositories have well-documented README files.
Highlight your most popular repositories that have significant stars or forks.
Add relevant topics to your repositories for better discoverability.
Add detailed descriptions for each project to explain their purpose and functionality.
Ensure your repositories have appropriate licensing information.
Participate in coding challenges on platforms like LeetCode, HackerRank, etc. and add links to your profiles.


## Rating

In [None]:
# Rating

import requests
from datetime import datetime, timedelta
from bs4 import BeautifulSoup

def get_daily_contributions(username, token):
    """Count the user's push events for today and yesterday (UTC dates).

    NOTE(review): an identical definition appears in the "Daily Contributions" cell;
    this one replaces it when the cell runs.

    Args:
        username: GitHub login whose events are read.
        token: Personal access token sent in the ``Authorization`` header.

    Returns:
        dict mapping ``'YYYY-MM-DD'`` to the number of ``PushEvent`` entries created on
        that date. Only today's and yesterday's dates can appear as keys.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    from datetime import timezone  # local import: the cell only imports datetime/timedelta

    headers = {
        'Authorization': f'token {token}'
    }
    url = f'https://api.github.com/users/{username}/events'
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    events = response.json()

    # `created_at` is reported in UTC, so the window must be computed in UTC too.
    now_utc = datetime.now(timezone.utc)
    recent_days = {
        now_utc.strftime('%Y-%m-%d'),
        (now_utc - timedelta(days=1)).strftime('%Y-%m-%d'),
    }

    contributions = {}
    for event in events:
        if event['type'] != 'PushEvent':
            continue
        event_date = event['created_at'][:10]  # Date part of the ISO timestamp
        if event_date in recent_days:
            contributions[event_date] = contributions.get(event_date, 0) + 1

    return contributions

def maintain_streak(contributions):
    """Return ``True`` when ``contributions`` has entries for both today and yesterday.

    Args:
        contributions: dict keyed by ``'YYYY-MM-DD'`` event dates, as returned by
            ``get_daily_contributions``.
    """
    from datetime import timezone  # local import: the cell only imports datetime/timedelta

    # The keys are taken from GitHub's UTC timestamps, so the dates are computed in UTC.
    now_utc = datetime.now(timezone.utc)
    today = now_utc.strftime('%Y-%m-%d')
    yesterday = (now_utc - timedelta(days=1)).strftime('%Y-%m-%d')

    return today in contributions and yesterday in contributions

def get_repositories_info(username, token):
    """List a user's repositories with the fields used for rating.

    Args:
        username: GitHub login whose repositories are listed.
        token: Personal access token sent in the ``Authorization`` header.

    Returns:
        list of dicts with ``repo_name``, ``language``, ``description``, ``tech_tools``
        (the repository topics), ``stargazers_count`` and ``forks_count``.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users/{username}/repos'
    page_size = 100  # largest page size; the API default would truncate bigger accounts
    repos_info = []
    page = 1
    while True:
        response = requests.get(
            url, headers=headers, params={'per_page': page_size, 'page': page}, timeout=30
        )
        # An error payload is a dict; iterating it would fail with
        # "string indices must be integers".
        response.raise_for_status()
        repos_data = response.json()
        for repo in repos_data:
            repos_info.append({
                'repo_name': repo['name'],
                'language': repo['language'],
                # The key can be present with a null value, so a .get() default alone
                # would never apply.
                'description': repo.get('description') or 'No description available',
                'tech_tools': repo.get('topics', []),
                'stargazers_count': repo['stargazers_count'],
                'forks_count': repo['forks_count']
            })
        if len(repos_data) < page_size:
            return repos_info
        page += 1

def analyze_tech_stack(repos_info):
    """Rank the primary languages found in ``repos_info``.

    Args:
        repos_info: Iterable of repo dicts with a ``language`` key (falsy values are skipped).

    Returns:
        ``(most_common_tech, sorted_tech_stack)`` where ``sorted_tech_stack`` is a list of
        ``(language, count)`` pairs in descending count order and ``most_common_tech`` is
        the first language, or ``None`` when no repository reports one.
    """
    from collections import Counter

    tech_counter = Counter(repo['language'] for repo in repos_info if repo['language'])
    sorted_tech_stack = tech_counter.most_common()
    # With no languages at all, indexing the ranking used to raise IndexError.
    most_common_tech = sorted_tech_stack[0][0] if sorted_tech_stack else None
    return most_common_tech, sorted_tech_stack

def rate_github_profile(username, token):
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users/{username}'
    response = requests.get(url, headers=headers)
    profile_data = response.json()

    followers = profile_data['followers']
    public_repos = profile_data['public_repos']

    repos_info = get_repositories_info(username, token)

    total_stars = sum(repo['stargazers_count'] for repo in repos_info)
    total_forks = sum(repo['forks_count'] for repo in repos_info)

    contributions = get_daily_contributions(username, token)
    streak_maintained = maintain_streak(contributions)
     # Normalize each metric to a 0-1 scale before combining
    followers_score = min(followers / 1000, 1.0) * 4  # Max 4 points for followers
    public_repos_score = min(public_repos / 100, 1.0) * 3  # Max 3 points for public repos
    stars_score = min(total_stars / 500, 1.0) * 2  # Max 2 points for stars
    forks_score = min(total_forks / 200, 1.0) * 1  # Max 1 point for forks
    streak_score = 1 if streak_maintained else 0  # Max 1 point for streak

    rating = followers_score + public_repos_score + stars_score + forks_score + streak_score

    return round(min(rating, 10), 2)

# Example usage
# username = 'octocat'
# token = 'your_github_token'

# Recent push activity and the streak flag derived from it.
contributions = get_daily_contributions(username, token)
print("Daily contributions:", contributions)
streak_maintained = maintain_streak(contributions)
print("Streak maintained:", streak_maintained)

# Language ranking, reduced to the five most frequent names.
repos_info = get_repositories_info(username, token)
tech_summary = analyze_tech_stack(repos_info)
most_common_tech, sorted_tech_stack = tech_summary
top_5_tech_stacks = [name for name, _ in sorted_tech_stack[:5]]

# Overall profile score.
rating = rate_github_profile(username, token)
print(f"GitHub Profile Rating: {rating}/10")


Daily contributions: {'2024-05-29': 3}
Streak maintained: False
GitHub Profile Rating: 1.12/10


### All Output

In [None]:
def generate_recommendations(profile_info, most_common_tech, company_recommendations, improvement_tips):
    """Render the collected analysis as a plain-text report.

    Args:
        profile_info: dict with ``profile_name``, ``profile_id`` and ``followers``.
        most_common_tech: Name of the predominant technology.
        company_recommendations: Either the dict produced by ``recommend_companies``
            (technology -> ``{role: [companies]}`` or a flat list), or any iterable of
            company names.
        improvement_tips: Iterable of tip strings.

    Returns:
        The report text. Each technology gets one line listing its companies (duplicates
        across roles removed, first occurrence kept) and each tip gets its own bullet.
    """
    line_break = '\n    '  # matches the four-space indent of the report template

    def companies_for(entry):
        # Known technologies map to {role: [companies]}; unknown ones to a flat list.
        if isinstance(entry, str):
            return entry
        groups = entry.values() if isinstance(entry, dict) else [entry]
        unique_names = dict.fromkeys(str(name) for group in groups for name in group)
        return ', '.join(unique_names)

    if isinstance(company_recommendations, dict):
        # Joining the dict itself would list the technology names, not the companies.
        company_lines = [
            f'{tech}: {companies_for(entry)}'
            for tech, entry in company_recommendations.items()
        ]
    else:
        company_lines = [', '.join(str(item) for item in company_recommendations)]

    # Tips contain commas themselves, so they are listed one per line.
    tip_lines = [f'- {tip}' for tip in improvement_tips]

    report = f"""
    GitHub Profile Analysis Report:
    ------------------------------
    Profile Name: {profile_info['profile_name']}
    Profile ID: {profile_info['profile_id']}
    Followers: {profile_info['followers']}

    Predominant Tech Stack: {most_common_tech}

    Recommended Companies:
    {line_break.join(company_lines)}

    Profile Improvement Tips:
    {line_break.join(tip_lines)}
    """
    return report

# Example usage: assemble the report from the results of the earlier cells and print it.
recommendations_report = generate_recommendations(
    profile_info,
    most_common_tech,
    company_recommendations,
    improvement_tips,
)
print(recommendations_report)



    GitHub Profile Analysis Report:
    ------------------------------
    Profile Name: Amit Kumar
    Profile ID: 138309651
    Followers: 1
    
    Predominant Tech Stack: Jupyter Notebook
    
    Recommended Companies:
    Jupyter Notebook, Python, streamlit, numpy, Flask==1.1.1
    
    Profile Improvement Tips:
    Add a professional bio in the overview section., Use descriptive commit messages and maintain a consistent commit history., Participate in open-source projects and contribute to community discussions., Ensure all repositories have well-documented README files., Highlight your most popular repositories that have significant stars or forks., Add relevant topics to your repositories for better discoverability., Add detailed descriptions for each project to explain their purpose and functionality., Ensure your repositories have appropriate licensing information., Participate in coding challenges on platforms like LeetCode, HackerRank, etc. and add links to your profiles

# Data Collection from GitHub

In [None]:
import requests

def get_github_usernames(token, since_id=0, per_page=100):
    """Fetch one page of GitHub accounts with ids greater than ``since_id``.

    Args:
        token: Personal access token sent in the ``Authorization`` header.
        since_id: Only accounts with an id above this value are returned.
        per_page: Number of accounts requested.

    Returns:
        ``(logins, last_id)``: the page's login names and the id of its last account,
        or ``None`` as ``last_id`` when the page is empty.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users?since={since_id}&per_page={per_page}'
    response = requests.get(url, headers=headers, timeout=30)
    # An error payload is a dict; iterating it would fail inside the comprehension below.
    response.raise_for_status()
    users = response.json()
    last_id = users[-1]['id'] if users else None
    return [user['login'] for user in users], last_id

# Example usage
# token = 'your_github_token'
# Fetch the first page of accounts and show their login names.
first_page = get_github_usernames(token)
usernames, last_id = first_page
print(usernames)


['mojombo', 'defunkt', 'pjhyett', 'wycats', 'ezmobius', 'ivey', 'evanphx', 'vanpelt', 'wayneeseguin', 'brynary', 'kevinclark', 'technoweenie', 'macournoyer', 'takeo', 'caged', 'topfunky', 'anotherjesse', 'roland', 'lukas', 'fanvsfan', 'tomtt', 'railsjitsu', 'nitay', 'kevwil', 'KirinDave', 'jamesgolick', 'atmos', 'errfree', 'mojodna', 'bmizerany', 'jnewland', 'joshknowles', 'hornbeck', 'jwhitmire', 'elbowdonkey', 'reinh', 'knzconnor', 'bs', 'rsanheim', 'schacon', 'uggedal', 'bruce', 'sam', 'mmower', 'abhay', 'rabble', 'benburkert', 'indirect', 'fearoffish', 'ry', 'engineyard', 'jsierles', 'tweibley', 'peimei', 'brixen', 'tmornini', 'outerim', 'daksis', 'sr', 'lifo', 'rsl', 'imownbey', 'dylanegan', 'jm', 'kmarsh', 'jvantuyl', 'BrianTheCoder', 'freeformz', 'hassox', 'automatthew', 'queso', 'lancecarlson', 'drnic', 'lukesutton', 'danwrong', 'HamptonMakes', 'jfrost', 'mattetti', 'ctennis', 'lawrencepit', 'marcjeanson', 'grempe', 'peterc', 'ministrycentered', 'afarnham', 'up_the_irons', 'cri

In [None]:
import requests
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
import time

In [None]:


def _rate_limit_wait_seconds(response, default=60):
    """Seconds to wait before retrying, read from the rate-limit headers when present."""
    headers = getattr(response, 'headers', None) or {}
    retry_after = headers.get('Retry-After')
    if retry_after is not None:
        try:
            return max(float(retry_after), 0)
        except ValueError:
            pass
    reset_at = headers.get('X-RateLimit-Reset')
    if reset_at is not None:
        try:
            # The header holds an epoch timestamp; add a second of slack.
            return max(float(reset_at) - time.time(), 0) + 1
        except ValueError:
            pass
    return default


def get_github_usernames(token, since_id=0, per_page=100, max_retries=5):
    """Fetch one page of GitHub accounts, waiting and retrying when rate limited.

    Args:
        token: Personal access token sent in the ``Authorization`` header.
        since_id: Only accounts with an id above this value are returned.
        per_page: Number of accounts requested.
        max_retries: How many times a 403/429 answer is retried before giving up.
            Previously every 403 was retried forever, so a rejected token looped
            endlessly.

    Returns:
        ``(logins, last_id)``: the page's login names and the id of its last account,
        or ``None`` as ``last_id`` when the page is empty.

    Raises:
        requests.HTTPError: On an error status that is not retried or is still
            returned after ``max_retries`` retries.
        RuntimeError: If no usable response was obtained without an HTTP error status.
    """
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users?since={since_id}&per_page={per_page}'
    for attempt in range(max_retries + 1):
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            users = response.json()
            last_id = users[-1]['id'] if users else None
            return [user['login'] for user in users], last_id
        if response.status_code in (403, 429) and attempt < max_retries:
            wait_seconds = _rate_limit_wait_seconds(response)
            print(f"Rate limit exceeded. Sleeping for {wait_seconds:.0f} seconds...")
            time.sleep(wait_seconds)
            continue
        response.raise_for_status()
    raise RuntimeError(f'Could not fetch users from {url}')

def get_profile_info(username, token):
    """Return the raw profile JSON for a GitHub user.

    NOTE(review): this redefines the earlier ``get_profile_info``, which returned a
    trimmed dict (``profile_name``/``profile_id``/``followers``) rather than the raw
    payload.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users/{username}'
    response = requests.get(url, headers=headers, timeout=30)
    # Report the HTTP failure instead of handing an error payload to callers that index
    # into it.
    response.raise_for_status()
    return response.json()

def get_repositories_info(username, token):
    """List a user's repositories with the fields used for data collection.

    Args:
        username: GitHub login whose repositories are listed.
        token: Personal access token sent in the ``Authorization`` header.

    Returns:
        list of dicts with ``repo_name``, ``language``, ``description``, ``tech_tools``
        (the repository topics), ``stargazers_count`` and ``forks_count``.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users/{username}/repos'
    page_size = 100  # largest page size; the API default would undercount stars and forks
    repos_info = []
    page = 1
    while True:
        response = requests.get(
            url, headers=headers, params={'per_page': page_size, 'page': page}, timeout=30
        )
        # An error payload is a dict; iterating it produced the
        # "string indices must be integers" failures seen in the collection log.
        response.raise_for_status()
        repos_data = response.json()
        for repo in repos_data:
            repos_info.append({
                'repo_name': repo['name'],
                'language': repo['language'],
                # The key can be present with a null value, so a .get() default alone
                # would never apply.
                'description': repo.get('description') or 'No description available',
                'tech_tools': repo.get('topics', []),
                'stargazers_count': repo['stargazers_count'],
                'forks_count': repo['forks_count']
            })
        if len(repos_data) < page_size:
            return repos_info
        page += 1

def get_daily_contributions(username, token):
    """Count the user's push events for today and yesterday (UTC dates).

    Args:
        username: GitHub login whose events are read.
        token: Personal access token sent in the ``Authorization`` header.

    Returns:
        dict mapping ``'YYYY-MM-DD'`` to the number of ``PushEvent`` entries created on
        that date. Only today's and yesterday's dates can appear as keys.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    from datetime import timezone  # local import: the notebook imports only datetime/timedelta

    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users/{username}/events'
    response = requests.get(url, headers=headers, timeout=30)
    # An error payload is a dict whose keys are strings; `event['type']` on those
    # raised "string indices must be integers".
    response.raise_for_status()
    events = response.json()

    # `created_at` is reported in UTC, so the window must be computed in UTC too.
    now_utc = datetime.now(timezone.utc)
    recent_days = {
        now_utc.strftime('%Y-%m-%d'),
        (now_utc - timedelta(days=1)).strftime('%Y-%m-%d'),
    }

    contributions = {}
    for event in events:
        if event['type'] != 'PushEvent':
            continue
        event_date = event['created_at'][:10]  # Date part of the ISO timestamp
        if event_date in recent_days:
            contributions[event_date] = contributions.get(event_date, 0) + 1

    return contributions

def maintain_streak(contributions):
    """Return 1 when ``contributions`` has entries for both today and yesterday, else 0.

    Args:
        contributions: dict keyed by ``'YYYY-MM-DD'`` event dates, as returned by
            ``get_daily_contributions``.
    """
    from datetime import timezone  # local import: the notebook imports only datetime/timedelta

    # The keys are taken from GitHub's UTC timestamps, so the dates are computed in UTC.
    now_utc = datetime.now(timezone.utc)
    today = now_utc.strftime('%Y-%m-%d')
    yesterday = (now_utc - timedelta(days=1)).strftime('%Y-%m-%d')

    # Integer flag (1 = streak maintained, 0 = broken) so it can be used as a model feature.
    return int(today in contributions and yesterday in contributions)

def rate_github_profile(username, token):
    """Score a GitHub profile on a 0-10 scale.

    The score is a capped weighted sum: followers (max 4 points, saturating at 1000),
    public repositories (max 3, at 100), total stars (max 2, at 500), total forks
    (max 1, at 200) and one point for a maintained two-day push streak.

    Args:
        username: GitHub login to rate.
        token: Personal access token sent in the ``Authorization`` header.

    Returns:
        The rating rounded to two decimals.

    Raises:
        requests.HTTPError: If the profile request fails.
    """
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users/{username}'
    response = requests.get(url, headers=headers, timeout=30)
    # Without this check an error payload surfaces as KeyError: 'followers'.
    response.raise_for_status()
    profile_data = response.json()

    followers = profile_data['followers']
    public_repos = profile_data['public_repos']

    repos_info = get_repositories_info(username, token)

    total_stars = sum(repo['stargazers_count'] for repo in repos_info)
    total_forks = sum(repo['forks_count'] for repo in repos_info)

    contributions = get_daily_contributions(username, token)
    streak_maintained = maintain_streak(contributions)

    # Normalize each metric to a 0-1 scale before combining
    followers_score = min(followers / 1000, 1.0) * 4  # Max 4 points for followers
    public_repos_score = min(public_repos / 100, 1.0) * 3  # Max 3 points for public repos
    stars_score = min(total_stars / 500, 1.0) * 2  # Max 2 points for stars
    forks_score = min(total_forks / 200, 1.0) * 1  # Max 1 point for forks
    streak_score = 1 if streak_maintained else 0  # Max 1 point for streak

    rating = followers_score + public_repos_score + stars_score + forks_score + streak_score

    return round(min(rating, 10), 2)

def collect_data_for_user(username, token):
    """Gather one training row (profile metrics plus rating) for a GitHub user.

    Args:
        username: GitHub login to collect.
        token: Personal access token forwarded to the helper functions.

    Returns:
        dict with ``profile_name``, ``followers``, ``public_repos``, ``total_stars``,
        ``total_forks``, ``contributions_today``, ``streak_maintained`` and ``rating``,
        or ``None`` when any step fails (the error is printed).
    """
    from datetime import timezone  # local import: the notebook imports only datetime/timedelta

    try:
        profile_info = get_profile_info(username, token)
        repos_info = get_repositories_info(username, token)
        contributions = get_daily_contributions(username, token)
        streak_maintained = maintain_streak(contributions)
        # NOTE(review): rate_github_profile fetches the profile, repositories and events
        # again, so every user costs roughly twice the API calls needed.
        rating = rate_github_profile(username, token)

        # Contribution keys are UTC dates cut from GitHub's timestamps, so today's count
        # is looked up by the UTC date rather than the machine's local date.
        today_utc = datetime.now(timezone.utc).strftime('%Y-%m-%d')

        return {
            'profile_name': profile_info['login'],
            'followers': profile_info['followers'],
            'public_repos': profile_info['public_repos'],
            'total_stars': sum(repo['stargazers_count'] for repo in repos_info),
            'total_forks': sum(repo['forks_count'] for repo in repos_info),
            'contributions_today': contributions.get(today_utc, 0),
            'streak_maintained': streak_maintained,
            'rating': rating
        }
    except Exception as e:
        # Best effort: one failing account must not stop the bulk collection.
        print(f"Error collecting data for {username}: {e}")
        return None

def collect_data(usernames, token):
    """Collect rows for many users concurrently and return them as a DataFrame.

    Each username is handed to ``collect_data_for_user`` on a pool of ten worker
    threads. Rows are gathered in completion order; users whose collection returned
    a falsy result are left out.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(collect_data_for_user, name, token) for name in usernames]
        finished = (job.result() for job in as_completed(pending))
        rows = [row for row in finished if row]
    return pd.DataFrame(rows)

# Collect usernames page by page until 10000 are gathered or a page comes back empty.
usernames, since_id = [], 0
while len(usernames) < 10000:
    batch, since_id = get_github_usernames(token, since_id)
    if not batch:
        break
    usernames += batch
print(f"Collected {len(usernames)} usernames")

# Build the dataset for the collected usernames and preview it.
data = collect_data(usernames, token)
print(data.head())




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Error collecting data for kasumix: string indices must be integers
Error collecting data for dsklyut: string indices must be integers
Error collecting data for jnovak: string indices must be integers
Error collecting data for gakaki: string indices must be integers
Error collecting data for aran: string indices must be integers
Error collecting data for xitrium: string indices must be integers
Error collecting data for patk-zz: string indices must be integers
Error collecting data for dgolub: string indices must be integers
Error collecting data for winton: string indices must be integers
Error collecting data for zsolt: string indices must be integers
Error collecting data for morizo: string indices must be integers
Error collecting data for bkaney: string indices must be integers
Error collecting data for PiTiLeZarD: string indices must be integers
Error collecting data for ooodigi: string indices must be integers
Error

In [None]:
# NOTE(review): in the visible notebook the five *_score names are only assigned as locals
# inside rate_github_profile, so on a fresh kernel this cell presumably raises NameError.
# Confirm whether it is leftover scratch work.
rating = followers_score + public_repos_score + stars_score + forks_score + streak_score

In [None]:
# Preview the first four rows of `data`.
data.head(4)

Unnamed: 0,profile_name,followers,public_repos,total_stars,total_forks,contributions_today,streak_maintained,rating
0,ezmobius,554,22,1574,193,0,0,5.84
1,pjhyett,8273,8,606,147,0,0,6.98
2,mojombo,23884,66,10265,2238,0,0,8.98
3,defunkt,22281,107,8664,1504,0,0,10.0


In [None]:
# Prepare data for model training
# NOTE(review): `rating` is computed by rate_github_profile from followers, public repos,
# stars, forks and the streak flag, which are exactly the remaining feature columns. The
# model can therefore only learn to approximate that formula.
X = data.drop(columns=['rating', 'profile_name', 'contributions_today'])
y = data['rating']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0834, random_state=42)

# Train model (seeded so a re-run reproduces the same forest and the same MSE)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate model
predictions = model.predict(X_test)
print('MSE:', mean_squared_error(y_test, predictions))

# Save the model
joblib.dump(model, 'github_profile_rating_model.pkl')

MSE: 0.19301618843750085


['github_profile_rating_model.pkl']

In [None]:
# Reload the regressor from the file written by joblib.dump in the training cell.
# joblib.load unpickles the file, so only load model files you created yourself.
model = joblib.load('github_profile_rating_model.pkl')

In [None]:
# Feature order follows the training columns: followers, public_repos, total_stars,
# total_forks, streak_maintained. Passing a DataFrame with the names the model was fitted
# on keeps the values tied to the right columns.
sample = pd.DataFrame([[8273, 8, 606, 147, 0]], columns=model.feature_names_in_)
pred = model.predict(sample)




In [None]:
pred  # The rating recorded for this feature row (pjhyett) in the data preview is 6.98

array([7.2348])