In [None]:
''' This script identifies and ranks exceptional users from GitHub profile data using a trained classification model.

1. Loads the pre-trained preprocessor, XGBoost classifier, and label mappings.
2. Reads and combines user profile data from multiple CSV files.
3. Removes duplicate entries based on the 'username' column.
4. Ensures all required feature columns are present in the dataset.
5. Applies the preprocessor to transform numeric features.
6. Predicts user quality labels using the trained classifier.
7. Assigns a quality ranking to each user based on predefined categories.
8. Computes a custom score based on relevant GitHub activity metrics.
9. Selects the top 30 candidates per job role based on quality and score.
10. Saves the final ranked list of candidates to a CSV file. '''


import pandas as pd
import joblib
import os

# Root directory that holds the pickled model artifacts (under Pickle/), the
# per-role input CSVs, and the generated best_candidates_per_role.csv output.
BASE_DIR = "/home/ashwin_jayan/EXTRACT/ML_Model/Files"

# Model input features, in the order they are handed to the preprocessor.
_FEATURE_COLUMNS = [
    'public_repos', 'followers', 'total_stars',
    'total_forks', 'total_issues_opened', 'total_issues_closed',
    'total_commits_last_year', 'total_commits_all_time',
    'avg_issue_close_time', 'contributed_repos', 'code_reviews_count',
    'commit_score', 'feature_1', 'feature_2', 'feature_3',
    'total_pr_merged', 'avg_commits_per_month',
]

# Column used to group candidates; it is required but never fed to the model.
_ROLE_COLUMN = 'job role'

# Column that identifies a user when dropping duplicate rows.
_USER_ID_COLUMN = 'username'

# Lower rank sorts first, so 'exceptional' users lead each role's list.
_QUALITY_ORDER = {'exceptional': 0, 'good': 1, 'bad': 2}

# Per-role input CSV files, relative to BASE_DIR.
_INPUT_FILE_NAMES = (
    "data_science_data.csv",
    "java_developer_data.csv",
    "web_developer_data.csv",
)


def _load_profiles(paths):
    """Read every CSV in *paths*, stack them, and drop repeated usernames."""
    combined = pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)
    return combined.drop_duplicates(subset=[_USER_ID_COLUMN])


def _activity_score(profiles):
    """Return the weighted activity score used to order users within a quality tier."""
    # NOTE(review): the weights below add up to 38 while the divisor is 9; the
    # `30.` weight on followers may be a typo. Left unchanged -- confirm intent.
    return (
        profiles['followers'] * 30. +
        profiles['total_stars'] * 2 +
        profiles['commit_score'] * 3 +
        profiles['code_reviews_count'] * 1 +
        profiles['total_pr_merged'] * 2
    ) / 9


def _top_per_role(profiles, top_n):
    """Return the first *top_n* rows of each job role, best quality and score first."""
    ranked = profiles.sort_values(
        by=['quality_rank', 'score'], ascending=[True, False]
    )
    return ranked.groupby(_ROLE_COLUMN).head(top_n)


def predict_exceptional_users(*, top_n=30):
    """Classify GitHub profiles and save the best candidates per job role.

    Loads the preprocessor, classifier, and label mapping from
    ``BASE_DIR/Pickle``, reads the per-role CSV files from ``BASE_DIR``,
    predicts a quality label for each unique username, scores every user, and
    writes the top candidates of each job role to
    ``BASE_DIR/best_candidates_per_role.csv``. A short summary is printed.

    Args:
        top_n: Maximum number of candidates kept per job role (default 30).

    Raises:
        ValueError: If ``top_n`` is smaller than 1, or if the combined input
            data lacks any required column.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # NOTE: joblib.load unpickles its input, which can execute arbitrary code;
    # only point BASE_DIR at artifacts from a trusted source.
    preprocessor = joblib.load(os.path.join(BASE_DIR, 'Pickle/preprocessor.pkl'))
    clf = joblib.load(os.path.join(BASE_DIR, 'Pickle/xgb_classifier.pkl'))
    label_mapping = joblib.load(os.path.join(BASE_DIR, 'Pickle/label_mapping.pkl'))

    input_files = [os.path.join(BASE_DIR, name) for name in _INPUT_FILE_NAMES]
    new_data = _load_profiles(input_files)

    required_columns = _FEATURE_COLUMNS + [_ROLE_COLUMN]
    missing = set(required_columns) - set(new_data.columns)
    if missing:
        raise ValueError(f"Missing columns in input data: {missing}")

    # Select features by name rather than by slicing the required-column list,
    # so reordering or extending that list cannot leak the role column in.
    X_new = preprocessor.transform(new_data[_FEATURE_COLUMNS])
    pred_labels = clf.predict(X_new)

    new_data['predicted_quality'] = [label_mapping[label] for label in pred_labels]
    new_data['quality_rank'] = new_data['predicted_quality'].map(_QUALITY_ORDER)
    new_data['score'] = _activity_score(new_data)

    best_candidates = _top_per_role(new_data, top_n)

    output_path = os.path.join(BASE_DIR, "best_candidates_per_role.csv")
    best_candidates.to_csv(output_path, index=False)

    print(f"Saved {len(best_candidates)} candidates to {output_path}")
    print("Job roles in output:", best_candidates[_ROLE_COLUMN].unique())

# Run the full prediction-and-ranking pipeline when this cell/script executes.
predict_exceptional_users()


Saved 90 candidates to /home/ashwin_jayan/EXTRACT/ML_Model/Files/best_candidates_per_role.csv
Job roles in output: ['Web Developer' 'Data Science' 'Java Developer']
