In [None]:
'''Cluster GitHub user profiles and train a classifier on the resulting quality labels.

1. Loads the dataset and fills missing values in the cluster-quality columns with 0.
2. Applies a Power Transformer to normalize specific numeric features.
3. Fits Gaussian Mixture Models (GMM) and picks the number of clusters with the lowest BIC.
4. Assigns a "quality label" to each cluster based on aggregated metrics.
5. Splits the data into training and test sets for classification.
6. Trains an XGBoost classifier to predict the quality label.
7. Saves the trained model, preprocessor, label mapping, and per-cluster labels for future use.'''


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
import xgboost as xgb
import joblib
import shap
import matplotlib.pyplot as plt

# Load the training table from its (machine-specific) absolute location.
df = pd.read_csv("/home/ashwin_jayan/EXTRACT/ML_Model/Files/train_data.csv")

# Metrics that are later aggregated per cluster to rank the clusters.
# Missing values in these columns are replaced with 0 before any modelling.
cluster_quality_cols = [
    'commit_score',
    'total_stars',
    'code_reviews_count',
    'total_commits_last_year',
]
df[cluster_quality_cols] = df[cluster_quality_cols].fillna(0)

# Columns reshaped by PowerTransformer (default settings); every other
# feature column is forwarded unchanged through remainder='passthrough'.
# NOTE(review): the passthrough columns must already be numeric for the
# mixture model fitted on X_trans -- confirm against the CSV schema.
power_transform_cols = [
    'public_repos',
    'followers',
    'total_stars',
    'total_forks',
    'commit_score',
]
preprocessor = ColumnTransformer(
    [('power_transform', PowerTransformer(), power_transform_cols)],
    remainder='passthrough',
)

# Columns excluded from the feature matrix.
non_feature_cols = ['job role', 'username', 'email', 'user_url', 'avatar_url']
X = df.drop(columns=non_feature_cols)

# The preprocessor is fitted on every row (no split has happened yet).
X_trans = preprocessor.fit_transform(X)

def find_optimal_clusters(X, max_clusters=8):
    """Return the number of GMM components that minimises the BIC on ``X``.

    One ``GaussianMixture`` (``random_state=42``) is fitted for every
    component count from 1 up to ``max_clusters`` and scored on the same
    data with the Bayesian Information Criterion.

    Args:
        X: Feature matrix of shape (n_samples, n_features), in any form
            accepted by ``GaussianMixture.fit``.
        max_clusters: Largest component count to try (inclusive). Values
            larger than the number of samples are capped at the number of
            samples, because a mixture cannot be fitted with more
            components than samples.

    Returns:
        int: The component count with the lowest BIC. On ties the smallest
        count wins, since ``np.argmin`` returns the first minimum.

    Raises:
        ValueError: If ``max_clusters`` is smaller than 1.
    """
    if max_clusters < 1:
        raise ValueError(
            f"max_clusters must be at least 1, got {max_clusters}"
        )

    # Cap the sweep at the sample count, but always try at least one
    # component so that unusable input still surfaces scikit-learn's own
    # validation error instead of an empty-sequence error from argmin.
    shape = np.shape(X)
    n_samples = shape[0] if shape else 1
    upper = min(max_clusters, max(n_samples, 1))

    bic_scores = [
        GaussianMixture(n_components=n, random_state=42).fit(X).bic(X)
        for n in range(1, upper + 1)
    ]
    # argmin is a 0-based position while component counts start at 1.
    return int(np.argmin(bic_scores)) + 1

# Pick the component count by BIC, then fit the final mixture with that
# count and record each row's cluster id on the dataframe.
optimal_clusters = find_optimal_clusters(X_trans)
gmm = GaussianMixture(random_state=42, n_components=optimal_clusters)
clusters = gmm.fit_predict(X_trans)
df['cluster'] = clusters

# Human-readable names for the three integer quality labels.
label_mapping = {0: 'bad', 1: 'good', 2: 'exceptional'}

# One score per cluster: aggregate each metric within the cluster, then take
# the plain mean across the four aggregated metrics.
# NOTE(review): the metrics are averaged on their raw scales, so the one
# with the largest magnitude dominates the score -- confirm this is intended.
metric_aggregations = {
    'commit_score': 'mean',
    'total_stars': 'mean',
    'code_reviews_count': 'mean',
    'total_commits_last_year': 'median',
}
cluster_quality = df.groupby('cluster').agg(metric_aggregations).mean(axis=1)

# Bucket clusters by score quantile: bottom 20% -> 0, middle 60% -> 1,
# top 20% -> 2.
# NOTE(review): with very few clusters the quantile edges can coincide or
# leave a label unused, which would break the 3-class setup downstream.
quality_labels = pd.qcut(
    cluster_quality,
    q=[0, 0.2, 0.8, 1],
    labels=[0, 1, 2],
)

# Propagate each cluster's label to all rows belonging to that cluster.
cluster_to_label = quality_labels.to_dict()
df['quality_label'] = df['cluster'].map(cluster_to_label)

# Hold out 20% of the rows for evaluation, stratified on the quality label
# so both splits keep the same class proportions.
# NOTE(review): X_trans comes from a preprocessor fitted on all rows, and the
# labels are derived from clusters of these same features, so the reported
# test accuracy is optimistic -- confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans, df['quality_label'].astype(int),
    test_size=0.2, stratify=df['quality_label'],
    random_state=42
)

# Three-class gradient-boosted classifier. `use_label_encoder` is
# intentionally not passed: current XGBoost releases ignore it and emit a
# "Parameters: { "use_label_encoder" } are not used." warning.
clf = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42
)
clf.fit(X_train, y_train)

# Persist the integer-label -> name mapping (0/1/2 -> bad/good/exceptional).
# NOTE(review): all output paths below are absolute and machine-specific.
joblib.dump(label_mapping, '/home/ashwin_jayan/EXTRACT/ML_Model/Files/Pickle/label_mapping.pkl')

# Report accuracy on the held-out split and the share of each class in it.
print("Classifier Validation Report:")
print("Test Accuracy:", clf.score(X_test, y_test))
print("Class Distribution:\n", y_test.value_counts(normalize=True))

# Persist the fitted preprocessor, the trained classifier, and the
# per-cluster quality labels (a Series indexed by cluster id).
joblib.dump(preprocessor, '/home/ashwin_jayan/EXTRACT/ML_Model/Files/Pickle/preprocessor.pkl')
joblib.dump(clf, '/home/ashwin_jayan/EXTRACT/ML_Model/Files/Pickle/xgb_classifier.pkl')
joblib.dump(quality_labels, '/home/ashwin_jayan/EXTRACT/ML_Model/Files/Pickle/cluster_labels.pkl')

Classifier Validation Report:
Test Accuracy: 0.9708029197080292
Class Distribution:
 quality_label
0    0.532847
2    0.255474
1    0.211679
Name: proportion, dtype: float64


Parameters: { "use_label_encoder" } are not used.



['/home/ashwin_jayan/EXTRACT/ML_Model/Files/Pickle/cluster_labels.pkl']