In [1]:
import sys
import os

# 'src' is resolved against the current working directory, so SCRIPT_DIR is
# the working directory itself; its parent is appended to the import path
# (presumably so the `src` package imported further down can be found).
SCRIPT_DIR = os.path.split(os.path.abspath('src'))[0]
sys.path.append(os.path.split(SCRIPT_DIR)[0])


In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from src.utils import MlflowUtils
import sklearn.metrics as skm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline


In [3]:
# Directory (relative to the notebook's working directory) holding the
# interim data; the trailing slash matters because the file name is
# concatenated onto it directly.
LOAD_PATH = '../data/interim/'
# File name of the pickled DataFrame loaded below.
LOAD_SKILLS_DEV = '7.0-Chosen_features_and_roles.pkl'


# Load Data

In [4]:
# NOTE(review): unpickling executes arbitrary code; only load this file if it
# was produced by a trusted step of the project.
skills_dev_df = pd.read_pickle(f"{LOAD_PATH}{LOAD_SKILLS_DEV}")

In [5]:
# Display the loaded frame (the rich repr truncates rows and columns).
skills_dev_df

Unnamed: 0_level_0,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,...,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType
Unnamed: 0_level_1,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,...,full_stack_Java,full_stack_.JavaScript,full_stack_PHP,full_stack_python,back_end_Java,back_end_.JavaScript,back_end_.NET,back_end_C++,back_end_python,back_end_PHP
2,0.0,0.0,0.0,0.0,0.75,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,0.5,0.0,0.5,1.0,0.00,1.00,0.0,0.0,0.0,0.0,...,1,1,1,0,0,0,0,0,0,0
9,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10,0.0,0.0,1.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0.0,0.5,0.0,0.5,0.00,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73263,0.0,0.0,1.5,0.0,0.00,0.00,0.0,0.0,0.0,1.0,...,0,0,0,0,0,1,0,1,1,1
73264,0.0,0.0,1.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73265,0.0,0.0,0.0,0.0,0.25,0.00,0.0,0.0,0.0,0.0,...,0,0,1,1,0,0,0,0,0,0


In [6]:
# Take the 'DevType' column group (the role indicator columns) and total each
# column to see how many rows carry each role.
roles_df = skills_dev_df['DevType']
roles_df.sum()

Academic researcher                               1692
Blockchain                                         693
Cloud infrastructure engineer                     2620
Data or business analyst                          1517
Data scientist or machine learning specialist     2210
DevOps specialist                                 3142
Developer_back-end                               18692
Developer_desktop or enterprise applications      5442
Developer_embedded applications or devices        2073
Developer_front-end                               9888
Developer_full-stack                             20928
Developer_game or graphics                         945
Developer_mobile                                  4516
Engineer_data                                     1982
Engineer_site reliability                          817
Scientist                                         1002
full_stack_.NET                                   7816
full_stack_Java                                   8158
full_stack

In [7]:
# One row per role with its frequency, plus a weight inversely proportional
# to that frequency: total / (number_of_roles * freq).
role_totals = skills_dev_df['DevType'].sum()
jobs_freq = role_totals.reset_index()
jobs_freq.columns = ['job_type', 'freq']
jobs_freq['class_weights'] = (
    jobs_freq['freq'].sum()
    / (jobs_freq['job_type'].count() * jobs_freq['freq'])
)
jobs_freq


Unnamed: 0,job_type,freq,class_weights
0,Academic researcher,1692,3.724783
1,Blockchain,693,9.094276
2,Cloud infrastructure engineer,2620,2.405471
3,Data or business analyst,1517,4.154472
4,Data scientist or machine learning specialist,2210,2.851735
5,DevOps specialist,3142,2.005835
6,Developer_back-end,18692,0.337167
7,Developer_desktop or enterprise applications,5442,1.158091
8,Developer_embedded applications or devices,2073,3.040199
9,Developer_front-end,9888,0.637372


In [8]:
# Per-row weight = sum of the class weights of every role flagged on that row.
# The weight vector broadcasts across the rows of the indicator matrix.
role_indicator_matrix = skills_dev_df['DevType'].values
role_weight_vector = jobs_freq['class_weights'].values
sample_weights = pd.Series(
    (role_weight_vector * role_indicator_matrix).sum(axis=1),
    index=skills_dev_df.index,
    name='weights',
)
sample_weights

2        14.382885
3         1.107481
7         2.171847
9         2.553648
10        2.552070
           ...    
73262     0.637372
73263     3.506578
73264     2.851735
73265     3.736381
73266     1.795463
Name: weights, Length: 50423, dtype: float64

In [9]:
# Targets: the 'DevType' column group. Features: every other column group,
# with the top column level removed so feature names are flat.
y = skills_dev_df['DevType']
job_names = y.columns.tolist()
X = (
    skills_dev_df
    .drop(columns='DevType', level=0)
    .droplevel(level=0, axis=1)
)
# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [10]:
# Base XGBoost classifier; MultiOutputClassifier fits one copy per target column.
model = xgb.XGBClassifier(
    n_estimators=75,
    max_depth=6,
    learning_rate=.3,
    gamma=2.5,
    alpha=1,
    min_child_weight=1.4,
)
# Step names are load-bearing: fit parameters are addressed as 'model__<param>'.
model_pipeline = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('model', MultiOutputClassifier(model)),
])

# Candidate values for a grid search over the wrapped estimator's parameters.
param_grid = dict(
    model__estimator__max_depth=[2, 3, 5, 7, 10],
    model__estimator__n_estimators=[50, 75, 100, 200, 500],
)

# Grid search is currently disabled; to enable it, search over `model_pipeline`:
# grid = GridSearchCV(model_pipeline, param_grid, cv=5, n_jobs=-1, scoring='roc_auc', verbose=2)

In [11]:
# Select the weights of the training rows by index label and hand them to the
# 'model' step through the pipeline's '<step>__<param>' fit-parameter routing.
model_pipeline.fit(
    X_train,
    Y_train,
    model__sample_weight=sample_weights.loc[Y_train.index],
)

In [12]:
def confusion_matrix_scores(confusion_matrix):
    """Derive precision, recall, F1 and accuracy from a binary confusion matrix.

    Parameters
    ----------
    confusion_matrix : array-like of shape (2, 2)
        Counts laid out as ``[[tn, fp], [fn, tp]]`` (the layout implied by the
        indexing below: row = true class, column = predicted class).

    Returns
    -------
    dict
        Keys ``'precision'``, ``'recall'``, ``'f1_score'`` and ``'accuracy'``
        mapped to floats. Any ratio whose denominator is zero (for example
        recall for a label with no positive samples) is reported as ``0.0``
        instead of ``nan``, so averages over many labels stay finite.
    """
    cm = np.asarray(confusion_matrix)
    tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]

    def _safe_ratio(numerator, denominator):
        # Guard against 0/0, which would otherwise yield nan plus a warning.
        return float(numerator / denominator) if denominator else 0.0

    precision = _safe_ratio(tp, tp + fp)   # tp / (tp + fp)
    recall = _safe_ratio(tp, tp + fn)      # tp / (tp + fn)
    accuracy = _safe_ratio(tp + tn, cm.sum())

    # F1 is the harmonic mean; defined as 0 when either component is 0.
    if recall and precision:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return {'precision': precision, 'recall': recall, 'f1_score': f1, 'accuracy': accuracy}

In [13]:
def train_evaluate_model_features():
    """Fit a scaled multi-output XGBoost pipeline and score it on both splits.

    Uses the notebook-level ``X_train``, ``X_test``, ``Y_train``, ``Y_test``,
    ``sample_weights`` and ``job_names``. For each of the train and test
    splits it prints the mean of the per-label F1 scores.

    Returns
    -------
    tuple
        ``(fitted_pipeline, report)`` where ``report`` is a DataFrame indexed
        by job name whose columns are a ``(split, metric)`` MultiIndex.
    """
    booster = xgb.XGBClassifier(min_child_weight=1.4, gamma=2.5, alpha=1,
                                learning_rate=.1, max_depth=3, n_estimators=200)
    new_pipeline = make_pipeline(StandardScaler(), MultiOutputClassifier(booster))
    # make_pipeline names each step after its lowercased class name, hence the
    # 'multioutputclassifier__' prefix on the fit parameter.
    new_pipeline.fit(
        X_train,
        Y_train,
        multioutputclassifier__sample_weight=sample_weights[Y_train.index],
    )

    evaluation_splits = (('train', X_train, Y_train), ('test', X_test, Y_test))
    split_reports = []
    for split_name, features, targets in evaluation_splits:
        # One 2x2 confusion matrix per target column, in column order.
        per_label_matrices = skm.multilabel_confusion_matrix(targets, new_pipeline.predict(features))
        per_label_scores = [confusion_matrix_scores(matrix) for matrix in per_label_matrices]

        print(split_name + '_f1-score: ', np.array([scores['f1_score'] for scores in per_label_scores]).mean())

        report = pd.DataFrame(dict(zip(job_names, per_label_scores))).T
        report.columns = pd.MultiIndex.from_product([[split_name], report.columns])
        split_reports.append(report)

    # Join the per-split reports side by side on the job-name index.
    all_classification_report = split_reports[0].copy()
    for report in split_reports[1:]:
        all_classification_report = all_classification_report.merge(report, left_index=True, right_index=True)

    return new_pipeline, all_classification_report

In [14]:
# Train the pipeline and collect the per-role train/test metric table; the
# call also prints the mean F1 score for each split.
pipline_XGBoost_model, classification_report = train_evaluate_model_features()


train_f1-score:  0.5708741601963788
test_f1-score:  0.5104158573195702


In [15]:
# Add a 'Mean' row holding the column-wise average of every metric; the
# logging cell further down reads this row.
classification_report.loc['Mean'] = classification_report.mean(axis=0)
classification_report

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.603316,0.399493,0.480691,0.971045,0.542763,0.324803,0.406404,0.968136
Blockchain,0.570978,0.390086,0.463508,0.988129,0.427481,0.244541,0.311111,0.983605
Cloud infrastructure engineer,0.549156,0.305019,0.392199,0.951439,0.418953,0.208178,0.278146,0.942355
Data or business analyst,0.567742,0.250712,0.347826,0.971951,0.465969,0.19181,0.271756,0.968467
Data scientist or machine learning specialist,0.691556,0.626027,0.657162,0.970705,0.58194,0.555024,0.568163,0.965029
DevOps specialist,0.61242,0.259293,0.364331,0.94345,0.539846,0.224359,0.316981,0.940173
Developer_back-end,0.575781,0.756139,0.653748,0.703337,0.562458,0.744439,0.640778,0.689958
Developer_desktop or enterprise applications,0.667671,0.233465,0.345959,0.905088,0.608187,0.189435,0.288889,0.89846
Developer_embedded applications or devices,0.672154,0.343619,0.454756,0.96671,0.616162,0.282844,0.387712,0.96179
Developer_front-end,0.637999,0.444284,0.523805,0.84103,0.578765,0.398368,0.471915,0.826601


# Save logs

In [16]:
# MlflowUtils is a project helper (src.utils); the comments below describe
# only what is passed to it. Confirm its behaviour against its source.
mlflow_utils_original_features = MlflowUtils(
    artifact_temp='../models/temp/basic_XGBoost_model',
)

# Record which data file, which rows, and which columns the model was built on.
mlflow_utils_original_features.save_data(
    path=LOAD_SKILLS_DEV,
    training_indices=X_train.index,
    testing_indices=X_test.index,
    target_names=job_names,
    features_names=list(X.columns),
)

# Record the fitted pipeline together with its string representation.
mlflow_utils_original_features.save_model_data(
    name='basic_XGBoost_model',
    details=str(pipline_XGBoost_model),
    model_object=pipline_XGBoost_model,
)

# Record the full per-role metric table.
mlflow_utils_original_features.save_matrices(classification_report)

# The run-level metrics are the 'Mean' row of the test-split columns.
XGBoost_metrics = classification_report['test'].loc['Mean', :]
original_features_metrics = pd.Series(XGBoost_metrics).to_dict()
mlflow_utils_original_features.save_run_details(
    '3.basic_XGBoost_Model',
    metrics=XGBoost_metrics,
)

In [17]:
# Show the runs recorded so far, to compare this model with earlier ones.
MlflowUtils.get_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.f1_score,metrics.accuracy,metrics.recall,metrics.precision,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.source.name
0,1322b09ddb9d426cad3ac3daf4660537,958618704808840933,FINISHED,file:///E:/M_Hassan/AI_Work/StackOverflow_skil...,2023-08-02 15:51:23.276000+00:00,2023-08-02 15:51:23.315000+00:00,0.510416,0.902808,0.509404,0.555285,3.basic_XGBoost_Model,LOCAL,mhassan,C:\Users\salshahed\AppData\Roaming\Python\Pyth...
1,43195e239dba4a019673403231ba12a5,958618704808840933,FINISHED,file:///E:/M_Hassan/AI_Work/StackOverflow_skil...,2023-08-02 15:49:13.770000+00:00,2023-08-02 15:49:13.802000+00:00,0.376317,0.893628,0.326518,0.507593,2.basic_model_with_clustered_features,LOCAL,mhassan,C:\Users\salshahed\AppData\Roaming\Python\Pyth...
2,e2c46294be34495486dc6d762fafe67c,958618704808840933,FINISHED,file:///E:/M_Hassan/AI_Work/StackOverflow_skil...,2023-08-02 15:49:13.701000+00:00,2023-08-02 15:49:13.732000+00:00,0.450255,0.898233,0.409184,0.535965,1.basic_model_with_original_features,LOCAL,mhassan,C:\Users\salshahed\AppData\Roaming\Python\Pyth...
