In [1]:
import sys
import os

SCRIPT_DIR = os.path.dirname(os.path.abspath('src'))
sys.path.append(os.path.dirname(SCRIPT_DIR))

In [2]:
# Load packages
import pandas as pd
import numpy as np

import sklearn.metrics as skm
import sklearn.preprocessing as skp
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from src.utils import MlflowUtils
from sklearn.pipeline import make_pipeline

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff


In [3]:
LOAD_PATH = '../data/interim/'
LOAD_SKILLS_DEV = '6.0-Engineered_data-split_roles-cluster_skills.pkl'
SAVE_DF_NAME = '7.0-Chosen_features_and_roles.pkl'


# Load Data

In [4]:
skills_dev_df = pd.read_pickle(LOAD_PATH + LOAD_SKILLS_DEV)

In [5]:
skills_dev_df

Unnamed: 0_level_0,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,...,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType
Unnamed: 0_level_1,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,...,full_stack_Java,full_stack_.JavaScript,full_stack_PHP,full_stack_python,back_end_Java,back_end_.JavaScript,back_end_.NET,back_end_C++,back_end_python,back_end_PHP
2,0.0,0.0,0.0,0.0,0.75,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,0.5,0.0,0.5,1.0,0.00,1.00,0.0,0.0,0.0,0.0,...,1,1,1,0,0,0,0,0,0,0
9,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10,0.0,0.0,1.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0.0,0.5,0.0,0.5,0.00,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73263,0.0,0.0,1.5,0.0,0.00,0.00,0.0,0.0,0.0,1.0,...,0,0,0,0,0,1,0,1,1,1
73264,0.0,0.0,1.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73265,0.0,0.0,0.0,0.0,0.25,0.00,0.0,0.0,0.0,0.0,...,0,0,1,1,0,0,0,0,0,0


# 1. Choosing features
we need to choose between original features or clusters features, which one gives us the best results in classification to proceed with them in hero model

we will use class weights formula:
# $$ w_j=\frac{n\_samples}{(n\_classes * n\_samples_j)} $$

In [6]:
jobs_freq = skills_dev_df['DevType'].sum().reset_index()
jobs_freq.columns = ['job_type', 'freq']
jobs_freq.loc[:, 'class_weights'] = jobs_freq['freq'].sum() / (jobs_freq['job_type'].count() * jobs_freq['freq'])
jobs_freq

Unnamed: 0,job_type,freq,class_weights
0,Academic researcher,1692,3.253805
1,Blockchain,693,7.944354
2,Cloud infrastructure engineer,2620,2.101312
3,Data or business analyst,1517,3.629161
4,Data scientist or machine learning specialist,2210,2.491148
5,Database administrator,1405,3.918461
6,DevOps specialist,3142,1.752208
7,Developer_QA or test,1297,4.244747
8,Developer_back-end,18692,0.294534
9,Developer_desktop or enterprise applications,5442,1.011657


In [7]:
sample_weights = (jobs_freq['class_weights'].values * skills_dev_df['DevType'].values).sum(axis = 1)
sample_weights = pd.Series(sample_weights, index=skills_dev_df.index, name='weights')
sample_weights

2        12.564248
3         0.967446
7         1.897228
9         2.230753
10        2.229374
           ...    
73262     0.556780
73263     3.063190
73264     2.491148
73265     6.369090
73266     1.568437
Name: weights, Length: 50423, dtype: float64

------------

In [8]:
X = skills_dev_df.drop('DevType', axis = 1, level=0).droplevel(axis = 1, level = 0)
y = skills_dev_df['DevType']
clustered_features = skills_dev_df['clustered_skills'].columns
original_features = X.drop(columns=clustered_features).columns

In [9]:
X_train, X_test, Y_train, Y_test= train_test_split(X, y, test_size=0.3, random_state=0)
# scaler = skp.StandardScaler()
# scaler.fit(X_train.values)
# X_train = pd.DataFrame(scaler.transform(X_train.values), columns=X_train.columns)
# X_test = pd.DataFrame(scaler.transform(X_test.values), columns = X_train.columns)
# X_train

In [10]:
# sample_weights = sample_weights[Y_train.index]
# sample_weights= sample_weights / sample_weights.sum()
# sample_weights

In [11]:
def confusion_matrix_scores(confusion_matrix):
    # tp / tp + fn
    recall = confusion_matrix[1,1] / (confusion_matrix[1,1] + confusion_matrix[1,0])
    accuracy = (confusion_matrix[1,1] + confusion_matrix[0,0])/ confusion_matrix.sum().sum()

    if not (confusion_matrix[1, 1] + confusion_matrix[0, 1]):
        precision = 0.0
    else:
        precision =  confusion_matrix[1,1] / (confusion_matrix[1,1] + confusion_matrix[0,1])

    if recall and precision:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return {'precision' : precision, 'recall' : recall, 'f1_score' : f1 , 'accuracy' : accuracy}

In [12]:
features_types = ({'original_features' : original_features  , 'clustered_features' : clustered_features})
job_names =jobs_freq['job_type'].values


In [13]:
def train_evaluate_model_features(chosen_features='original_features'):
    all_classification_report = []
    features = features_types[chosen_features]
    X_train_features = X_train[features].copy()
    X_test_features = X_test[features].copy()
    model = make_pipeline(skp.StandardScaler(), MultiOutputClassifier(LogisticRegression()))
    model.fit(X_train_features, Y_train, multioutputclassifier__sample_weight=sample_weights[Y_train.index])
    for j, evaluate_type in enumerate(['train', 'test']):
        classification_report = {}
        f1_scores = []
        X_ = X_test_features if j else X_train_features
        y_ = Y_test if j else Y_train
        multilabel_confusion_matricies = skm.multilabel_confusion_matrix(y_, model.predict(X_))
        # print(multilabel_confusion_matricies)
        for n, cm in enumerate(multilabel_confusion_matricies):
            results = confusion_matrix_scores(cm)
            classification_report[job_names[n]] = results
            f1_scores.append(results['f1_score'])

        print(chosen_features + '-' + evaluate_type + '_f1-score: ', np.array(f1_scores).mean())
        classification_report = pd.DataFrame(classification_report).T

        classification_report.columns = pd.MultiIndex.from_product([[chosen_features + '-' + evaluate_type], classification_report.columns])
        if isinstance(all_classification_report, pd.DataFrame):
            all_classification_report = all_classification_report.merge(classification_report, left_index=True, right_index=True)
        else:
            all_classification_report = classification_report.copy()


    return model, all_classification_report



In [14]:
original_features_model, classification_report_original_features = train_evaluate_model_features('original_features')

original_features-train_f1-score:  0.3972966796399631
original_features-test_f1-score:  0.38754417764115096


In [15]:
clustered_features_model, classification_report_clustered_features = train_evaluate_model_features('clustered_features')

clustered_features-train_f1-score:  0.32063302026340895
clustered_features-test_f1-score:  0.3195561466830471


In [16]:
classification_report_original_features.loc['Mean',:] = classification_report_original_features.mean()
classification_report_original_features

Unnamed: 0_level_0,original_features-train,original_features-train,original_features-train,original_features-train,original_features-test,original_features-test,original_features-test,original_features-test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.463415,0.224662,0.302617,0.965265,0.468619,0.220472,0.299866,0.965426
Blockchain,0.441176,0.258621,0.326087,0.985947,0.418803,0.213974,0.283237,0.983605
Cloud infrastructure engineer,0.429501,0.218423,0.28958,0.944951,0.363636,0.173482,0.234899,0.93971
Data or business analyst,0.428125,0.130104,0.199563,0.968863,0.392857,0.118534,0.182119,0.967343
Data scientist or machine learning specialist,0.64845,0.502211,0.566038,0.965464,0.591603,0.494418,0.538662,0.964897
Database administrator,0.26087,0.030151,0.054054,0.970252,0.26,0.031707,0.056522,0.97131
DevOps specialist,0.493235,0.181777,0.265651,0.937188,0.494475,0.191239,0.275809,0.937859
Developer_QA or test,0.333333,0.001112,0.002217,0.974501,0.0,0.0,0.0,0.973557
Developer_back-end,0.591518,0.614549,0.602814,0.700051,0.589631,0.619327,0.604114,0.698486
Developer_desktop or enterprise applications,0.612589,0.182082,0.280723,0.899677,0.587368,0.169399,0.262959,0.896609


In [17]:
classification_report_clustered_features.loc['Mean',:] = classification_report_clustered_features.mean()
classification_report_clustered_features

Unnamed: 0_level_0,clustered_features-train,clustered_features-train,clustered_features-train,clustered_features-train,clustered_features-test,clustered_features-test,clustered_features-test,clustered_features-test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.389685,0.114865,0.17743,0.964274,0.426667,0.125984,0.194529,0.964963
Blockchain,0.19403,0.028017,0.048964,0.985692,0.142857,0.017467,0.031128,0.983539
Cloud infrastructure engineer,0.4,0.144512,0.212318,0.944923,0.32526,0.116481,0.171533,0.939975
Data or business analyst,0.353591,0.060779,0.103728,0.968665,0.4,0.060345,0.104869,0.968401
Data scientist or machine learning specialist,0.635284,0.473152,0.542361,0.964189,0.608696,0.4689,0.52973,0.965492
Database administrator,0.111111,0.001005,0.001992,0.971612,0.25,0.002439,0.004831,0.972764
DevOps specialist,0.459302,0.071623,0.123922,0.936707,0.369128,0.058761,0.101382,0.935546
Developer_QA or test,0.0,0.0,0.0,0.97453,0.0,0.0,0.0,0.973689
Developer_back-end,0.573663,0.575155,0.574408,0.684327,0.570347,0.576437,0.573376,0.681364
Developer_desktop or enterprise applications,0.617918,0.123584,0.205973,0.897552,0.647059,0.126897,0.212183,0.897402



## 1. From the above results we will choose original features as our features
## 2. we need to also Exclude roles that have f1-score less than 0.1

In [18]:
drop_roles = classification_report_original_features[classification_report_original_features['original_features-test']['recall'] < .1].index
drop_roles

Index(['Database administrator', 'Developer_QA or test',
       'Security professional', 'Senior Executive (C-Suite_VP_etc.)',
       'System administrator'],
      dtype='object')

In [19]:
classification_report_original_features = classification_report_original_features.drop(drop_roles)
classification_report_original_features.loc['Mean',:] = classification_report_original_features.mean()
classification_report_original_features

Unnamed: 0_level_0,original_features-train,original_features-train,original_features-train,original_features-train,original_features-test,original_features-test,original_features-test,original_features-test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.463415,0.224662,0.302617,0.965265,0.468619,0.220472,0.299866,0.965426
Blockchain,0.441176,0.258621,0.326087,0.985947,0.418803,0.213974,0.283237,0.983605
Cloud infrastructure engineer,0.429501,0.218423,0.28958,0.944951,0.363636,0.173482,0.234899,0.93971
Data or business analyst,0.428125,0.130104,0.199563,0.968863,0.392857,0.118534,0.182119,0.967343
Data scientist or machine learning specialist,0.64845,0.502211,0.566038,0.965464,0.591603,0.494418,0.538662,0.964897
DevOps specialist,0.493235,0.181777,0.265651,0.937188,0.494475,0.191239,0.275809,0.937859
Developer_back-end,0.591518,0.614549,0.602814,0.700051,0.589631,0.619327,0.604114,0.698486
Developer_desktop or enterprise applications,0.612589,0.182082,0.280723,0.899677,0.587368,0.169399,0.262959,0.896609
Developer_embedded applications or devices,0.587762,0.255961,0.356619,0.962687,0.561462,0.261206,0.35654,0.959675
Developer_front-end,0.616557,0.390297,0.478004,0.832247,0.588267,0.374915,0.457961,0.827395


In [20]:
classification_report_clustered_features = classification_report_clustered_features.drop(drop_roles)
classification_report_clustered_features.loc['Mean',:] = classification_report_clustered_features.mean()
classification_report_clustered_features

Unnamed: 0_level_0,clustered_features-train,clustered_features-train,clustered_features-train,clustered_features-train,clustered_features-test,clustered_features-test,clustered_features-test,clustered_features-test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.389685,0.114865,0.17743,0.964274,0.426667,0.125984,0.194529,0.964963
Blockchain,0.19403,0.028017,0.048964,0.985692,0.142857,0.017467,0.031128,0.983539
Cloud infrastructure engineer,0.4,0.144512,0.212318,0.944923,0.32526,0.116481,0.171533,0.939975
Data or business analyst,0.353591,0.060779,0.103728,0.968665,0.4,0.060345,0.104869,0.968401
Data scientist or machine learning specialist,0.635284,0.473152,0.542361,0.964189,0.608696,0.4689,0.52973,0.965492
DevOps specialist,0.459302,0.071623,0.123922,0.936707,0.369128,0.058761,0.101382,0.935546
Developer_back-end,0.573663,0.575155,0.574408,0.684327,0.570347,0.576437,0.573376,0.681364
Developer_desktop or enterprise applications,0.617918,0.123584,0.205973,0.897552,0.647059,0.126897,0.212183,0.897402
Developer_embedded applications or devices,0.551724,0.145863,0.230727,0.960704,0.55618,0.153014,0.24,0.958551
Developer_front-end,0.5966,0.333429,0.42778,0.824456,0.582385,0.330387,0.421601,0.823693


In [21]:
job_names = classification_report_clustered_features[:-1].index
job_names = list(job_names)
job_names

['Academic researcher',
 'Blockchain',
 'Cloud infrastructure engineer',
 'Data or business analyst',
 'Data scientist or machine learning specialist',
 'DevOps specialist',
 'Developer_back-end',
 'Developer_desktop or enterprise applications',
 'Developer_embedded applications or devices',
 'Developer_front-end',
 'Developer_full-stack',
 'Developer_game or graphics',
 'Developer_mobile',
 'Engineer_data',
 'Engineer_site reliability',
 'Scientist',
 'full_stack_.NET',
 'full_stack_Java',
 'full_stack_.JavaScript',
 'full_stack_PHP',
 'full_stack_python',
 'back_end_Java',
 'back_end_.JavaScript',
 'back_end_.NET',
 'back_end_C++',
 'back_end_python',
 'back_end_PHP']

# 6. Concat skills_df with DevType

In [22]:
skills_df = skills_dev_df.drop('clustered_skills', axis = 1, level=0).drop('DevType', axis = 1, level = 0)

In [23]:
dev_df = skills_dev_df['DevType'].drop(drop_roles, axis = 1)
dev_df.columns = pd.MultiIndex.from_product([['DevType'],job_names])


In [24]:
skills_dev_df = pd.concat([skills_df, dev_df], axis=1, join='inner')
skills_dev_df

Unnamed: 0_level_0,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,...,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType
Unnamed: 0_level_1,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,...,full_stack_Java,full_stack_.JavaScript,full_stack_PHP,full_stack_python,back_end_Java,back_end_.JavaScript,back_end_.NET,back_end_C++,back_end_python,back_end_PHP
2,0.0,0.0,0.0,0.0,0.75,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,0.5,0.0,0.5,1.0,0.00,1.00,0.0,0.0,0.0,0.0,...,1,1,1,0,0,0,0,0,0,0
9,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10,0.0,0.0,1.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0.0,0.5,0.0,0.5,0.00,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73263,0.0,0.0,1.5,0.0,0.00,0.00,0.0,0.0,0.0,1.0,...,0,0,0,0,0,1,0,1,1,1
73264,0.0,0.0,1.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73265,0.0,0.0,0.0,0.0,0.25,0.00,0.0,0.0,0.0,0.0,...,0,0,1,1,0,0,0,0,0,0


In [25]:
skills_dev_df.to_pickle(LOAD_PATH + SAVE_DF_NAME)


# Save logs

## 1. original features

In [26]:
classification_report_original_features

Unnamed: 0_level_0,original_features-train,original_features-train,original_features-train,original_features-train,original_features-test,original_features-test,original_features-test,original_features-test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.463415,0.224662,0.302617,0.965265,0.468619,0.220472,0.299866,0.965426
Blockchain,0.441176,0.258621,0.326087,0.985947,0.418803,0.213974,0.283237,0.983605
Cloud infrastructure engineer,0.429501,0.218423,0.28958,0.944951,0.363636,0.173482,0.234899,0.93971
Data or business analyst,0.428125,0.130104,0.199563,0.968863,0.392857,0.118534,0.182119,0.967343
Data scientist or machine learning specialist,0.64845,0.502211,0.566038,0.965464,0.591603,0.494418,0.538662,0.964897
DevOps specialist,0.493235,0.181777,0.265651,0.937188,0.494475,0.191239,0.275809,0.937859
Developer_back-end,0.591518,0.614549,0.602814,0.700051,0.589631,0.619327,0.604114,0.698486
Developer_desktop or enterprise applications,0.612589,0.182082,0.280723,0.899677,0.587368,0.169399,0.262959,0.896609
Developer_embedded applications or devices,0.587762,0.255961,0.356619,0.962687,0.561462,0.261206,0.35654,0.959675
Developer_front-end,0.616557,0.390297,0.478004,0.832247,0.588267,0.374915,0.457961,0.827395


In [27]:
mlflow_utils_original_features = MlflowUtils(artifact_temp='../models/temp/basic_model_original_features')
mlflow_utils_original_features.save_data(path=LOAD_SKILLS_DEV,
                                                 training_indices=X_train.index,
                                                 testing_indices=X_test.index,
                                                 target_names=job_names,
                                                 features_names=original_features)

mlflow_utils_original_features.save_model_data(name='basic_model_original_features',
                                               details=str(original_features_model),
                                               model_object=original_features_model)

mlflow_utils_original_features.save_matrices(classification_report_original_features)

original_features_metrics = classification_report_original_features['original_features-test'].loc['Mean',:]
original_features_metrics = pd.Series(original_features_metrics).to_dict()
mlflow_utils_original_features.save_run_details('1.basic_model_with_original_features', metrics=original_features_metrics)


2023/08/08 15:22:48 INFO mlflow.tracking.fluent: Experiment with name 'skills_rec_analysis' does not exist. Creating a new experiment.


In [28]:
mlflow_utils_clustered_features = MlflowUtils(artifact_temp='../models/temp/basic_model_clustered_features')
mlflow_utils_clustered_features.save_data(path=LOAD_SKILLS_DEV,
                                                 training_indices=X_train.index,
                                                 testing_indices=X_test.index,
                                                 target_names=job_names,
                                                 features_names=clustered_features)

mlflow_utils_clustered_features.save_model_data(name='basic_model_clustered',
                                               details=str(clustered_features_model),
                                               model_object=clustered_features_model)

mlflow_utils_clustered_features.save_matrices(classification_report_clustered_features)

clustered_features_metrics = classification_report_clustered_features['clustered_features-test'].loc['Mean',:]
clustered_features_metrics = pd.Series(clustered_features_metrics).to_dict()


mlflow_utils_clustered_features.save_run_details('2.basic_model_with_clustered_features', metrics=dict(clustered_features_metrics))

In [29]:
MlflowUtils.get_run_names()

['2.basic_model_with_clustered_features',
 '1.basic_model_with_original_features']

In [30]:
MlflowUtils.get_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,metrics.f1_score,metrics.precision,metrics.recall,tags.mlflow.source.type,tags.mlflow.source.name,tags.mlflow.runName,tags.mlflow.user
0,f6cf7dfa383f4a89811010639556c9d7,907184637109318169,FINISHED,file:///E:/M_Hassan/AI_Work/StackOverflow_skil...,2023-08-08 13:22:48.741000+00:00,2023-08-08 13:22:48.772000+00:00,0.893628,0.376317,0.507593,0.326518,LOCAL,C:\Users\salshahed\AppData\Roaming\Python\Pyth...,2.basic_model_with_clustered_features,mhassan
1,f0c59d4307f649b1a93cd2fb922448a4,907184637109318169,FINISHED,file:///E:/M_Hassan/AI_Work/StackOverflow_skil...,2023-08-08 13:22:48.672000+00:00,2023-08-08 13:22:48.710000+00:00,0.898233,0.450255,0.535965,0.409184,LOCAL,C:\Users\salshahed\AppData\Roaming\Python\Pyth...,1.basic_model_with_original_features,mhassan


In [31]:
run_id = MlflowUtils.get_run_id_by_metrix(metrix_name='recall')
run_id

'f6cf7dfa383f4a89811010639556c9d7'

In [32]:
logged_data = MlflowUtils.fetch_logged_data(tracking_uri='../models/runs', run_id=run_id)

In [33]:
logged_data.keys()

dict_keys(['metrics', 'data_details', 'matrices', 'model'])

In [34]:
model=logged_data['model']['model_object']

# as we see from the above dataframe, original features outperform clustered features in all metricises especially recall & f1_score that we care more
## 1. so will use the original features not all the clustered features
## 2. And we will use These metricises as reference for any further complex model