In [1]:
import sys
import os

# abspath('src') resolves against the current working directory, so its
# head is the working directory itself; the working directory's parent is
# then put on sys.path (presumably so the project-local imports below resolve).
SCRIPT_DIR = os.path.split(os.path.abspath('src'))[0]
sys.path.append(os.path.split(SCRIPT_DIR)[0])

In [2]:
# Load packages
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import sklearn.metrics as skm
import sklearn.preprocessing as skp
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline

from modelingUtlis import ModelingUtils
from src.utils import MlflowUtils


In [3]:
# Directory used both for reading the input pickle and for writing the outputs.
LOAD_PATH = '../data/interim/'

# Input: engineered skills/roles frame that still contains the clustered skills.
LOAD_SKILLS_DEV = '6.0-Engineered_data-split_roles-cluster_skills.pkl'

# Outputs: the frame restricted to the chosen features and roles (pickle + CSV).
SAVE_DF_NAME = '7.0-Chosen_features_and_roles.pkl'
SAVE_DF_NAME_CSV = '7.0-Chosen_features_and_roles.csv'

# Load Data

In [4]:
# Read the engineered frame from the interim data directory.
# Pickle files execute code on load: only read files produced by this project.
skills_dev_df = pd.read_pickle(f'{LOAD_PATH}{LOAD_SKILLS_DEV}')

In [5]:
# Preview the loaded frame (two-level columns; pandas truncates the display).
skills_dev_df

Unnamed: 0_level_0,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,...,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType
Unnamed: 0_level_1,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,...,full_stack_Java,full_stack_.JavaScript,full_stack_PHP,full_stack_python,back_end_Java,back_end_.JavaScript,back_end_.NET,back_end_C++,back_end_python,back_end_PHP
2,0.0,0.0,0.0,0.0,0.75,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,0.5,0.0,0.5,1.0,0.00,1.00,0.0,0.0,0.0,0.0,...,1,1,1,0,0,0,0,0,0,0
9,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10,0.0,0.0,1.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0.0,0.5,0.0,0.5,0.00,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73263,0.0,0.0,1.5,0.0,0.00,0.00,0.0,0.0,0.0,1.0,...,0,0,0,0,0,1,0,1,1,1
73264,0.0,0.0,1.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73265,0.0,0.0,0.0,0.0,0.25,0.00,0.0,0.0,0.0,0.0,...,0,0,1,1,0,0,0,0,0,0


# 1. Choosing features
We need to choose between the original features and the clustered features: whichever gives the better classification results will be used for the hero model.

We will use the class-weight formula:
# $$ w_j=\frac{n\_samples}{(n\_classes * n\_samples_j)} $$

In [8]:
# Baseline: standardise the features, then fit one logistic regression per
# target column (MultiOutputClassifier wraps the single-output estimator).
# NOTE(review): LogisticRegression is built with its defaults here, i.e. no
# class_weight is passed, although the text above announces class weights —
# confirm whether ModelingUtils applies them or whether this is intended.
basic_model = make_pipeline(
    skp.StandardScaler(),
    MultiOutputClassifier(estimator=LogisticRegression()),
)


In [9]:
# Original-feature experiment: every column group except 'clustered_skills'
# (the 'DevType' group is kept as part of the frame handed to ModelingUtils).
original_df = skills_dev_df.drop('clustered_skills', level = 0, axis = 1)

# Hand this experiment its own unfitted copy of the baseline pipeline.
# `basic_model` is also used by the clustered-feature experiment; if
# ModelingUtils fits the estimator it receives in place (not visible here),
# sharing one instance would make both "fitted models" the same object,
# refit by whichever experiment ran last.
original_modeling_utils = ModelingUtils(original_df, clone(basic_model))
original_features_model, classification_report_original_features = (
    original_modeling_utils.train_evaluate_model_features()
)

train_f1-score:  0.3960388235971532
test_f1-score:  0.39148719761665596


In [10]:
# Clustered-feature experiment: remove every original (non-DevType) column
# from the full frame, leaving the 'clustered_skills' and 'DevType' groups.
drop_columns = original_df.drop('DevType', level=0, axis=1).columns
clustered_df = skills_dev_df.drop(drop_columns, axis = 1)

# Use a fresh unfitted copy of the baseline pipeline so this run cannot
# overwrite (or be overwritten by) the estimator of the original-feature run,
# should ModelingUtils fit the estimator it is given in place.
clustered_modeling_utils = ModelingUtils(clustered_df, clone(basic_model))
clustered_features_model, classification_report_clustered_features = (
    clustered_modeling_utils.train_evaluate_model_features()
)

train_f1-score:  0.3203918078121181
test_f1-score:  0.3183727014803502


In [11]:
# Append a 'Mean' row holding the column-wise average of every metric
# over the rows currently in the report, then display the report.
classification_report_original_features.loc['Mean', :] = (
    classification_report_original_features.mean(axis=0)
)
classification_report_original_features

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.451994,0.223848,0.299413,0.964498,0.452941,0.236923,0.311111,0.966194
Blockchain,0.432836,0.262206,0.326577,0.985175,0.485714,0.242857,0.32381,0.985922
Cloud infrastructure engineer,0.429658,0.215958,0.28744,0.944442,0.3861,0.189753,0.254453,0.941905
Data or business analyst,0.393064,0.113617,0.17628,0.96849,0.434343,0.134375,0.205251,0.966987
Data scientist or machine learning specialist,0.642804,0.498569,0.561573,0.966283,0.613941,0.4946,0.547847,0.962526
Database administrator,0.284404,0.027169,0.0496,0.970547,0.269231,0.026515,0.048276,0.972638
DevOps specialist,0.50258,0.192642,0.278524,0.93745,0.469027,0.172638,0.252381,0.937742
Developer_QA or test,0.333333,0.000968,0.001931,0.974365,0.5,0.003788,0.007519,0.973828
Developer_back-end,0.592902,0.62167,0.606946,0.70173,0.597698,0.609333,0.60346,0.70229
Developer_desktop or enterprise applications,0.6,0.175177,0.27118,0.898056,0.595808,0.185116,0.28247,0.899772


In [12]:
# Append a 'Mean' row holding the column-wise average of every metric
# over the rows currently in the report, then display the report.
classification_report_clustered_features.loc['Mean', :] = (
    classification_report_clustered_features.mean(axis=0)
)
classification_report_clustered_features

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.389549,0.121033,0.184685,0.96413,0.46,0.136499,0.210526,0.965689
Blockchain,0.177215,0.026316,0.045827,0.985558,0.15,0.018634,0.033149,0.982596
Cloud infrastructure engineer,0.386454,0.139035,0.204498,0.943916,0.333333,0.127135,0.184066,0.940925
Data or business analyst,0.375587,0.065681,0.11181,0.968515,0.388889,0.070234,0.11898,0.96907
Data scientist or machine learning specialist,0.636295,0.472067,0.542014,0.964625,0.585139,0.45,0.508748,0.9637
Database administrator,0.285714,0.001794,0.003565,0.972305,0.333333,0.003448,0.006826,0.971059
DevOps specialist,0.453039,0.065365,0.114246,0.937005,0.45098,0.07267,0.12517,0.936052
Developer_QA or test,0.0,0.0,0.0,0.974039,0.0,0.0,0.0,0.975236
Developer_back-end,0.573565,0.569994,0.571774,0.684081,0.583893,0.579228,0.581551,0.688712
Developer_desktop or enterprise applications,0.609921,0.124482,0.206765,0.897171,0.636771,0.129562,0.215315,0.897066



## 1. From the above results, we will choose the original features as our features
## 2. We also need to exclude roles whose test recall is less than 0.1

In [13]:
# Roles to exclude: rows of the original-feature report whose recall on the
# test split is below 0.1. The result is an Index of row labels.
low_test_recall = classification_report_original_features[('test', 'recall')] < .1
drop_roles = classification_report_original_features.loc[low_test_recall].index
drop_roles

Index(['Database administrator', 'Developer_QA or test',
       'Security professional', 'Senior Executive (C-Suite_VP_etc.)',
       'System administrator'],
      dtype='object')

In [14]:
# Remove the excluded roles AND the previously appended 'Mean' row before
# recomputing the average. Without dropping the old 'Mean' row, it would be
# averaged in as if it were a role, biasing the new mean towards the value
# computed over all roles (including the ones just excluded).
# errors='ignore' only covers the case where no 'Mean' row exists yet.
classification_report_original_features = (
    classification_report_original_features
    .drop(drop_roles)
    .drop('Mean', errors='ignore')
)

# Re-append 'Mean' as the last row: column-wise average over the kept roles only.
classification_report_original_features.loc['Mean',:] = classification_report_original_features.mean()
classification_report_original_features

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.451994,0.223848,0.299413,0.964498,0.452941,0.236923,0.311111,0.966194
Blockchain,0.432836,0.262206,0.326577,0.985175,0.485714,0.242857,0.32381,0.985922
Cloud infrastructure engineer,0.429658,0.215958,0.28744,0.944442,0.3861,0.189753,0.254453,0.941905
Data or business analyst,0.393064,0.113617,0.17628,0.96849,0.434343,0.134375,0.205251,0.966987
Data scientist or machine learning specialist,0.642804,0.498569,0.561573,0.966283,0.613941,0.4946,0.547847,0.962526
DevOps specialist,0.50258,0.192642,0.278524,0.93745,0.469027,0.172638,0.252381,0.937742
Developer_back-end,0.592902,0.62167,0.606946,0.70173,0.597698,0.609333,0.60346,0.70229
Developer_desktop or enterprise applications,0.6,0.175177,0.27118,0.898056,0.595808,0.185116,0.28247,0.899772
Developer_embedded applications or devices,0.594223,0.260084,0.361809,0.962217,0.580488,0.288835,0.385737,0.962427
Developer_front-end,0.612361,0.381277,0.469948,0.830995,0.593042,0.373598,0.458412,0.828294


In [15]:
# Remove the excluded roles AND the previously appended 'Mean' row before
# recomputing the average. Without dropping the old 'Mean' row, it would be
# averaged in as if it were a role, biasing the new mean towards the value
# computed over all roles (including the ones just excluded).
# errors='ignore' only covers the case where no 'Mean' row exists yet.
classification_report_clustered_features = (
    classification_report_clustered_features
    .drop(drop_roles)
    .drop('Mean', errors='ignore')
)

# Re-append 'Mean' as the last row (later cells rely on it being last when
# slicing it off with [:-1]): column-wise average over the kept roles only.
classification_report_clustered_features.loc['Mean',:] = classification_report_clustered_features.mean()
classification_report_clustered_features

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.389549,0.121033,0.184685,0.96413,0.46,0.136499,0.210526,0.965689
Blockchain,0.177215,0.026316,0.045827,0.985558,0.15,0.018634,0.033149,0.982596
Cloud infrastructure engineer,0.386454,0.139035,0.204498,0.943916,0.333333,0.127135,0.184066,0.940925
Data or business analyst,0.375587,0.065681,0.11181,0.968515,0.388889,0.070234,0.11898,0.96907
Data scientist or machine learning specialist,0.636295,0.472067,0.542014,0.964625,0.585139,0.45,0.508748,0.9637
DevOps specialist,0.453039,0.065365,0.114246,0.937005,0.45098,0.07267,0.12517,0.936052
Developer_back-end,0.573565,0.569994,0.571774,0.684081,0.583893,0.579228,0.581551,0.688712
Developer_desktop or enterprise applications,0.609921,0.124482,0.206765,0.897171,0.636771,0.129562,0.215315,0.897066
Developer_embedded applications or devices,0.552106,0.15,0.235907,0.960043,0.568807,0.150121,0.237548,0.960418
Developer_front-end,0.597786,0.328143,0.423702,0.825332,0.601869,0.323781,0.421053,0.823869


In [16]:
# Kept role names: every row label of the clustered report except the last
# one (the appended 'Mean' row), as a plain Python list.
job_names = classification_report_clustered_features.index[:-1].tolist()
job_names

['Academic researcher',
 'Blockchain',
 'Cloud infrastructure engineer',
 'Data or business analyst',
 'Data scientist or machine learning specialist',
 'DevOps specialist',
 'Developer_back-end',
 'Developer_desktop or enterprise applications',
 'Developer_embedded applications or devices',
 'Developer_front-end',
 'Developer_full-stack',
 'Developer_game or graphics',
 'Developer_mobile',
 'Engineer_data',
 'Engineer_site reliability',
 'Scientist',
 'full_stack_.NET',
 'full_stack_Java',
 'full_stack_.JavaScript',
 'full_stack_PHP',
 'full_stack_python',
 'back_end_Java',
 'back_end_.JavaScript',
 'back_end_.NET',
 'back_end_C++',
 'back_end_python',
 'back_end_PHP']

# 6. Concat skills_df with DevType

In [17]:
# Feature frame only: remove the 'clustered_skills' and 'DevType' top-level
# column groups, keeping the original skill columns.
skills_df = skills_dev_df.drop(columns=['clustered_skills', 'DevType'], level=0)

In [18]:
# Target frame: the 'DevType' columns minus the excluded roles. Selecting the
# top-level group yields single-level columns (the role names).
dev_df = skills_dev_df['DevType'].drop(drop_roles, axis = 1)

# The columns are relabelled positionally from `job_names`, which comes from
# the classification report rather than from this frame. Verify that both
# list the same roles in the same order, otherwise targets would be silently
# attached to the wrong role names.
if list(dev_df.columns) != job_names:
    raise ValueError(
        "DevType columns do not match job_names (names or order differ); "
        "refusing to relabel the target columns positionally."
    )

# Restore the two-level column layout ('DevType', <role>) for the concat below.
dev_df.columns = pd.MultiIndex.from_product([['DevType'],job_names])


In [19]:
# Put features and kept targets side by side; join='inner' keeps only the row
# labels present in both frames.
# NOTE(review): this rebinds `skills_dev_df`, so the cells above that drop
# 'clustered_skills' from it cannot be re-run without reloading the data.
skills_dev_df = pd.concat(
    [skills_df, dev_df],
    axis='columns',
    join='inner',
)
skills_dev_df

Unnamed: 0_level_0,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,...,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType
Unnamed: 0_level_1,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,...,full_stack_Java,full_stack_.JavaScript,full_stack_PHP,full_stack_python,back_end_Java,back_end_.JavaScript,back_end_.NET,back_end_C++,back_end_python,back_end_PHP
2,0.0,0.0,0.0,0.0,0.75,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,0.5,0.0,0.5,1.0,0.00,1.00,0.0,0.0,0.0,0.0,...,1,1,1,0,0,0,0,0,0,0
9,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10,0.0,0.0,1.0,0.0,1.50,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0.0,0.5,0.0,0.5,0.00,0.75,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73263,0.0,0.0,1.5,0.0,0.00,0.00,0.0,0.0,0.0,1.0,...,0,0,0,0,0,1,0,1,1,1
73264,0.0,0.0,1.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
73265,0.0,0.0,0.0,0.0,0.25,0.00,0.0,0.0,0.0,0.0,...,0,0,1,1,0,0,0,0,0,0


In [18]:
# Persist the reduced frame as a pickle in the interim data directory.
skills_dev_df.to_pickle(f'{LOAD_PATH}{SAVE_DF_NAME}')


In [20]:
# Persist the same frame as CSV in the interim data directory.
skills_dev_df.to_csv(f'{LOAD_PATH}{SAVE_DF_NAME_CSV}')

# Save logs

## 1. original features

In [19]:
# Show the original-feature report (kept roles plus the 'Mean' row) that is
# passed to save_results below.
classification_report_original_features

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,precision,recall,f1_score,accuracy,precision,recall,f1_score,accuracy
Academic researcher,0.462963,0.220426,0.298656,0.96507,0.39881,0.202417,0.268537,0.963808
Blockchain,0.440729,0.272556,0.336818,0.985845,0.363636,0.223602,0.276923,0.981358
Cloud infrastructure engineer,0.431099,0.21778,0.289375,0.943626,0.353571,0.200405,0.255814,0.942885
Data or business analyst,0.405914,0.124485,0.190536,0.968194,0.37037,0.131579,0.194175,0.96708
Data scientist or machine learning specialist,0.63893,0.485601,0.551813,0.965368,0.656863,0.457859,0.539597,0.965989
DevOps specialist,0.504124,0.193357,0.279508,0.937503,0.472103,0.179445,0.260047,0.937928
Developer_back-end,0.596681,0.621157,0.608673,0.703134,0.578693,0.615301,0.596436,0.694596
Developer_desktop or enterprise applications,0.591538,0.166781,0.2602,0.897516,0.649231,0.194829,0.299716,0.902231
Developer_embedded applications or devices,0.592643,0.265406,0.366625,0.96274,0.625616,0.292627,0.398744,0.962023
Developer_front-end,0.617082,0.387913,0.476369,0.833829,0.582219,0.364892,0.448621,0.819633


In [20]:
# Feature names for each experiment: second-level labels of every column
# outside the 'DevType' group.
original_features = original_df.drop(columns='DevType', level=0).columns.droplevel(0)
clustered_features = clustered_df.drop(columns='DevType', level=0).columns.droplevel(0)

# Record the original-feature run through the project's ModelingUtils helper.
original_modeling_utils.save_results(
    all_classification_report=classification_report_original_features,
    model_name='Logistic-Regression_Original-Features_1.0.0',
    data_path=LOAD_SKILLS_DEV,
    features=original_features,
)

# Record the clustered-feature run with the same input data path.
clustered_modeling_utils.save_results(
    all_classification_report=classification_report_clustered_features,
    model_name='Logistic-Regression_Clustered-Features_1.0.0',
    data_path=LOAD_SKILLS_DEV,
    features=clustered_features,
)

2023/08/17 12:25:36 INFO mlflow.tracking.fluent: Experiment with name 'skills_rec_analysis' does not exist. Creating a new experiment.


In [21]:
# List the recorded runs via the project's MlflowUtils helper so the two
# logged models can be compared side by side.
MlflowUtils.get_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,metrics.recall,metrics.f1_score,metrics.precision,tags.mlflow.user,tags.mlflow.source.type,tags.mlflow.runName,tags.mlflow.source.name
0,b4928cc3e3bf4f21b8b9138310447d2c,735982700926372623,FINISHED,file:///D:/protofolio_projects/Dev_skills_reco...,2023-08-17 09:25:36.899000+00:00,2023-08-17 09:25:36.934000+00:00,0.894298,0.324855,0.378904,0.522859,mahmo,LOCAL,Logistic-Regression_Clustered-Features_1.0.0,C:\Users\mahmo\anaconda3\envs\Rec-skills\lib\s...
1,91321ce64e8646d7a8e28dac0a5d8deb,735982700926372623,FINISHED,file:///D:/protofolio_projects/Dev_skills_reco...,2023-08-17 09:25:36.843000+00:00,2023-08-17 09:25:36.877000+00:00,0.897505,0.404598,0.446315,0.531665,mahmo,LOCAL,Logistic-Regression_Original-Features_1.0.0,C:\Users\mahmo\anaconda3\envs\Rec-skills\lib\s...


### As we see from the dataframe above, the original features outperform the clustered features in all metrics, especially recall and f1_score, which we care about most
#### 1. So we will use the original features rather than the clustered features
#### 2. And we will use these metrics as a reference for any further, more complex model