In [386]:
# Constants

# Single place to change when the project is moved to another machine/folder.
PROJECT_DIR = 'C:\\Users\\gt\\End to End Data Science Project'

# Cleaned dataset pickle (not read by the cells visible in this section).
DATA_PATH = PROJECT_DIR + '\\Processed data\\cleaned_data.pkl'

# Identify the MLflow run whose artifact directory is assembled further down
# as TRACKING_URI / EXPERIMENT_ID / RUN_ID / 'artifacts'.
TRACKING_URI = PROJECT_DIR + "\\mlruns\\"
EXPERIMENT_ID = "2"
RUN_ID = "2d620efffdac4b9db4adad274b4a7ff9"

MLFLOW_EXPERIMENT_NAME = "skills_jobs_Stackoverflow"

# File names of the pickled artifacts inside the run's artifact directory.
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"


In [387]:
import os 
import sklearn
import pickle
import yaml

import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient

## Initialize

### Load model


In [388]:
# Directory holding the artifacts of the selected run:
# <tracking uri>/<experiment id>/<run id>/artifacts
artifact_path = os.path.join(TRACKING_URI, EXPERIMENT_ID, RUN_ID, 'artifacts')

In [389]:
# Unpickle the data artifact stored in the run's artifact directory.
# NOTE(review): pickle.load can execute arbitrary code from the file; this
# assumes the artifact was produced by this project — confirm before sharing.
data_path = os.path.join(artifact_path, LOG_DATA_PKL)
with open(data_path, mode='rb') as data_file:
    data_pkl = pickle.load(data_file)

In [390]:
# Unpickle the model artifact and pull the estimator out of the bundle.
model_path = os.path.join(artifact_path, LOG_MODEL_PKL)
with open(model_path, mode='rb') as model_file:
    model_pkl = pickle.load(model_file)

# The pickled dict keeps the estimator under the "model_object" key.
model = model_pkl["model_object"]
model

Pipeline(steps=[('robustscaler', RobustScaler()),
                ('pca', PCA(n_components=0.95)),
                ('randomforestclassifier',
                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])

## Predict sample entry


In [391]:
# YAML file mapping each skills cluster name to the skills it contains.
CLUSTERS_YAML_PATH = (
    'C:\\Users\\gt\\End to End Data Science Project'
    '\\Processed data\\features_skills_clusters_description.yaml'
)

In [392]:
# Parse the cluster description file with the safe YAML loader.
with open(CLUSTERS_YAML_PATH, mode="r") as yaml_file:
    clusters_config = yaml.safe_load(yaml_file)

In [393]:
# Flatten {cluster name: [skills]} into one (cluster name, skill) pair per skill.
molten_clusters = [
    (name, skill)
    for name, skills in clusters_config.items()
    for skill in skills
]

# Long-format lookup table: one row per skill with the cluster it belongs to.
clusters_df = pd.DataFrame(data=molten_clusters,
                           columns=["cluster_name", "skill"])
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,Assembly
1,skills_group_0,C
2,skills_group_0,C++
3,skills_group_0,Julia
4,skills_group_0,Matlab
...,...,...
120,skills_group_9,Elasticsearch
121,skills_group_9,Redis
122,skills_group_9,DigitalOcean
123,skills_group_9,Apache Spark


## Recreate cluster features


In [394]:
# Skills of the sample entry to score.
sample_skills = [
    'Pandas',
    'TensorFlow',
    'Torch/PyTorch',
    'Python',
    'Keras',
]


In [407]:
# Verify that these skills are in our dataset.
# Check against the feature names stored in the loaded data pickle: the
# `features_names` Series is only created in a later cell, so referencing it
# here fails on a fresh kernel (Restart & Run All).
pd.Series(sample_skills).isin(data_pkl["features_names"])

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [395]:
# Work on an independent copy so the cluster lookup table stays untouched.
sample_clusters = clusters_df.copy(deep=True)


In [396]:
# Flag every row whose skill is one of the sample entry's skills.
sample_clusters["sample_skills"] = (
    sample_clusters["skill"].isin(values=sample_skills)
)


In [397]:
# Per cluster, count how many of its skills appear in the sample entry
# (summing the boolean flag column yields the count).
cluster_features = (
    sample_clusters
    .groupby("cluster_name")["sample_skills"]
    .sum()
)
cluster_features


cluster_name
skills_group_0     5
skills_group_1     0
skills_group_10    0
skills_group_11    0
skills_group_2     0
skills_group_3     0
skills_group_4     0
skills_group_5     0
skills_group_6     0
skills_group_7     0
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

## Create OneHotEncoded skills


In [398]:
# Feature names stored with the data pickle, wrapped in a Series so that
# Series methods such as .isin are available in the following cells.
features_names = pd.Series(data=data_pkl["features_names"])
features_names

0                 APL
1            Assembly
2          Bash/Shell
3                   C
4                  C#
            ...      
132    skills_group_5
133    skills_group_6
134    skills_group_7
135    skills_group_8
136    skills_group_9
Length: 137, dtype: object

In [399]:
# Keep only the feature names that are not cluster names, i.e. the raw skills.
skills_names = features_names.loc[
    ~features_names.isin(cluster_features.index)
]
skills_names

0                     APL
1                Assembly
2              Bash/Shell
3                       C
4                      C#
              ...        
120                   Vim
121         Visual Studio
122    Visual Studio Code
123              Webstorm
124                 Xcode
Length: 125, dtype: object

In [400]:
# Re-display the sample skills that are one-hot encoded in the next cell
sample_skills

['Pandas', 'TensorFlow', 'Torch/PyTorch', 'Python', 'Keras']

In [401]:
# One-hot encode the sample entry: 1 where the skill is among the sample
# skills, 0 otherwise, labelled by skill name.
ohe_skills = pd.Series(
    data=[int(is_selected) for is_selected in skills_names.isin(sample_skills)],
    index=skills_names,
)
ohe_skills

APL                   0
Assembly              0
Bash/Shell            0
C                     0
C#                    0
                     ..
Vim                   0
Visual Studio         0
Visual Studio Code    0
Webstorm              0
Xcode                 0
Length: 125, dtype: int64

In [402]:
# Inspect the cluster labels that index the per-cluster counts
cluster_features.index

Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_2', 'skills_group_3', 'skills_group_4',
       'skills_group_5', 'skills_group_6', 'skills_group_7', 'skills_group_8',
       'skills_group_9'],
      dtype='object', name='cluster_name')

## Combine features


In [403]:
# Stack the one-hot skill flags and the per-cluster counts into a single
# feature Series (skills first, then clusters).
features = pd.concat(objs=[ohe_skills, cluster_features], axis=0)
# Displayed wrapped in a list, so the output is the list's plain-text repr.
[features]

[APL               0
 Assembly          0
 Bash/Shell        0
 C                 0
 C#                0
                  ..
 skills_group_5    0
 skills_group_6    0
 skills_group_7    0
 skills_group_8    0
 skills_group_9    0
 Length: 137, dtype: int64]

In [404]:
# Put the features in the order stored in the data pickle; label-based
# selection raises a KeyError if any stored feature name is missing.
features = features.loc[data_pkl["features_names"]]
features

APL               0
Assembly          0
Bash/Shell        0
C                 0
C#                0
                 ..
skills_group_5    0
skills_group_6    0
skills_group_7    0
skills_group_8    0
skills_group_9    0
Length: 137, dtype: int64

## Predict

In [405]:
# Score the single sample: present the feature vector as one row
# (1 x n_features) to the fitted pipeline.
predictions = model.predict_proba(features.to_numpy().reshape(1, -1))
predictions

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


[array([[0.62, 0.38]]),
 array([[0.87, 0.13]]),
 array([[0.41, 0.59]]),
 array([[0.97, 0.03]]),
 array([[1., 0.]]),
 array([[0.96, 0.04]]),
 array([[0.9, 0.1]]),
 array([[0.95, 0.05]]),
 array([[0.95, 0.05]]),
 array([[0.98, 0.02]]),
 array([[0.97, 0.03]]),
 array([[0.99, 0.01]]),
 array([[0.99, 0.01]]),
 array([[0.87, 0.13]]),
 array([[0.77, 0.23]]),
 array([[1., 0.]])]

In [406]:
# For each entry of `predictions`, take row 0 (the single sample) and
# column 1 of the probability array.
positive_probs = [target_probs[0, 1] for target_probs in predictions]

# Label each probability with its target name and rank from highest to lowest.
(
    pd.Series(data=positive_probs, index=data_pkl["targets_names"])
    .sort_values(ascending=False)
)

Data scientist or machine learning specialist    0.59
Academic researcher                              0.38
Scientist                                        0.23
Data or business analyst                         0.13
Engineer, data                                   0.13
Developer, back-end                              0.10
Developer, desktop or enterprise applications    0.05
Developer, embedded applications or devices      0.05
Developer, QA or test                            0.04
Database administrator                           0.03
Developer, full-stack                            0.03
Developer, front-end                             0.02
Developer, game or graphics                      0.01
Developer, mobile                                0.01
DevOps specialist                                0.00
System administrator                             0.00
dtype: float64