In [11]:
import pickle 
import yaml
import pandas as pd
import mlflow
from mlflow import MlflowClient
import os

In [12]:
# File names of the pickled artifacts stored in the MLflow run's artifact directory.
LOG_DATA_PKL='data.pkl'
LOG_MODEL_PKL='model.pkl'
# NOTE(review): not read by any cell visible in this notebook.
LOG_METRICS_PKL='metrics.pkl'

# Raw strings keep the Windows-style backslashes literal; without the r-prefix
# '\m' is an invalid escape sequence that newer Python versions warn about.
mlflow_url=r'..\models\mlruns'
features_skills_clusters=r'..\data\processed\features_skills_clusters_description.yaml'
# Id of the MLflow run whose artifacts (model + feature/target names) are loaded.
MLFLOW_RUN_ID='0f114561eea94a65b2b21f93b58af2d8'


In [13]:
class prediction:
    """Query a pickled classifier logged in an MLflow run with a list of skills.

    On construction the object reads a YAML file describing skill clusters and
    loads the model object plus its feature/target names from the run's
    artifact directory.

    Attributes:
        track_url: MLflow tracking URI passed to ``mlflow.set_tracking_uri``.
        run_id: Id of the MLflow run whose artifacts are loaded.
        feature_skills_clusters: Path of the YAML file read by ``clusters``.
        cluster_df: DataFrame with columns ``skills`` and ``clusters``.
        model: Object stored under ``'model_object'`` in the model pickle.
        feature_names: Value stored under ``'features_names'`` in the data pickle.
        target_names: Value stored under ``'targets_names'`` in the data pickle.
    """

    # Prefix stripped from the run's artifact URI so that the remainder can be
    # used as a filesystem path.
    # NOTE(review): presumably the location the run was originally logged from;
    # confirm before relocating the mlruns directory.
    _ARTIFACT_URI_PREFIX = "file:///c:/Users/Mohamed Mosaad/Downloads/first_git/data_science_project/notebooks/"

    def __init__(self,track_url,run_id,feature_skills_clusters):
        """Store the configuration, then load the cluster table and the model.

        Args:
            track_url: MLflow tracking URI.
            run_id: Id of the MLflow run to load artifacts from.
            feature_skills_clusters: Path to the cluster description YAML file.
        """
        self.track_url = track_url
        self.run_id = run_id
        self.feature_skills_clusters = feature_skills_clusters

        self.cluster_df = self.clusters()
        self.model, self.feature_names, self.target_names = self.load_mlflow_objs()

    def _local_artifact_dir(self, artifact_uri):
        """Turn the run's artifact URI into a path usable with ``open``."""
        stripped = artifact_uri.replace(self._ARTIFACT_URI_PREFIX, '')
        if stripped.startswith("file:"):
            # The known prefix did not match; fall back to a generic
            # file-URI -> local-path conversion instead of failing in open().
            from urllib.parse import urlparse
            from urllib.request import url2pathname
            return url2pathname(urlparse(stripped).path)
        return stripped

    def load_mlflow_objs(self):
        """Load the model object and the feature/target names from the run.

        Sets the global MLflow tracking URI to ``self.track_url`` as a side
        effect.

        Returns:
            Tuple ``(model_object, features_names, targets_names)`` taken from
            the model and data pickles in the run's artifact directory.
        """
        mlflow.set_tracking_uri(self.track_url)
        run = MlflowClient().get_run(self.run_id)
        artifact_dir = self._local_artifact_dir(run.info.artifact_uri)

        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # artifact directories you trust.
        with open(os.path.join(artifact_dir, LOG_DATA_PKL), 'rb') as data_file:
            data_pkl = pickle.load(data_file)
        with open(os.path.join(artifact_dir, LOG_MODEL_PKL), 'rb') as model_file:
            model_pkl = pickle.load(model_file)

        return (
            model_pkl['model_object'],
            data_pkl['features_names'],
            data_pkl['targets_names'],
        )

    def clusters(self):
        """Read the cluster YAML and flatten it into a skill/cluster table.

        The YAML document is iterated as a mapping from cluster name to a
        sequence of skills.

        Returns:
            DataFrame with one row per (skill, cluster) pair and the columns
            ``skills`` and ``clusters``. An empty YAML file yields an empty
            table.
        """
        with open(self.feature_skills_clusters, 'r') as yaml_file:
            # safe_load returns None for an empty document.
            skills_by_cluster = yaml.safe_load(yaml_file) or {}

        rows = [
            (skill, cluster)
            for cluster, skills in skills_by_cluster.items()
            for skill in skills
        ]
        return pd.DataFrame(rows, columns=["skills", 'clusters'])

    def clusters_with_skills(self,input:list):
        """Build the feature vector for a list of skills.

        Args:
            input: Skills to encode.

        Returns:
            Series indexed by feature name: first a 0/1 flag for every entry of
            ``self.feature_names`` that is not a cluster name, followed by, for
            each cluster, the number of its skills present in ``input``.
        """
        # Work on a copy so the shared cluster table is not modified per call.
        flagged = self.cluster_df.assign(freq=self.cluster_df['skills'].isin(input))
        cluster_counts = flagged.groupby('clusters')['freq'].sum()

        feature_series = pd.Series(self.feature_names)
        skill_features = feature_series[~feature_series.isin(cluster_counts.index)]
        skill_flags = pd.Series(
            skill_features.isin(input).astype(int).to_list(),
            index=skill_features,
        )

        # NOTE(review): cluster counts are appended in sorted cluster-name
        # order; confirm this matches the column order used at training time.
        return pd.concat([skill_flags, cluster_counts])

    def predict_jop(self,input):
        """Score every target for the given skills.

        Args:
            input: Skills to encode and score.

        Returns:
            Series indexed by ``self.target_names`` holding, per target, the
            value at ``[0][1]`` of the corresponding ``predict_proba`` entry,
            sorted in descending order.
        """
        feature_row = [self.clusters_with_skills(input).values]
        # Assumes predict_proba yields one (n_samples, 2)-shaped entry per
        # target, in the order of target_names — TODO confirm model type.
        per_target_proba = self.model.predict_proba(feature_row)
        positive_proba = [proba[0][1] for proba in per_target_proba]
        scores = pd.Series(positive_proba, index=self.target_names)
        return scores.sort_values(ascending=False)

    def suggest_skills_for_role(self,skills,role,threshold=0):
        """List skills whose addition raises the score of ``role`` enough.

        For each non-cluster feature that is not already in ``skills`` the
        score of ``role`` is recomputed with that skill added. The skill is
        kept when the relative gain ``(new - base) / base`` exceeds
        ``threshold``. When the baseline score is zero the relative gain is
        unbounded, so any positive gain qualifies.

        Args:
            skills: Skills the person already has; not modified.
            role: Target name to optimise for; must be in ``self.target_names``.
            threshold: Minimum relative gain required to suggest a skill.

        Returns:
            List of suggested skills in feature order.

        Raises:
            KeyError: If ``role`` is not one of the predicted targets.
        """
        current_skills = list(skills)
        baseline = self.predict_jop(current_skills)[role]

        cluster_names = self.cluster_df['clusters'].unique()
        feature_series = pd.Series(self.feature_names)
        candidate_skills = feature_series[~feature_series.isin(cluster_names)]

        skills_needed = []
        for skill in candidate_skills:
            if skill in current_skills:
                # Already owned: adding it again cannot change the score.
                continue
            gain = self.predict_jop(current_skills + [skill])[role] - baseline
            if baseline == 0:
                improves = gain > 0
            else:
                improves = gain / baseline > threshold
            if improves:
                skills_needed.append(skill)

        return skills_needed
            

        

In [14]:
# Build the predictor from the tracking URI, run id and cluster YAML path
# defined in the configuration cell.
c = prediction(
    track_url=mlflow_url,
    run_id=MLFLOW_RUN_ID,
    feature_skills_clusters=features_skills_clusters,
)

In [15]:
# Show the model object loaded at construction time; it is the first element
# of load_mlflow_objs(), so there is no need to re-read the pickles from disk.
c.model

In [16]:
# Re-read the cluster YAML and display the flattened skill -> cluster table.
c.clusters()

Unnamed: 0,skills,clusters
0,Julia,feature_0
1,Matlab,feature_0
2,Python,feature_0
3,R,feature_0
4,Django,feature_0
...,...,...
92,Crystal,feature_8
93,Delphi,feature_8
94,IBM DB2,feature_8
95,Svelte,feature_9


In [17]:
# Feature vector for two example skills: 0/1 skill flags followed by the
# per-cluster counts.
c.clusters_with_skills(input=["Python", "Matlab"])

APL           0
Assembly      0
Bash/Shell    0
C             0
C#            0
             ..
feature_5     0
feature_6     0
feature_7     0
feature_8     0
feature_9     0
Length: 145, dtype: int64

In [18]:
# Scores per target for an example skill set, highest first.
c.predict_jop(
    input=["Python", "SQL", "Pandas", "Keras", "TensorFlow"],
)

Data scientist or machine learning specialist    0.640472
Data or business analyst                         0.407849
Scientist                                        0.225328
Engineer, data                                   0.091484
Developer, back-end                              0.048771
Developer, full-stack                            0.044791
Developer, desktop or enterprise applications    0.042422
Developer, embedded applications or devices      0.033921
Developer, QA or test                            0.031714
Developer, mobile                                0.022188
Database administrator                           0.019816
Engineer, site reliability                       0.012917
System administrator                             0.011892
Developer, front-end                             0.011267
Developer, game or graphics                      0.007090
DevOps specialist                                0.006164
dtype: float64

In [19]:
# Skills whose addition raises the score of the chosen role by more than 50%
# relative to the score of the current skill set.
c.suggest_skills_for_role(
    skills=["Python", "Pandas", "TensorFlow"],
    role="Data scientist or machine learning specialist",
    threshold=0.5,
)

['R', 'Scala', 'Flask', 'Hadoop', 'Keras', 'Torch/PyTorch', 'Atom']

In [20]:
# Scratch check: list.copy() returns an independent list, so appending to the
# copy leaves the original list unchanged.
a = [1, 2, 3, 4]
b = 1
a.append(b)
d = a.copy()
d.append(6)
d

[1, 2, 3, 4, 1, 6]