In [1]:
DF_PATH       = "/content/drive/MyDrive/Colab Notebooks/datasets/2_cleaned_data.pkl"

ROLE_COLS      = ['DevType']
TECH_COLS      = ['LanguageHaveWorkedWith',
                  'DatabaseHaveWorkedWith',
                  'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith',
                  'ToolsTechHaveWorkedWith',
                  'NEWCollabToolsHaveWorkedWith',
                  ]


In [2]:
# Load packages
import pandas as pd
import numpy as np
import logging
import pickle
import random
import plotly
import os
from pathlib import Path

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from matplotlib import pyplot as plt

# Functions


In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)

    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()

    return quality_scores

# Prepare data

In [4]:
# Read Data
df = pd.read_pickle(DF_PATH)

In [5]:
# Check the total samples of roles
roles_df = df["DevType"].copy()
roles_df.sum(axis=0)

Unnamed: 0,0
Academic researcher,1708
Data or business analyst,1658
Data scientist or machine learning specialist,2460
Database administrator,1210
DevOps specialist,3056
"Developer, QA or test",1135
"Developer, back-end",17084
"Developer, desktop or enterprise applications",4845
"Developer, embedded applications or devices",2138
"Developer, front-end",8932


In [6]:
# Resample roles
samples_per_class = 1200
resampled_roles = []

for role_col in roles_df.columns:
    sub_df = roles_df.loc[roles_df[role_col] == 1].copy()

    if len(sub_df) < samples_per_class:
        # Upsample
        sub_df = sub_df.sample(samples_per_class, replace=True, random_state=0)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class, random_state=0)

    resampled_roles.append(sub_df)

In [7]:
# Construct dfs
roles_df = pd.concat(resampled_roles)
df = df.loc[roles_df.index].copy()

In [8]:
roles_df.sum(axis=0)

Unnamed: 0,0
Academic researcher,2280
Data or business analyst,1965
Data scientist or machine learning specialist,2576
Database administrator,1765
DevOps specialist,2170
"Developer, QA or test",1514
"Developer, back-end",5710
"Developer, desktop or enterprise applications",2690
"Developer, embedded applications or devices",1773
"Developer, front-end",2614


# Split to train and test

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(df.drop('DevType', axis=1),
                                                    df['DevType'],
                                                    random_state=0)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop('DevType', axis=1),


# Train models

In [10]:
rf_clf = make_pipeline(RobustScaler(),
                       PCA(n_components=0.95),
                       RandomForestClassifier(n_jobs=8,
                                              verbose=1,
                                              random_state=0))

rf_clf.fit(X_train.values, Y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   25.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   53.4s finished


In [11]:
# Evaluate on train set
predictions =  pd.DataFrame(rf_clf.predict(X_train.values),
                            columns=Y_train.columns)
train_scores = {score.__name__: calculate_quality(Y_train, predictions, score)
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.6s finished


In [12]:
# Evaluate on test set
predictions =  pd.DataFrame(rf_clf.predict(X_test.values),
                            columns=Y_test.columns)
test_scores = {score.__name__: calculate_quality(Y_test, predictions, score)
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished


In [13]:
print(test_scores.mean())
test_scores.sort_values("precision_score")

accuracy_score     92.591875
precision_score    94.102500
recall_score       49.286875
f1_score           63.683750
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, full-stack",83.77,79.85,59.23,68.01
"Developer, back-end",79.15,84.04,37.4,51.76
"Developer, front-end",89.73,88.67,32.85,47.94
Data scientist or machine learning specialist,93.6,91.48,58.02,71.01
"Developer, desktop or enterprise applications",90.77,93.86,33.28,49.14
Academic researcher,94.9,94.05,59.71,73.05
"Developer, embedded applications or devices",94.6,94.35,40.14,56.32
"Developer, mobile",93.83,94.74,47.19,63.0
Scientist,97.15,96.13,75.35,84.48
DevOps specialist,92.77,96.98,36.14,52.66


# Hyper parameter tuning

In [15]:
hpt_rf_clf = make_pipeline(RobustScaler(),
                           PCA(),
                           RandomForestClassifier(n_jobs =8,
                                                  verbose=1,
                                                  random_state=0))

In [16]:
tuned_parameters = [{
    'pca__n_components': [0.7, 0.85, 0.95],
    'randomforestclassifier__n_estimators': [250, 500],
    'randomforestclassifier__max_depth':    [3, 10, None],
}]

In [17]:
hpt_rf_clf = GridSearchCV(hpt_rf_clf, tuned_parameters)
hpt_rf_clf.fit(X_train.values, Y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    9.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   11.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    7.6s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    9.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      |

In [18]:
hpt_rf_clf.best_params_

{'pca__n_components': 0.7,
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__n_estimators': 500}

In [19]:
# Evaluate on test set
predictions =  pd.DataFrame(hpt_rf_clf.predict(X_train.values),
                            columns=Y_train.columns)
train_scores = {score.__name__: calculate_quality(Y_train, predictions, score)
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.4s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    2.7s finished


In [20]:
print(test_scores.mean())
test_scores

accuracy_score     92.591875
precision_score    94.102500
recall_score       49.286875
f1_score           63.683750
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,94.9,94.05,59.71,73.05
Data or business analyst,94.42,97.02,46.63,62.98
Data scientist or machine learning specialist,93.6,91.48,58.02,71.01
Database administrator,94.94,98.07,45.93,62.56
DevOps specialist,92.77,96.98,36.14,52.66
"Developer, QA or test",96.54,99.57,58.23,73.48
"Developer, back-end",79.15,84.04,37.4,51.76
"Developer, desktop or enterprise applications",90.77,93.86,33.28,49.14
"Developer, embedded applications or devices",94.6,94.35,40.14,56.32
"Developer, front-end",89.73,88.67,32.85,47.94


In [21]:
LOG_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets"
LOG_MODEL_PKL   =  "model.pkl"
LOG_DATA_PKL    =  "data.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [22]:
# Model
model = {"model_description": "Random Forest: with PCA + Hyperparamter tuning",
         "model_details": str(hpt_rf_clf),
         "model_object": hpt_rf_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [23]:
# Data details
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices":     X_test.index.tolist(),
                "features_names":   X_train.columns.droplevel(0).tolist(),
                "targets_names":    Y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [24]:
# Preformance details
classes_metrics = {"train_scores": train_scores,
                   "test_scores":  test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)