In [1]:
## Init

from sklearn import tree
from sklearn.cross_validation import train_test_split

from objects import *
from settings import *


# Build the HR object from `data`. Neither `HR` nor `data` is defined in this cell;
# presumably both come from the star imports above (objects / settings) — TODO confirm.
hr = HR(data)
# Working dataframe used by the later cells; the tree-computation cell assigns columns into it.
df = hr.data



In [2]:
## Preprocessing.

def _bin_index(x, upper_bounds):
    """
    Return the index of the first upper bound that x does not exceed.
    A value above every bound (or one for which every `<=` test is False, e.g. NaN)
    falls in the last bin, whose index is len(upper_bounds).
    """
    for position, bound in enumerate(upper_bounds):
        if x <= bound:
            return position
    return len(upper_bounds)


def satisfactions(x):
    """Bin a satisfaction value: <= 5 -> 0, <= 8 -> 1, anything else -> 2."""
    return _bin_index(x, (5, 8))


def hours(x):
    """Bin monthly hours: <= 160.5 -> 0, <= 210.5 -> 1, <= 240 -> 2, anything else -> 3."""
    return _bin_index(x, (160.5, 210.5, 240))


def projects(x):
    """Bin a number of projects: <= 3 -> 0, <= 5 -> 1, anything else -> 2."""
    return _bin_index(x, (3, 5))


def time_spent(x):
    """Bin the time spent in the company: <= 3 -> 0, <= 5 -> 1, anything else -> 2."""
    return _bin_index(x, (3, 5))


def preprocess(df, discretizers=None):
    """
    Turn the given dataframe's variables into numerical ones, able to be handled by the scikit-learn algorithms.
    The given dataframe is left untouched: every replacement is made on a copy, so calling this
    function twice on the same input yields the same result.

    :param df             The dataframe to process.
    :param discretizers   Optional dictionary {column name: function}; each function is applied value by
                          value to its column. Defaults to the module-level binning functions:
                          satisfactions, hours, projects and time_spent on "satisfaction_level",
                          "average_montly_hours", "number_project" and "time_spend_company".
    :return      A dictionary
                    d = {
                        "df": Copy of the dataframe where categorical/ordinal variables are replaced by an integer mapping.,
                        "mappings": The mappings created by the preprocessing phase, {column: {value: integer code}}.
                    }
    :raise KeyError  If a column listed in the discretizers is missing from the dataframe.
    """
    if discretizers is None:
        # Resolved at call time so the module-level binning functions can be redefined between calls.
        discretizers = {
            "satisfaction_level": satisfactions,
            "average_montly_hours": hours,
            "number_project": projects,
            "time_spend_company": time_spent,
        }

    # Work on a copy: mutating the caller's dataframe made re-running the calling cell
    # re-bin values that were already binned.
    df_prime = df.copy()

    # A column is treated as categorical as soon as one of its values is exactly of type str.
    str_features = [feature for feature in df_prime.columns
                    if str in {type(value) for value in df_prime[feature]}]
    mappings = {}

    for str_feature in str_features:
        # Sort the distinct values (by their string form, so mixed types stay comparable):
        # set iteration order is arbitrary and may change between interpreter runs,
        # which made the integer codes irreproducible.
        values = sorted(set(df_prime[str_feature].values), key=str)
        mapping = {value: code for code, value in enumerate(values)}
        df_prime[str_feature] = df_prime[str_feature].map(mapping)

        mappings[str_feature] = mapping

    for feature, discretize in discretizers.items():
        df_prime[feature] = df_prime[feature].map(discretize)

    return {"df": df_prime, "mappings": mappings}
    

In [42]:
## Tree drawing definitions.
import pydotplus 
from IPython.display import Image


def draw_tree(dot_data, pretty_prints):
    """
    Render a graph given in DOT format as a PNG image.

    :param dot_data         The graph description in DOT format, handed to pydotplus.graph_from_dot_data.
    :param pretty_prints    Currently unused; kept so that existing calls keep working.
    :return                 An IPython Image wrapping the PNG. It is rendered when it is the last
                            expression of a cell or when passed to IPython.display.display.
    """
    graph = pydotplus.graph_from_dot_data(dot_data)
    # Hand the image back to the caller: an Image built and dropped inside a function
    # is never rendered by the notebook.
    return Image(graph.create_png())

In [43]:
## Compute decision trees.
from sklearn import metrics

import graphviz

import pickle


# Hyper-parameters shared by every tree: minimum leaf size and the depths to explore.
min_samples_leaf = 50
max_depths = range(2,6)

# Candidate target variables: every label except the banned ones.
roots = labels
banned_features = set(["Work_accident", "sales", "idx"])
roots = set(roots) - banned_features
# Results, indexed as trees[max_depth][criterion][target]["train" | "test"][measure].
trees = {}

df["satisfaction_level"] = hr.discrete["satisfaction_level"]

df_prime = preprocess(df)["df"]
df_prime["last_evaluation"] = hr.discrete["last_evaluation"]

for max_depth in max_depths:
    trees[max_depth] = {}

    for metric in ["entropy", "gini"]:
        trees[max_depth][metric] = {}

        # Sorted iteration: set order is arbitrary, and the column order fed to the
        # classifier should not change from one run to the next.
        # Assumes the labels are mutually comparable (e.g. all strings) — TODO confirm.
        for root in sorted(roots):
            # Every remaining label is a predictor of the current target.
            columns = [feature for feature in sorted(roots) if feature != root]

            # One fresh estimator per target. fit() returns the estimator itself, so sharing a
            # single instance across targets made every stored "tree" point to the same object,
            # i.e. the model fitted on the last target only.
            decision_tree = tree.DecisionTreeClassifier(criterion=metric,
                                                        min_samples_leaf=min_samples_leaf,
                                                        max_depth=max_depth,
                                                        random_state=0)

            train_data, test_data, train_target, test_target = train_test_split(df_prime[columns].values,
                                                                                df_prime[root].values,
                                                                                test_size=.2,
                                                                                random_state=0)

            # Training
            trained_model = decision_tree.fit(train_data, train_target)
            # Predictions on the data the tree was fitted on.
            validation_on_training_set = decision_tree.predict(train_data)

            # Training measures
            # NOTE(review): the list stored next to the model still contains the target itself,
            # unlike `columns`; kept unchanged for existing readers of the pickle — confirm
            # whether `columns` was intended.
            train_report = {
                "tree": (trained_model, list(roots)),
                "precision": metrics.precision_score(train_target,
                                                     validation_on_training_set,
                                                     average="weighted"),
                "recall": metrics.recall_score(train_target,
                                               validation_on_training_set,
                                               average="weighted"),
                "f1": metrics.f1_score(train_target,
                                       validation_on_training_set,
                                       average="weighted"),
                "accuracy": metrics.accuracy_score(train_target, validation_on_training_set),
                "support": metrics.precision_recall_fscore_support(train_target, validation_on_training_set),
            }

            # Validation
            # Despite its name, this holds the predictions on the held-out test split.
            test_on_training_set = decision_tree.predict(test_data)

            # Validation measures
            test_report = {
                "accuracy": metrics.accuracy_score(test_target, test_on_training_set),
                "confusion matrix": metrics.confusion_matrix(test_target, test_on_training_set),
            }

            trees[max_depth][metric][root] = {"train": train_report, "test": test_report}

            # Export
            tree.export_graphviz(decision_tree,
                                 out_file="tree." + str(max_depth)
                                             + "." + str(metric)
                                             + "." + str(labels_pretty_print[root])
                                             + ".dot",
                                 feature_names=columns,
                                 leaves_parallel=True,
                                 proportion=True,
                                 rounded=True)


# Persist every fitted model and its measures for later cells or notebooks.
with open("trees.pickle", "wb") as log:
    pickle.dump(file=log, obj=trees, protocol=pickle.HIGHEST_PROTOCOL)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
