# **Cohort Analysis**

In [None]:
## Importing libraries
from pyspark.sql.functions import rank, dense_rank, desc,col, when, max, countDistinct, udf
from pyspark.sql import Window
from pyspark.sql.types import StringType
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris #, load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import _tree

In [None]:
# HR attributes considered when building cohorts.
# NOTE(review): the original note asked for isInfluencer to be listed here, but the
# list carries Influence_rank instead -- confirm which column is intended.
all_attr = [
    "Organization",
    "FunctionType",
    "Layer",
    "LevelDesignation",
    "Region",
    "SupervisorIndicator",
    "Influence_rank",
]
# Categorical attributes among the ones selected above.
categorical_attributes = [
    "Organization",
    "FunctionType",
    "Layer",
    "LevelDesignation",
    "Region",
    "SupervisorIndicator",
]
# Non-categorical attributes among the ones selected above (none by default).
non_categorical_attributes = list()

## **Step 1:** read the input file
**Note:** Modify the next cell to read your input data, e.g. a CSV from a local path, Azure Blob Storage with an access key, etc.

In [None]:
# Load every file under the query folder of the Synapse-linked storage account.
# The CSV header row supplies the column names.
container, storage_account, folder = "cohortanalysis", "mgdcvivadatalake", "influenceQuery"
inputFilePath = "abfss://{}@{}.dfs.core.windows.net/{}/*".format(container, storage_account, folder)
csv_reader = spark.read.format("csv").option("header", "True")
df = csv_reader.load(inputFilePath)
# display(df)



In [None]:
# Config and settings
# Interaction options (not referenced by the other cells visible in this notebook).
Directed, Reversed = True, False
InteractionType = "all"

# Decision-tree limits: maximum depth and minimum number of samples per leaf.
depth = 4
min_sample_leaf_size = 30

# Fraction of the row count used as the Influence_rank cut-off for isInfluencer.
percentSelection = 0.3

## **Step 2:** Add isInfluencer flag

In [None]:
# Rank cut-off: the chosen fraction of the total row count.
selection = percentSelection * df.count()

# Rows whose Influence_rank is within the cut-off are flagged 1, all others 0.
influencer_flag = when(col("Influence_rank") <= selection, 1).otherwise(0)
df = df.withColumn("isInfluencer", influencer_flag)

# Distinct people overall, and distinct people flagged as influencers.
populationSize = df.select("PersonId").distinct().count()
numOfChampions = (
    df.where(col("isInfluencer") == 1)
    .select("PersonId")
    .distinct()
    .count()
)


In [None]:
# Show the Spark DataFrame, then pull it into pandas so the remaining cells
# can work with plain Python libraries.
display(df)
pd_df = df.toPandas()

## **Step 3:** Training a DecisionTree model

In [None]:
# One-hot encode the categorical cohort attributes; the resulting columns are
# named "<attribute>=<value>".
one_hot_data = pd.get_dummies(
    pd_df[categorical_attributes],
    prefix_sep="=",
    drop_first=False,
)

# Feature matrix: the non-categorical columns followed by the one-hot columns.
feature_frames = [pd_df[non_categorical_attributes], one_hot_data]
all_data = pd.concat(feature_frames, axis=1)

# Train the decision tree on the isInfluencer flag.
tree_params = dict(
    random_state=42,
    max_depth=depth,
    criterion="entropy",
    min_samples_leaf=min_sample_leaf_size,
)
model = DecisionTreeClassifier(**tree_params)
model.fit(all_data, pd_df[["isInfluencer"]])

# Empty placeholder; the cohort table is assigned later from the extracted rules.
cohorts = pd.DataFrame()




## **Step 4:** Inspect the tree and extract cohorts

## DecisionTree text representation

In [None]:
from sklearn import tree

# Label each split with the training column it tests. Without feature_names,
# export_text falls back to generic "feature_<i>" labels that have to be mapped
# back to columns by hand.
text_representation = tree.export_text(
    model,
    feature_names=[str(column) for column in all_data.columns],
)
print(text_representation)

In [None]:
# Column names of the training matrix, in feature-index order. Derived from
# all_data directly because col_names is only assigned in a later cell, so
# referencing it here fails on a fresh kernel (Restart & Run All).
[str(column) for column in all_data.columns]

## Printing Cohorts and the tree

In [None]:
# Extract rules

def tree_to_code(tree, feature_names):
    """Turn every leaf of a fitted decision tree into one cohort row.

    Parameters
    ----------
    tree : fitted decision-tree estimator
        Only ``tree.tree_`` is read: ``feature``, ``threshold``,
        ``children_left``, ``children_right``, ``value`` and
        ``weighted_n_node_samples``.
    feature_names : sequence of str
        Column names in training order. One-hot columns are expected to look
        like ``"<attribute>=<value>"``; a name without ``"="`` is treated as a
        numeric feature and described with its split threshold.

    Returns
    -------
    pandas.DataFrame
        One row per leaf, in left-to-right order. Each attribute tested on the
        way to the leaf gets a column holding ``"<value>"`` (right branch),
        ``"not_<value>"`` (left branch) or several conditions joined with
        ``"_AND_"``; attributes not tested on that path are NaN. The columns
        ``"Number of Influencers"`` and ``"Number of non-Influencers"`` hold
        the (weighted) sample counts of class index 1 and 0 at the leaf, so a
        two-class target is assumed.
    """
    tree_ = tree.tree_

    def branch_conditions(node):
        """Return (attribute, left-branch condition, right-branch condition)."""
        name = str(feature_names[tree_.feature[node]])
        # partition keeps values that themselves contain "=" intact.
        attribute, separator, level = name.partition("=")
        if separator:
            # One-hot column: left means the indicator is 0, right means it is 1.
            return attribute, "not_" + level, level
        threshold = tree_.threshold[node]
        return name, "<={:g}".format(threshold), ">{:g}".format(threshold)

    def merge(row, attribute, condition):
        """Add one path condition to the row, combining repeats of an attribute."""
        previous = row.get(attribute)
        if previous is None or previous == "not_" + condition:
            row[attribute] = condition
        else:
            row[attribute] = previous + "_AND_" + condition

    def leaf_counts(node):
        """Per-class sample counts at a node, whatever scale tree_.value uses."""
        # Newer scikit-learn stores class fractions in tree_.value, older
        # releases store counts. Normalising and rescaling by the node's
        # weighted sample count yields counts in both cases.
        per_class = [float(v) for v in tree_.value[node][0]]
        total = sum(per_class)
        scale = float(tree_.weighted_n_node_samples[node]) / total if total else 0.0
        return [round(v * scale, 6) for v in per_class]

    def recurse(node, path, rows):
        left = tree_.children_left[node]
        right = tree_.children_right[node]
        if left != right:
            # Split node (a leaf has both children set to the same sentinel).
            attribute, left_condition, right_condition = branch_conditions(node)
            for child, condition in ((left, left_condition), (right, right_condition)):
                # Push before descending and pop afterwards, so conditions of a
                # finished subtree never leak into leaves visited later.
                path.append((attribute, condition))
                recurse(child, path, rows)
                path.pop()
            return

        row = {}
        for attribute, condition in path:
            merge(row, attribute, condition)
        counts = leaf_counts(node)
        row["Number of Influencers"] = counts[1]
        row["Number of non-Influencers"] = counts[0]
        rows.append(row)

    rows = []
    recurse(0, [], rows)
    return pd.DataFrame(rows)

def tree_to_pseudo(tree, feature_names):
    '''
    Prints a decision tree model as nested if/else pseudocode.

    Parameters:
    -----------
    tree: decision tree model
        The fitted decision tree to represent as pseudocode. Only ``tree.tree_``
        is read (``feature``, ``threshold``, ``children_left``,
        ``children_right`` and ``value``).
    feature_names: list
        The feature names of the dataset used for building the decision tree,
        in training-column order.

    Returns:
    --------
    None. The pseudocode is written to standard output, one line per
    condition, brace, or leaf. Each leaf prints the per-class values stored
    for it in ``tree_.value``.
    '''
    tree_ = tree.tree_
    left = tree_.children_left
    right = tree_.children_right
    threshold = tree_.threshold
    value = tree_.value

    def emit(node, depth):
        indent = "  " * depth
        if left[node] == right[node]:
            # Leaf: both children carry the same sentinel. Feature names are
            # looked up only for split nodes, so the leaf sentinel stored in
            # tree_.feature is never used as a list index.
            print("{}return {}".format(indent, [float(v) for v in value[node][0]]))
            return
        name = feature_names[tree_.feature[node]]
        print("{}if ( {} <= {} ) {{".format(indent, name, float(threshold[node])))
        emit(left[node], depth + 1)
        print("{}}} else {{".format(indent))
        emit(right[node], depth + 1)
        print("{}}}".format(indent))

    emit(0, 0)



    

    



# Build the cohort table from the fitted tree.

# Use plain-string column names, both on the feature matrix and for rule extraction.
col_names = [str(column) for column in all_data.columns.values]
all_data.columns = col_names

# One row per leaf; attributes that a leaf does not test are shown as blanks.
cohorts = tree_to_code(model, col_names).fillna("")

influencer_counts = cohorts["Number of Influencers"]
cohorts["cohort size"] = (influencer_counts + cohorts["Number of non-Influencers"]).round(3)
# Share of all influencers that falls into the cohort.
cohorts["% covered Influencers"] = (influencer_counts / numOfChampions).round(3)
# Share of the cohort's members that are influencers.
cohorts["% Influencers in cohorts"] = (influencer_counts / cohorts["cohort size"]).round(3)
# Share of the whole population that falls into the cohort.
cohorts["% population"] = (cohorts["cohort size"] / populationSize).round(3)

tree_to_pseudo(model, col_names)

display(cohorts)

# from sklearn import tree

# # dotfile = open("dt.dot", 'w')

# # tree.export_graphviz(model, out_file=dotfile, feature_names=raw_data.feature_names)

# dotfile.close() 

In [None]:
from matplotlib import pyplot as plt

fig = plt.figure(figsize=(25,20))
# class_names takes one label per class, in ascending class order
# (0 = not an influencer, 1 = influencer). A bare string such as "isInfluencer"
# is indexed character by character, labelling the classes "i" and "s".
_ = tree.plot_tree(model,
                   feature_names=col_names,
                   class_names=["non-Influencer", "Influencer"],
                   filled=True)

In [None]:
# Same tree without class labels: nodes show the split feature and are
# colour-filled.
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(
    model,
    filled=True,
    feature_names=col_names,
)