In [1]:
import csv
import os
import pandas as pd

# Client and email address dataframe

In [2]:
# Lookup table pairing each client id with that client's email address.
clientid_and_email = {
    'client_id': [1, 2, 3, 4, 5],
    'email': [
        'A@example.com',
        'B@example.com',
        'C@example.com',
        'D@example.com',
        'E@example.com',
    ],
}

# The dict maps column name -> list of values, so it goes straight to the constructor.
client_id_and_email_df = pd.DataFrame(clientid_and_email)
client_id_and_email_df

Unnamed: 0,client_id,email
0,1,A@example.com
1,2,B@example.com
2,3,C@example.com
3,4,D@example.com
4,5,E@example.com


# Client and project id dataframe

In [3]:
# One record per project. A client can own several projects; internal == 't'
# marks the row that the next step drops as a non-client.
clientid_projectid_internal = [
    dict(client_id=1, project_id=11, internal='f'),
    dict(client_id=1, project_id=22, internal='f'),
    dict(client_id=2, project_id=33, internal='f'),
    dict(client_id=2, project_id=44, internal='f'),
    dict(client_id=3, project_id=55, internal='f'),
    dict(client_id=4, project_id=66, internal='t'),
    dict(client_id=5, project_id=77, internal='f'),
]

# A list of records is accepted directly by the DataFrame constructor.
client_id_and_project_df = pd.DataFrame(clientid_projectid_internal)
client_id_and_project_df

Unnamed: 0,client_id,internal,project_id
0,1,f,11
1,1,f,22
2,2,f,33
3,2,f,44
4,3,f,55
5,4,t,66
6,5,f,77


## Selecting only those with internal = f (eliminating non-clients)

In [4]:
# Keep every project whose internal flag is anything other than 't'.
# The original row labels are preserved, so the index has a gap where rows were dropped.
client_only_id_and_project = client_id_and_project_df.loc[
    client_id_and_project_df['internal'].ne('t')
]
client_only_id_and_project

Unnamed: 0,client_id,internal,project_id
0,1,f,11
1,1,f,22
2,2,f,33
3,2,f,44
4,3,f,55
6,5,f,77


## Joining client-only project ids with client email addresses

In [5]:
# Attach each client's email to its projects. The inner join on client_id keeps
# only the clients that appear in both frames.
project_and_email = client_only_id_and_project.merge(
    client_id_and_email_df,
    on='client_id',
    how='inner',
)
project_and_email

Unnamed: 0,client_id,internal,project_id,email
0,1,f,11,A@example.com
1,1,f,22,A@example.com
2,2,f,33,B@example.com
3,2,f,44,B@example.com
4,3,f,55,C@example.com
5,5,f,77,E@example.com


# Component filters and query id dataframe

In [6]:
# Component-level filters: one record per (dashboard, project, query, filter value).
# Every record here uses the 'location' key. The same dashboard/query pair can
# appear more than once with different values (e.g. dashboard 111 / query 1111).
component_filter_with_queryid = [
    dict(dashboard_id=111, project_id=11, query_id=1111, key='location', value='eu'),
    dict(dashboard_id=111, project_id=11, query_id=1111, key='location', value='us'),
    dict(dashboard_id=222, project_id=11, query_id=2222, key='location', value='eu'),
    dict(dashboard_id=333, project_id=22, query_id=3333, key='location', value='us'),
    dict(dashboard_id=444, project_id=22, query_id=3333, key='location', value='eu'),
    dict(dashboard_id=444, project_id=22, query_id=4444, key='location', value='uk'),
    dict(dashboard_id=555, project_id=33, query_id=5555, key='location', value='eu'),
    dict(dashboard_id=666, project_id=33, query_id=5555, key='location', value='us'),
    dict(dashboard_id=777, project_id=44, query_id=6666, key='location', value='eu'),
    dict(dashboard_id=888, project_id=55, query_id=7777, key='location', value='us'),
    dict(dashboard_id=999, project_id=66, query_id=8888, key='location', value='uk'),
]

component_filters_with_query_id_df = pd.DataFrame(component_filter_with_queryid)
component_filters_with_query_id_df

Unnamed: 0,dashboard_id,key,project_id,query_id,value
0,111,location,11,1111,eu
1,111,location,11,1111,us
2,222,location,11,2222,eu
3,333,location,22,3333,us
4,444,location,22,3333,eu
5,444,location,22,4444,uk
6,555,location,33,5555,eu
7,666,location,33,5555,us
8,777,location,44,6666,eu
9,888,location,55,7777,us


## Renaming columns to avoid confusion

In [7]:
# Give the generic key/value columns component-specific names so they remain
# distinguishable from the dashboard-level key/value columns once the two
# frames are merged.
# rename() returns a new frame, which is bound back to the same name; the
# existing object is not mutated in place (no inplace=True).
component_filters_with_query_id_df = component_filters_with_query_id_df.rename(
    columns={'key': 'component_filter_type', 'value': 'component_filter'}
)
component_filters_with_query_id_df

Unnamed: 0,dashboard_id,component_filter_type,project_id,query_id,component_filter
0,111,location,11,1111,eu
1,111,location,11,1111,us
2,222,location,11,2222,eu
3,333,location,22,3333,us
4,444,location,22,3333,eu
5,444,location,22,4444,uk
6,555,location,33,5555,eu
7,666,location,33,5555,us
8,777,location,44,6666,eu
9,888,location,55,7777,us


# Dashboard filters and query id dataframe

In [8]:
# Dashboard-level filters: one record per (dashboard, project, query, filter).
# The comment above each record says how it relates to the component-level data.
dashboard_filter_with_queryid = [
    # same filter at both dashboard and component level
    dict(dashboard_id=111, project_id=11, query_id=1111, key='location', value='eu'),
    # same dashboard as above - different query
    dict(dashboard_id=111, project_id=11, query_id=2222, key='location', value='eu'),
    # new dashboard, same query as above
    dict(dashboard_id=101010, project_id=11, query_id=2222, key='location', value='uk'),
    # dashboard 333 also has a component-level location filter, but on a different query
    dict(dashboard_id=333, project_id=22, query_id=9999, key='locationGroup', value='n-a'),
    # query 4444 is filtered to eu at dashboard level and to uk at component level
    dict(dashboard_id=444, project_id=22, query_id=4444, key='location', value='eu'),
    # new dashboard, new query, with uk filter
    dict(dashboard_id=202020, project_id=33, query_id=10101010, key='location', value='uk'),
    # same dashboard and project as a component row, different query
    dict(dashboard_id=666, project_id=33, query_id=20202020, key='locationGroup', value='us'),
    # new dashboard, new query and a new filter key
    dict(dashboard_id=303030, project_id=55, query_id=111111, key='xlocationgroup', value='eu'),
    # same dashboard and query ids as a component row, but under project 77 instead of 66
    dict(dashboard_id=999, project_id=77, query_id=8888, key='location', value='uk'),
    # project 77 does not appear in the components list
    dict(dashboard_id=111111, project_id=77, query_id=121212, key='xlocation', value='us'),
]

dashboard_filter_with_queryid_df = pd.DataFrame(dashboard_filter_with_queryid)
dashboard_filter_with_queryid_df

Unnamed: 0,dashboard_id,key,project_id,query_id,value
0,111,location,11,1111,eu
1,111,location,11,2222,eu
2,101010,location,11,2222,uk
3,333,locationGroup,22,9999,n-a
4,444,location,22,4444,eu
5,202020,location,33,10101010,uk
6,666,locationGroup,33,20202020,us
7,303030,xlocationgroup,55,111111,eu
8,999,location,77,8888,uk
9,111111,xlocation,77,121212,us


## Renaming columns to avoid confusion

In [9]:
# Give the generic key/value columns dashboard-specific names so they remain
# distinguishable from the component-level filter columns once the two frames
# are merged.
# rename() returns a new frame, which is bound back to the same name; the
# existing object is not mutated in place (no inplace=True).
dashboard_filter_with_queryid_df = dashboard_filter_with_queryid_df.rename(
    columns={'key': 'dashboard_filter_type', 'value': 'dashboard_filter'}
)
dashboard_filter_with_queryid_df

Unnamed: 0,dashboard_id,dashboard_filter_type,project_id,query_id,dashboard_filter
0,111,location,11,1111,eu
1,111,location,11,2222,eu
2,101010,location,11,2222,uk
3,333,locationGroup,22,9999,n-a
4,444,location,22,4444,eu
5,202020,location,33,10101010,uk
6,666,locationGroup,33,20202020,us
7,303030,xlocationgroup,55,111111,eu
8,999,location,77,8888,uk
9,111111,xlocation,77,121212,us


# Merging dashboard and component filter data

In [10]:
# Outer join on (project_id, dashboard_id, query_id): rows that exist on only
# one side are kept, with the other side's filter columns left as NaN.
component_dashboard_filters_df = component_filters_with_query_id_df.merge(
    dashboard_filter_with_queryid_df,
    how='outer',
    on=['project_id', 'dashboard_id', 'query_id'],
)
component_dashboard_filters_df

Unnamed: 0,dashboard_id,component_filter_type,project_id,query_id,component_filter,dashboard_filter_type,dashboard_filter
0,111,location,11,1111,eu,location,eu
1,111,location,11,1111,us,location,eu
2,222,location,11,2222,eu,,
3,333,location,22,3333,us,,
4,444,location,22,3333,eu,,
5,444,location,22,4444,uk,location,eu
6,555,location,33,5555,eu,,
7,666,location,33,5555,us,,
8,777,location,44,6666,eu,,
9,888,location,55,7777,us,,


# Removing projects from non-clients (to get a more manageable dataset)

In [11]:
# True for every merged row whose project appears in the client-only
# project/email table.
boolean_mask = component_dashboard_filters_df['project_id'].isin(
    project_and_email['project_id']
)

# Keep only the flagged rows; all other projects are dropped.
client_only_components_and_dashboard = component_dashboard_filters_df.loc[boolean_mask]
client_only_components_and_dashboard

Unnamed: 0,dashboard_id,component_filter_type,project_id,query_id,component_filter,dashboard_filter_type,dashboard_filter
0,111,location,11,1111,eu,location,eu
1,111,location,11,1111,us,location,eu
2,222,location,11,2222,eu,,
3,333,location,22,3333,us,,
4,444,location,22,3333,eu,,
5,444,location,22,4444,uk,location,eu
6,555,location,33,5555,eu,,
7,666,location,33,5555,us,,
8,777,location,44,6666,eu,,
9,888,location,55,7777,us,,


# Calculating occurrences of each location code

## Number of components using each location filter

In [12]:
# Number of rows per component-level filter value, most frequent first
# (value_counts skips NaN by default).
client_only_components_and_dashboard['component_filter'].value_counts()

eu    5
us    4
uk    1
Name: component_filter, dtype: int64

In [19]:
# Count the rows for each component_filter value with groupby, keeping the
# result as a Series named "component_filter".
grouped_by_components = (
    client_only_components_and_dashboard
    .groupby("component_filter")["component_filter"]
    .count()
)

# Order from most-used to least-used filter value.
grouped_components_sorted = grouped_by_components.sort_values(ascending=False)
grouped_components_sorted

component_filter
eu    5
us    4
uk    1
Name: component_filter, dtype: int64

## Number of dashboards using each location filter

In [14]:
# Number of rows per dashboard-level filter value, most frequent first
# (value_counts skips NaN by default).
# NOTE(review): this counts rows of the merged frame, not distinct dashboards -
# a dashboard filter that joined to several component rows is counted once per
# row. Confirm that is intended.
client_only_components_and_dashboard['dashboard_filter'].value_counts()

eu     5
uk     3
us     2
n-a    1
Name: dashboard_filter, dtype: int64

In [20]:
# Count the rows for each dashboard_filter value with groupby, keeping the
# result as a Series named "dashboard_filter".
# NOTE(review): rows of the merged frame are counted, so a dashboard filter
# that joined to several component rows contributes more than once - confirm
# that is intended.
grouped_by_dashboards = (
    client_only_components_and_dashboard
    .groupby("dashboard_filter")["dashboard_filter"]
    .count()
)

# Order from most-used to least-used filter value.
grouped_dashboards_sorted = grouped_by_dashboards.sort_values(ascending=False)
grouped_dashboards_sorted

dashboard_filter
eu     5
uk     3
us     2
n-a    1
Name: dashboard_filter, dtype: int64

## Total number of dashboards+components using each location filter

In [21]:
# Place the two tallies side by side, aligned on the filter value; a value
# that is missing from one tally shows up as NaN in that column.
# reset_index() then moves the filter value out of the index into a column.
components_and_dashboards_count = pd.concat(
    [grouped_components_sorted, grouped_dashboards_sorted],
    axis=1,
    sort=False,
).reset_index()
components_and_dashboards_count

Unnamed: 0,index,component_filter,dashboard_filter
0,eu,5.0,5
1,us,4.0,2
2,uk,1.0,3
3,n-a,,1


In [22]:
# Row-wise total of the component and dashboard tallies.
# DataFrame.sum skips NaN by default, so a filter value that appears in only
# one of the two tallies still gets a numeric total rather than NaN. This is
# the same result as adding the columns and back-filling the NaN totals, but
# the sum is computed once.
components_and_dashboards_count["total"] = components_and_dashboards_count[
    ["component_filter", "dashboard_filter"]
].sum(axis=1)
components_and_dashboards_count

Unnamed: 0,index,component_filter,dashboard_filter,total
0,eu,5.0,5,10.0
1,us,4.0,2,6.0
2,uk,1.0,3,4.0
3,n-a,,1,1.0


# Number of times a location is used by each client

In [23]:
# Small hand-checkable dataset: each record carries a client, a project and
# three filter columns, some of which are empty strings.
simpler_example = [
    dict(client_id="A", project_id="1", filter1="uk", filter2="", filter3='uk'),
    dict(client_id="A", project_id="1", filter1="", filter2="fr", filter3='fr'),
    dict(client_id="A", project_id="2", filter1="uk", filter2="", filter3='uk'),
    dict(client_id="B", project_id="3", filter1="uk", filter2="fr", filter3='fr'),
    dict(client_id="B", project_id="3", filter1="", filter2="", filter3='fr'),
    dict(client_id="C", project_id="4", filter1="", filter2="", filter3='de'),
]

simpler_example_df = pd.DataFrame(simpler_example)

# Expected check: client A uses uk 4 times in total (twice in project 1 and
# twice in project 2).
simpler_example_df


Unnamed: 0,client_id,filter1,filter2,filter3,project_id
0,A,uk,,uk,1
1,A,,fr,fr,1
2,A,uk,,uk,2
3,B,uk,fr,fr,3
4,B,,,fr,3
5,C,,,de,4


In [24]:
# Per-client tally of each filter1 value: value_counts() runs within each
# client_id group, and unstack() pivots the filter values into columns
# (clients stay on the rows).
filter_1 = (
    simpler_example_df
    .groupby("client_id")["filter1"]
    .value_counts()
    .unstack()
)

# Shown transposed: filter values as rows, clients as columns.
filter_1.T

client_id,A,B,C
filter1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,1.0,1.0,1.0
uk,2.0,1.0,


In [25]:
# Per-client tally of each filter2 value: value_counts() runs within each
# client_id group, and unstack() pivots the filter values into columns
# (clients stay on the rows).
filter_2 = (
    simpler_example_df
    .groupby("client_id")["filter2"]
    .value_counts()
    .unstack()
)

# Shown transposed: filter values as rows, clients as columns.
filter_2.T

client_id,A,B,C
filter2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,2.0,1.0,1.0
fr,1.0,1.0,


In [26]:
# Per-client tally of each filter3 value: value_counts() runs within each
# client_id group, and unstack() pivots the filter values into columns
# (clients stay on the rows).
filter_3 = (
    simpler_example_df
    .groupby("client_id")["filter3"]
    .value_counts()
    .unstack()
)

# Shown transposed: filter values as rows, clients as columns.
filter_3.T

client_id,A,B,C
filter3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
de,,,1.0
fr,1.0,2.0,
uk,2.0,,


In [27]:
# Add the three per-filter tallies cell by cell, with filter values on the rows
# and clients on the columns. fill_value=0 treats a cell that is missing from
# one operand as 0, so only cells missing from both operands remain NaN.
# rename_axis labels the row index "location". The earlier reset_index() call
# was dropped because its result was never used.
all_client_filters = (
    filter_1.T
    .add(filter_2.T, fill_value=0)
    .add(filter_3.T, fill_value=0)
    .rename_axis("location")
)
all_client_filters

client_id,A,B,C
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,3.0,2.0,2.0
de,,,1.0
fr,2.0,3.0,
uk,4.0,1.0,


# Number of clients using each location

In [139]:
# Collapse every positive usage count to 1, so each cell simply flags that the
# client uses the location; cells that were NaN stay NaN.
all_client_filters_mask = all_client_filters.mask(all_client_filters.gt(0), other=1)
all_client_filters_mask

client_id,A,B,C
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,1.0,1.0,1.0
de,,,1.0
fr,1.0,1.0,
uk,1.0,1.0,


In [140]:
# Clients per location: count() tallies the non-null cells across each row.
# NOTE(review): the blank-string row is counted like a real location - confirm
# whether empty filters should be excluded first.
all_client_filters_mask.count(axis=1)

location
      3
de    1
fr    2
uk    2
dtype: int64