In [1]:
import os
os.chdir('../..')
from pipelines.util import *
import pandas as pd
from datetime import datetime
from collections import Counter
import re

In [2]:
data = pd.read_csv(os.path.join(WDIR, 'true-north/true_north_clean.csv'))
data.drop(columns='associated Company Domain (Contact Level)', inplace=True)
# data[data['Industry'].isna()]

In [3]:
# create empty dataframe with the same columns as `data`
summary = pd.DataFrame(columns=data.columns.to_list())
summary.loc['count'] = data.count()
mode = data.mode().head(1)
mode.reset_index(drop=True, inplace=True)
mode.rename(index={0: 'top'}, inplace=True)
summary = pd.concat([summary, mode])
freq = data.apply(lambda x: x.value_counts(dropna=True).max())
summary.loc['freq'] = freq

unique = data.nunique(dropna=True)
summary.loc['unique'] = unique

summary

Unnamed: 0,last_updated,Create Date,Do you feel the True North report identified the key challenges and opportunities facing the region?,Are you interested in attending future True North events?,Are you currently a B Corp or in the process of becoming a B Corp?,Would you be interested in hearing more from Brabners about the B Corp process?,Company Name,City,sector,location,Which theme of the True North report do you most identify with and could support activity around?,How would you like to be involved with the True North network?,company_size
count,357,357,262,262,261,260,356,64,351,349,263,137,349
top,2023-05-01 00:00:00,2023-05-01 00:00:00,Yes,Yes,No,No,The Data City,Manchester,Professional Services,Greater Manchester,"People, skills and the future",Keeping informed about the latest True North n...,0-9
freq,14,78,255,259,225,156,3,16,35,98,76,47,127
unique,315,259,2,2,2,2,329,25,179,41,16,8,5


In [4]:
# summary = data.describe()
summary.loc['top_percent_of_count'] = (summary.loc['freq'] * 100 / summary.loc['count']).astype(float).round(1)

In [5]:
summary

Unnamed: 0,last_updated,Create Date,Do you feel the True North report identified the key challenges and opportunities facing the region?,Are you interested in attending future True North events?,Are you currently a B Corp or in the process of becoming a B Corp?,Would you be interested in hearing more from Brabners about the B Corp process?,Company Name,City,sector,location,Which theme of the True North report do you most identify with and could support activity around?,How would you like to be involved with the True North network?,company_size
count,357,357,262,262,261,260,356,64,351,349,263,137,349
top,2023-05-01 00:00:00,2023-05-01 00:00:00,Yes,Yes,No,No,The Data City,Manchester,Professional Services,Greater Manchester,"People, skills and the future",Keeping informed about the latest True North n...,0-9
freq,14,78,255,259,225,156,3,16,35,98,76,47,127
unique,315,259,2,2,2,2,329,25,179,41,16,8,5
top_percent_of_count,3.9,21.8,97.3,98.9,86.2,60.0,0.8,25.0,10.0,28.1,28.9,34.3,36.4


In [6]:
company_names = pd.Series(data['Company Name'].sort_values())
print(company_names.unique())

['2030hub' 'ACAS' 'AHR Architects' 'AMRC' 'Abeceder ltd'
 'Access Creative College' 'Accountable Recruitment Limited'
 'Active Profile' 'AdaptiveComms' 'Advanced Manufacturing Research Centre'
 'Agent' 'Agile Automations' 'Ahead Partnership' 'Alvarez and Marsal'
 'Andrea Nixon Consulting' 'Anticus Partners Limited' 'Arcadis'
 'Armstrong Watson' 'Atkin Jones Limited' 'Audacia' 'Avison Young'
 'Axon Moore' 'Azets' 'B engineering group' 'BHP LLP' 'BOW' 'BPP'
 'BPP Group' 'BWB' 'Bank of England' 'Barton Legal Limited'
 'Big Hand 4 Business' 'Blackpool Council' 'Boo Coaching & Consulting'
 'Booking.com (UK HQ in Manchester)'
 'Bradford District Place Making & Investment Partnership and other non-exec roles'
 'Brandari Limited' 'Breeze Development' 'British Business Bank'
 'Brompton Bicycle Ltd' 'Brown & Co Accountants Limited'
 'Business Doctors' 'Business Health Institute' 'Business of Science Ltd'
 'CBI' 'CBRE' 'CH Marketing & Property' 'Cabron Happy World'
 'Careers and Enterprise Compan

There is LLoyds banking group and lloyds bank plc - we're assuming these are different companies.

In [7]:
# convert the month column to a datetime object
data['month'] = pd.to_datetime(data['Create Date'])

#convert the item to a formatted value in yyyy-mm format.
data['month_formatted'] = data['month'].apply(datetime.strftime, format='%Y-%m').sort_index(ascending=True)

In [8]:
# take the number of members and calculate the number that joined each month.
monthly_members = pd.DataFrame(data['month_formatted'].value_counts(ascending=False)).reset_index()
monthly_members['start_of_month'] = monthly_members['month_formatted'].astype(str) + '-01'

In [9]:
monthly_orgs = data.drop_duplicates(subset='Company Name', keep='last')
monthly_orgs = pd.DataFrame(monthly_orgs['month_formatted'].value_counts(ascending=False)).reset_index()
monthly_orgs['start_of_month'] = monthly_orgs['month_formatted'].astype(str) + '-01'

In [10]:
def decimal_date(data):
    # make a unix timestamp column
    data['timestamp'] = pd.to_datetime(data['month_formatted'], format='%Y-%m').astype(int) / 10**9
    # make a decimal date and round to 2dp.
    data['year'] = data['timestamp'].div((86400*365.25)).add(1970).round(2)
    # drop the timestamp column
    data.drop(columns='timestamp', inplace=True) 
    # set year and formatted month as the index so they aren't included in the cumsum.
    data.set_index(['year', 'month_formatted', 'start_of_month'], inplace=True, append=True)

    return data

In [11]:
def calculate_cumsum(data, count_name):
    # order by date, then do the cumsum. reset the index, drop the original index column as not needed
    data = pd.DataFrame(data.sort_index(level=2).cumsum(skipna=True).reset_index().drop(columns='level_0'))
    data.rename(columns={'count': f'{count_name}'}, inplace=True)
    return data

In [12]:
# apply above functions to data
cs_monthly_members = calculate_cumsum(decimal_date(monthly_members), count_name='individuals')
cs_monthly_orgs = calculate_cumsum(decimal_date(monthly_orgs), count_name='orgs')

In [13]:
# merged the two dataframes
cs_merged = cs_monthly_members.merge(cs_monthly_orgs, how='inner', on=['year', 'month_formatted', 'start_of_month'])

In [14]:
# write to file
cs_merged.to_csv(os.path.join(SRC_DIR,'overview/membership/_data/cumsum.csv'), index=False)

Make a bar chart for locations.

In [15]:
locations = data.copy()
split_locs = locations['location'].str.split(';').apply(pd.Series, 1).stack()
split_locs.index = split_locs.index.droplevel(-1)
split_locs.name = 'location'
del locations['location']
locations = locations.join(split_locs)
locations['location'] = locations['location'].str.lstrip()
top_locations = locations['location'].value_counts().reset_index().head(5)
top_locations.to_csv(os.path.join(SRC_DIR, 'overview/membership/_data/top_locations.csv'), index=False)

  split_locs = locations['location'].str.split(';').apply(pd.Series, 1).stack()


Calculating some summary stats

In [16]:
total_members = len(data.index)

total_companies = summary.loc['unique', 'Company Name']

top_company_size = summary.loc['top', 'company_size']

top_company_size_pct = summary.loc['top_percent_of_count', 'company_size']

# top_industry = summary.loc['top', 'Industry']
top_industry = ''
top_industry_pct = 0
# top_industry_pct = summary.loc['top_percent_of_count', 'Industry']

In [17]:
data['City']

0             NaN
1             NaN
2             NaN
3             NaN
4      Manchester
          ...    
352           NaN
353           NaN
354           NaN
355    Merseyside
356           NaN
Name: City, Length: 357, dtype: object

### Word frequency in the sector column

In [18]:
def normalize_string(s):
    # Convert to lowercase and remove non-alphanumeric characters (keeping spaces)
    try:
        s = s.lower()
    except:
        print(f"'{s}' is not a stirng type. Converting to string\n")
        return str(s)
    s = re.sub(r'[^a-z0-9\s]', '', s)
    return s
data['normalized_sector'] = data['sector'].apply(normalize_string).str.split(';')
 
# Flatten the list and further split by spaces to handle multi-word strings
all_words = [word for sublist in data['normalized_sector'] for item in sublist for word in item.split()]
 
# Count the occurrences of each word
word_counts = Counter(all_words)
 
# Find the most common word
most_common_words = word_counts.most_common(10)
least_common_words = word_counts.most_common()

banned_words = ['and', 'or', 'of', 'it', 'the', 'for', 'with', 'we', 'nan']
# print("The 10 most common words, not including 'and' are:") 
# for word, count in most_common_words:
#     if word == banned_words:
#         continue
#     print(f"'{word}' with {count} occurences")

print("The most common words, with at least 2 occurences are:")
words = []
counts = []
for word, count in least_common_words:
    if word in banned_words:
        continue
    if word == 'estate':
        continue
    if count >= 10:
        if word == 'real':
            words.append('real estate')
        else:
            words.append(word)
        counts.append(count)
        print(f"'{word}' with {count} occurences")

sector_counts = pd.DataFrame(data={'name': words, 'count': counts}).set_index('name')
# sector_counts['colour'] = round((sector_counts['count'] - min(sector_counts['count'])) / sector_counts['count'].max(), 3)
sector_counts.to_csv(os.path.join(SRC_DIR, 'overview/membership/_data/sector_word_counts.csv'))
# sector_strings = [item for sublist in data['sector'].str.split(';') for item in sublist]
# string_counts = Counter(sector_strings)
# print(string_counts.most_common())


'nan' is not a stirng type. Converting to string

'nan' is not a stirng type. Converting to string

'nan' is not a stirng type. Converting to string

'nan' is not a stirng type. Converting to string

'nan' is not a stirng type. Converting to string

'nan' is not a stirng type. Converting to string

The most common words, with at least 2 occurences are:
'services' with 62 occurences
'professional' with 54 occurences
'technology' with 40 occurences
'finance' with 35 occurences
'education' with 32 occurences
'admin' with 31 occurences
'business' with 29 occurences
'insurance' with 29 occurences
'construction' with 29 occurences
'notforprofit' with 28 occurences
'support' with 26 occurences
'development' with 26 occurences
'marketing' with 26 occurences
'communications' with 25 occurences
'manufacturing' with 25 occurences
'software' with 22 occurences
'engineering' with 22 occurences
'recruitment' with 18 occurences
'real' with 18 occurences
'legal' with 16 occurences
'information' with 1

In [19]:
advisory_council = 11

if summary.loc['top', "Are you currently a B Corp or in the process of becoming a B Corp?"] == 'No':
    b_corps_pct = round(100 - summary.loc['top_percent_of_count', "Are you currently a B Corp or in the process of becoming a B Corp?"], 1)
else:
    summary.loc['top_percent_of_count', "Are you currently a B Corp or in the process of becoming a B Corp?"]

membership_increase = cs_merged['individuals'].pct_change().mul(100).iloc[-1].round(1)

northern_stars = len(pd.read_csv(os.path.join(SRC_DIR, 'overview/northern-stars/_data/northern_stars.csv')))

names = ["Total members", 
         "Total companies", 
         "Membership increase", 
         f"Companies with {top_company_size} employees", 
         "Top industry", 
         "Geographic reach", 
         "Advisory council", 
         "Northern stars", 
         "B Corps"]

values = [total_members, 
          total_companies, 
          membership_increase, 
          top_company_size_pct, 
          top_industry_pct, 
          "4", 
          advisory_council, 
          northern_stars, 
          b_corps_pct]

footnotes = ["People", 
             "Unique companies", 
             "Since last month", 
             "Of members work in companies of this size", 
             f"Of members work in {top_industry}", 
             "placeholder", 
             "Members represent the network on the True North advisory council", 
             "Companies have been featured as Northern Stars", 
             "Of member's organisations are B Corps or are joining"]

posts = ['','','%','%','%','','','','%']

urls = ['/overview/membership', '/overview/membership', '/overview/membership', '', '', '', 'https://www.brabners.com/insights/true-north/true-north-advisory-council-launches', '/overview/northern-stars', '/themes/sustainable-growth/b-corporations']
dashboard = pd.DataFrame(data={'name':names, 'value': values, 'footnote': footnotes, 'post': posts, 'url': urls})

dashboard

Unnamed: 0,name,value,footnote,post,url
0,Total members,357.0,People,,/overview/membership
1,Total companies,329.0,Unique companies,,/overview/membership
2,Membership increase,1.4,Since last month,%,/overview/membership
3,Companies with 0-9 employees,36.4,Of members work in companies of this size,%,
4,Top industry,0.0,Of members work in,%,
5,Geographic reach,4.0,placeholder,,
6,Advisory council,11.0,Members represent the network on the True Nort...,,https://www.brabners.com/insights/true-north/t...
7,Northern stars,9.0,Companies have been featured as Northern Stars,,/overview/northern-stars
8,B Corps,13.8,Of member's organisations are B Corps or are j...,%,/themes/sustainable-growth/b-corporations


In [20]:
dashboard.to_csv(os.path.join(SRC_DIR, 'overview/membership/_data/true_north_members_list.csv'), index=False)

In [21]:
time_updated('src/overview/membership/_data/updated.yaml')

Timestamp added to file src/overview/membership/_data/updated.yaml
