In [1]:
import sys
from pathlib import Path

import pandas as pd

# Absolute path of the directory three levels above the current working directory.
TOP_DIR = Path('../../../').resolve()

# Append it to the import search path once only, so re-running this cell
# does not add duplicate entries.
if str(TOP_DIR) not in sys.path:
    sys.path.append(str(TOP_DIR))

In [2]:
from lib.util.geo import postcode_formatter, postcode_to_ward_code

In [3]:
# Columns read from the partnerships export.
# Not loaded (kept here for reference, as in the original selection):
#   'Partner Lead Contact', 'Baseline survey completed by partner', 'Notes',
#   'Project Description', 'Offered SAIL training', 'L23 economic contribution',
#   'Partner Venue Name', 'Contract signed by partner',
#   'LEEDS 2023 Assignee.id', 'LEEDS 2023 Assignee.email', 'LEEDS 2023 Assignee.name',
#   'Last Modified By.id', 'Last Modified By.email', 'Last Modified By.name',
#   'Activity log completed by partner', 'Final survey completed by partner',
#   'Number of unique countries', 'Total budget'
source_columns = [
    'Partner Organisation',
    'List of countries involved',
    'End date',
    'Start date',
    'Partner Venue Postcode',
    'Partnership Status',
    'Project Artform',
    'Project Name',
    'Project Output',
    'International element to project?',
    'Online/in person/both',
    'Number of countries involved',
    'Season',
    'Duration of activity/event',
    'Contains heritage, young people',
    'Digital audiences',
    'Audience numbers',
    'number of participants',
    'Total number of events',
]

# Columns that pandas should parse as datetimes while reading.
date_columns = ['Start date', 'End date']

data = pd.read_csv(
    '../../../working/metrics/partnerships/all.csv',
    usecols=source_columns,
    parse_dates=date_columns,
)

In [4]:
# Show the loaded column names as a plain list, before they are renamed.
list(data.columns)

['Partner Organisation',
 'List of countries involved',
 'End date',
 'Start date',
 'Partner Venue Postcode',
 'Partnership Status',
 'Project Artform',
 'Project Name',
 'Project Output',
 'International element to project?',
 'Online/in person/both',
 'Number of countries involved',
 'Season',
 'Duration of activity/event',
 'Contains heritage, young people',
 'Digital audiences',
 'Audience numbers',
 'number of participants',
 'Total number of events']

In [5]:
# Rename the headers to lower-case identifiers: trim surrounding whitespace,
# lower-case, then replace every run of characters outside [a-z0-9] with a
# single underscore (so trailing punctuation such as '?' becomes a trailing '_').
data.columns = (
    data.columns
    .str.strip()
    .str.lower()
    .str.replace(r'[^a-z0-9]+', '_', regex=True)
)

In [6]:
# Pass the whole postcode column through `postcode_formatter`, then map each
# resulting value with `postcode_to_ward_code` (both imported from lib.util.geo;
# their exact behaviour is not visible in this notebook).
formatted_postcodes = data['partner_venue_postcode'].pipe(postcode_formatter)
data['partner_venue_ward_code'] = formatted_postcodes.map(postcode_to_ward_code)

In [7]:
# Turn the free-text country field into a list of country names per row.
# Missing values are propagated by the .str accessor, so rows without any
# countries stay missing instead of becoming empty lists.
data['list_of_countries_involved'] = (
    data['list_of_countries_involved']
    # Remove full stops. A raw string is used because '\.' in a normal string
    # literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
    .str.replace(r'\.', '', regex=True)
    .str.strip()
    # Commas and line breaks both separate entries. Whitespace (including a
    # stray '\r') around a separator is absorbed, and a run of adjacent
    # separators counts as one, so no padded or empty entries are produced.
    .str.replace(r'\s*(?:[\n,]\s*)+', '|', regex=True)
    # Literal replacement; regex=False makes this independent of the pandas
    # version's default for `regex`.
    .str.replace('The Netherlands', 'Netherlands', regex=False)
    # Drop separators left at either end by leading/trailing commas.
    .str.strip('|')
    .str.split('|')
)


In [8]:
# Replace the loaded 'number_of_countries_involved' column with the length of
# each row's country list; rows with no list (NaN) count as zero.
#
# The result is assigned back explicitly. The previous form,
# data[col].fillna(0, inplace=True), is chained assignment through an inplace
# method: pandas warns about it and, under copy-on-write, it no longer
# modifies `data` at all.
data['number_of_countries_involved'] = (
    data['list_of_countries_involved']
    .map(len, na_action='ignore')  # NaN rows are skipped and stay NaN here
    .fillna(0)
)

In [9]:
# Directory that receives the CSV files written by the cells below.
OUTPUT_DIR = Path('../../../data/metrics/partnerships')

# Create it together with any missing parents; an existing directory is fine.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [10]:
# Write the prepared frame to partnerships.csv without the row index.
partnerships_csv = OUTPUT_DIR / 'partnerships.csv'
data.to_csv(partnerships_csv, index=False)

In [11]:
# Build a one-column table of the distinct country names found across all rows
# and write it to countries.csv.
has_country_list = data.list_of_countries_involved.notna()

unique_countries = (
    data.loc[has_country_list, 'list_of_countries_involved']
    .explode()                 # one row per listed country
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
    .to_frame('unique_countries')
)

unique_countries.to_csv(OUTPUT_DIR.joinpath('countries.csv'), index=False)