# Case Studies - Phase 1

This script processes data collected through an initial phase of NCC Case Studies Research, piloting a data collection scheme with cultural organisations in the Newcastle region.

In [196]:
import os
import pandas as pd
from pathlib import Path
from datetime import datetime

In [197]:
# Repository root, two levels above the working directory. Path.resolve()
# returns a new path rather than modifying in place, so its result is what
# gets stored: every location derived below is then absolute.
ROOT = Path('../../').resolve()

# Input folder: the cells below read every .xlsx workbook found in
# CASE_STUDIES_DIR.
RAW_DATA = ROOT / 'raw'
CASE_STUDIES_DIR = RAW_DATA / 'case-studies'

# Output folder for the CSV files written by the cells below.
OUT_DIR = ROOT / 'src' / 'case-studies' / '_data'

In [198]:
def clean_date_column(col):
    """Convert a column header into a pandas timestamp.

    String headers first lose any parenthesised suffix (everything from the
    first ``(`` onwards) and surrounding whitespace. The short labels
    ``"Feb 24"``, ``"Sep 23"`` and ``"Sep 24"`` are then mapped to fixed dates;
    anything else is handed to ``pd.to_datetime``.

    Args:
        col: A column header; typically a string, but any value accepted by
            ``pd.to_datetime`` works.

    Returns:
        The parsed ``pd.Timestamp``, or ``NaT`` when the value cannot be
        interpreted as a date.
    """
    # NOTE(review): all three labels are pinned to the 23rd of the month;
    # confirm that day is intentional.
    known_labels = {
        "Feb 24": "2024-02-23",
        "Sep 23": "2023-09-23",
        "Sep 24": "2024-09-23",
    }

    if isinstance(col, str):
        # Split on the bracket itself, not on " (", so a suffix written
        # without a preceding space ("Feb 24(note)") is removed as well.
        col = col.split('(')[0].strip()
        if col in known_labels:
            return pd.to_datetime(known_labels[col])

    # errors='coerce' turns unparseable headers into NaT instead of raising.
    return pd.to_datetime(col, errors='coerce')


In [199]:
# Build one row of organisation details per workbook and write the combined
# table to org_list.csv.
org_rows = []

# Sorted so the row order of the output does not depend on the arbitrary order
# in which the operating system lists the directory.
for workbook in sorted(CASE_STUDIES_DIR.iterdir()):
    # Skip anything that is not a workbook, and the "~$..." owner files Excel
    # leaves next to a workbook while it is open: they end in .xlsx but
    # cannot be parsed.
    if not workbook.name.endswith('.xlsx') or workbook.name.startswith('~$'):
        continue

    # The organisation name is derived from the file name,
    # e.g. "centre_for_life.xlsx" -> "Centre For Life".
    organisation = workbook.stem.replace('_', ' ').title()

    data = pd.read_excel(workbook, sheet_name='About your organisation', engine='openpyxl')

    # The first four rows hold the organisation details; the first and last
    # columns are discarded. Pivoting turns the label/value pairs into a
    # single row indexed by the organisation name.
    org_details = (
        data.iloc[0:4, 1:-1]
        .rename(columns={'Unnamed: 1': 'source', 'Unnamed: 2': 'amount'})
        .assign(Organisation=organisation)
        .pivot(index='Organisation', columns='source', values='amount')
        .rename(columns={'Postcode of main location ("head office")': 'Postcode'})
    )
    org_rows.append(org_details)

# Concatenate once, after the loop; fall back to an empty frame when the
# folder contains no workbooks.
org_list = pd.concat(org_rows) if org_rows else pd.DataFrame()
org_list.reset_index(inplace=True)

org_list.to_csv(OUT_DIR / 'org_list.csv', index=False)



In [200]:
# For every workbook, write its income rows (amount plus share of the
# workbook's total) to <stem>_income.csv; then add the shares up across
# workbooks and write the result to combined_income.csv.
percentage_by_org = {}

# Sorted so processing order is reproducible across machines.
for workbook in sorted(CASE_STUDIES_DIR.iterdir()):
    # Skip non-workbooks and Excel's "~$..." owner files, which end in .xlsx
    # but cannot be parsed.
    if not workbook.name.endswith('.xlsx') or workbook.name.startswith('~$'):
        continue

    stem = workbook.stem
    funding = pd.read_excel(workbook, sheet_name='About your organisation', engine='openpyxl')

    # Positional rows 9-13 of the sheet, without the first and last columns
    # (presumably the income-by-source table -- confirm against the template).
    income = (
        funding.iloc[9:14, 1:-1]
        .rename(columns={'Unnamed: 1': 'source', 'Unnamed: 2': 'amount'})
        .set_index('source')
    )

    # Each source as a percentage of this workbook's total amount.
    income['percentage'] = (income['amount'] / income['amount'].sum()) * 100

    income.to_csv(OUT_DIR / f'{stem}_income.csv', index=True)

    percentage_by_org[f'{stem}_percentage'] = income['percentage']

# Align the per-workbook series on the union of their source labels. Adding
# them as columns to an initially empty frame would keep only the labels of
# whichever workbook happened to be processed first.
combined = pd.concat(percentage_by_org, axis=1) if percentage_by_org else pd.DataFrame()

# Only the per-source sum across workbooks is kept, as a Series named 'total'.
income_df = combined.sum(axis=1).rename('total')

income_df.to_csv(OUT_DIR / 'combined_income.csv', index=True)

In [211]:
# Reshape each workbook's 'Participation data' sheet into a table with one row
# per date and one column per event, and write it to <stem>_participation.csv.
for workbook in sorted(CASE_STUDIES_DIR.iterdir()):
    # Skip non-workbooks and Excel's "~$..." owner files, which end in .xlsx
    # but cannot be parsed.
    if not workbook.name.endswith('.xlsx') or workbook.name.startswith('~$'):
        continue

    stem = workbook.stem

    # The first five rows of the sheet are skipped, so the next row supplies
    # the column headers. Engine given explicitly, matching the other cells.
    participation = pd.read_excel(
        workbook, sheet_name='Participation data', skiprows=5, engine='openpyxl'
    )

    # Keep the first 20 data rows, drop the first column and the two date
    # columns, and shorten the event-title header.
    participation = (
        participation.iloc[0:20, 1:]
        .rename(columns={'Event / Event Series title': 'Event'})
        .drop(columns=['Start Date', 'End Date'])
    )

    # Workbook-specific fixes: 'twam' carries an extra 'Annual total' column,
    # and the 'centre_for_life' headers are passed through clean_date_column
    # to become timestamps.
    if stem == 'twam':
        participation = participation.drop(columns=['Annual total'])
    if stem == 'centre_for_life':
        participation.columns = [
            col if col == 'Event' else clean_date_column(col)
            for col in participation.columns
        ]

    # Long format (Event, Date, Value), then wide again with dates as rows
    # and events as columns.
    participation = (
        participation
        .melt(id_vars='Event', var_name='Date', value_name='Value')
        .pivot(index='Date', columns='Event', values='Value')
    )

    # Non-numeric cells become NaN; integer-valued columns are downcast.
    participation = participation.apply(pd.to_numeric, downcast='integer', errors='coerce')

    participation.to_csv(OUT_DIR / f'{stem}_participation.csv', index=True)
        

Calculate each income source as a percentage of each organisation's total income.

In [202]:
# Build a table with one row per organisation and one column per income
# source, holding each source's share (%) of that organisation's total, and
# write it to percentage_income.csv.
share_rows = []

# Sorted so the row order of the output does not depend on the arbitrary order
# in which the operating system lists the directory.
for workbook in sorted(CASE_STUDIES_DIR.iterdir()):
    # Skip non-workbooks and Excel's "~$..." owner files, which end in .xlsx
    # but cannot be parsed.
    if not workbook.name.endswith('.xlsx') or workbook.name.startswith('~$'):
        continue

    funding = pd.read_excel(workbook, sheet_name='About your organisation', engine='openpyxl')

    # Positional rows 9-13 of the sheet, without the first and last columns
    # (presumably the income-by-source table -- confirm against the template).
    income = funding.iloc[9:14, 1:-1].rename(
        columns={'Unnamed: 1': 'source', 'Unnamed: 2': 'amount'}
    )

    # Replace each amount with its percentage of the workbook's total.
    income['amount'] = ((income['amount'] / income['amount'].sum()) * 100).round(2)

    # One row per organisation, one column per income source.
    income = income.pivot_table(columns='source', values='amount')
    income.index = [workbook.stem.replace('_', ' ').title()]
    income.index.name = 'Organisation'

    share_rows.append(income)

# Concatenate and round once, after the loop; fall back to an empty frame when
# the folder contains no workbooks.
income_df = (pd.concat(share_rows) if share_rows else pd.DataFrame()).round(2)

income_df.to_csv(OUT_DIR / 'percentage_income.csv', index=True)