# Dept for Culture, Media & Sport 

## Total Income of DCMS-funded cultural organisations 2022/2023

In [54]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
import re

np.set_printoptions(suppress=True)
pd.set_option('display.float_format', '{:,.0f}'.format)

In [55]:
DATA_DIR = Path('../../data/dcms/Total_income_for_DCMS-funded_cultural_organisations_2022_23_data_tables_Final_SL.ods')
OUT_DIR = Path('../../src/data/dcms/funding/_data')
RELEASE_DIR = OUT_DIR / 'release'
RELEASE_DIR.mkdir(exist_ok=True, parents=True)

In [56]:
def process_sheet(data): 
    data = data.replace('x', 0).dropna(axis='columns', how='all')
    data_latest = data[['Name of organisation', '2022/2023']]

    data_latest = (
        data_latest.loc[data_latest['2022/2023'] != 0]
        .loc[data_latest['Name of organisation'] != 'Grand Total']
    )

    data_latest['Name of organisation'] = (
        data_latest['Name of organisation']
        .apply(lambda x: re.sub(r'\[note \d+\]', '', x))
    )

    data_latest['2022/2023'] = data_latest['2022/2023'].round(0).astype(int)
    
    return data_latest

def clean_orgs(name):
    name = re.sub('Of which from:Arts Council England ', '', name)
    return re.sub(r'\[note \d+\]', '', name)

In [57]:
grant_in_aid = pd.read_excel(DATA_DIR, engine='odf', sheet_name=['3'], skiprows=4)
total_income_generated = pd.read_excel(DATA_DIR, engine='odf', sheet_name=['5'], skiprows=5)

grant_in_aid = pd.DataFrame(grant_in_aid['3'])
total_income_generated = pd.DataFrame(total_income_generated['5'])

In [58]:
grant_in_aid = grant_in_aid[['Name of organisation', '2022/2023']]
total_income_generated = total_income_generated[['Name of organisation', '2022/2023']]

grant_in_aid['Name of organisation'] = grant_in_aid['Name of organisation'].apply(lambda x: clean_orgs(x))
total_income_generated['Name of organisation'] = grant_in_aid['Name of organisation'].apply(lambda x: clean_orgs(x))

def prep_output(df, name):
    df[name] = df[name].str.strip()
    return df

merged = (
    total_income_generated.merge(grant_in_aid, how='outer', on='Name of organisation')
    .rename(columns={
        'Name of organisation': 'organisation',
        '2022/2023_x': 'total_income',
        '2022/2023_y': 'grant_in_aid'
    })
    .replace('x', np.nan)
)
merged = merged.loc[merged['organisation']!='Grand Total']

merged = (
    merged.loc[merged['organisation']!='Grand Total']
    .set_index('organisation')
    .dropna(axis='rows', how='all')
    .fillna(0).astype(int)
)


merged = merged.reset_index()

ace_funding = merged.loc[
    merged['organisation'].isin(['National Portfolio Organisations ', 'Arts Council England'])
]

ace_funding.pipe(prep_output, name='organisation').to_csv(Path(RELEASE_DIR / 'ace_funding.csv'), index=False)

merged = merged.loc[
    ~merged['organisation'].isin(['National Portfolio Organisations ', 'Arts Council England'])
]

merged.drop(columns=['total_income']).pipe(prep_output, name='organisation').to_csv(Path(RELEASE_DIR/'income_vs_funding.csv'), index=False)


  .replace('x', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = df[name].str.strip()


### Process Grant-in-Aid figures

In [59]:
grant_in_aid = process_sheet(grant_in_aid)
grant_in_aid['Name of organisation'] = grant_in_aid['Name of organisation'].apply(lambda x: re.sub('Of which from:Arts Council England ', '', x))

grant_in_aid.to_csv(Path(OUT_DIR/ 'grant_in_aid_latest.csv'), index=False)


  data = data.replace('x', 0).dropna(axis='columns', how='all')


### Process total income

In [60]:
total_income_generated = process_sheet(total_income_generated)
total_income_generated.pipe(prep_output, name='Name of organisation').to_csv(Path(OUT_DIR / 'total_income_generated_latest.csv'), index=False)

  data = data.replace('x', 0).dropna(axis='columns', how='all')
