# Prevalence of PRIMIS codelists

In [1]:
import pandas as pd
import numpy as np
import os

# Output filenames are tagged with the backend this run executes on; when
# OPENSAFELY_BACKEND is not set the tag falls back to "tpp".
suffix = "_" + os.environ.get("OPENSAFELY_BACKEND", "tpp")

# Make sure the directory that receives the summary table exists.
os.makedirs(os.path.join("..", "safe-outputs"), exist_ok=True)

### Load data

In [2]:
# Read the study input table from the sibling "output" directory.
input_path = os.path.join("..", "output", "input.csv")
df = pd.read_csv(input_path)

### Create ageband

In [3]:
# Age bands are half-open intervals [lower, upper): 16-39, 40-69 and 70-119.
agebands = ['16-39', '40-69', '70+']
age_bin_edges = [16, 40, 70, 120]

# pd.cut leaves ages outside 16-119 (and missing ages) as genuine NaN.
# The previous np.select(..., default=np.nan) mixed string choices with a
# float default, which yields the literal string 'nan' on older NumPy and
# raises a TypeError (no common dtype) on newer releases.
# Cast back to object so the column holds plain strings rather than a
# categorical (a categorical key would change how groupby reports groups).
df['ageband'] = pd.cut(
    df['age'], bins=age_bin_edges, right=False, labels=agebands
).astype(object)

# filter to largest sex groups: the 'I' and 'U' codes are replaced by NaN
df['sex'] = df['sex'].replace(['I', 'U'], np.nan)

### Summarise data

In [4]:
# Columns of interest: every column of df except the raw age and the
# patient identifier.
excluded_cols = ("age", "patient_id")
cols_allyears = [name for name in df.columns if name not in excluded_cols]

# Codes that get restricted to recent values before they are counted.
cols_recent = ["preg", "pregdel"]

In [5]:
# filter to valid sexes and agegroups only; copy after filtering so the
# edits below act on an independent frame rather than a view of df
valid_rows = df["sex"].isin(["F", "M"]) & df["ageband"].isin(agebands)
df1 = df.loc[valid_rows].copy()

# for codes that are only relevant if recent (pregnancy/delivery), remove any older dates.
# Only the code's own column is blanked: indexing the whole frame with the
# boolean mask (df1[mask] = nan) would wipe every column of those rows,
# including ageband and sex, and drop those patients from all other counts.
for c in cols_recent:
    df1[c] = df1[c].mask(df1[c] < 2020)

# summarise by age and gender and suppress low numbers (counts of 1-4 are
# reported as 0); count() tallies non-missing values per column
out = (
    df1[cols_allyears]
    .groupby(["ageband", "sex"])
    .count()
    .transpose()
    .replace([0, 1, 2, 3, 4], 0)
)
# row total is computed from the already-suppressed counts
out["total"] = out.sum(axis=1)

out.to_csv(os.path.join("..", "safe-outputs", f"code-count-by-age-and-sex{suffix}.csv"))
out

ageband,16-39,16-39,40-69,40-69,70+,70+,total
sex,F,M,F,M,F,M,Unnamed: 7_level_1
ast,0,0,0,0,0,0,0
astadm,5,0,0,5,0,0,10
astrx,0,0,0,0,0,0,0
bmi,0,0,0,0,0,0,0
bmi_stage,0,0,0,0,0,0,0
carehome,0,0,0,0,0,0,0
carer,0,0,0,0,0,0,0
chd_cov,0,0,0,5,0,0,5
ckd15,0,0,0,0,0,0,0
ckd35,0,0,0,0,0,0,0
