# Prevalence of PRIMIS codelists

In [1]:
import pandas as pd
import numpy as np
import os

# Tag exported filenames with the backend named in OPENSAFELY_BACKEND,
# falling back to 'tpp' when the variable is not set.
suffix = "_" + os.environ.get('OPENSAFELY_BACKEND', 'tpp')

# Ensure the export directory exists before any results are written to it.
safe_outputs_dir = os.path.join("..", "safe-outputs")
os.makedirs(safe_outputs_dir, exist_ok=True)

### Load data

In [2]:
# Read ../output/input.csv into the working dataframe.
input_path = os.path.join("..", "output", "input.csv")
df = pd.read_csv(input_path)

### Create ageband

In [3]:
agebands = ['16-39', '40-69', '70+']

# Left-closed bin edges, i.e. [16, 40), [40, 70) and [70, 120).
ageband_edges = [16, 40, 70, 120]

# Ages outside every bin (and missing ages) become a real NaN.
# The earlier np.select(..., default=np.nan) mixed string choices with a float
# default: depending on the NumPy version that either yields the literal
# string 'nan' or raises a TypeError because no common dtype exists.
# The result is cast back to plain object values so that the column holds
# ordinary strings rather than a categorical.
df['ageband'] = pd.cut(
    df['age'], bins=ageband_edges, right=False, labels=agebands
).astype(object)

# Treat the sex codes 'I' and 'U' as missing so that only the remaining
# sex values can be selected downstream.
df['sex'] = df['sex'].replace(['I', 'U'], np.nan)

### Summarise data

In [4]:
# Columns to summarise: every column of df apart from the raw age and the
# patient identifier (ageband and sex stay in, as they are the grouping keys).
excluded_cols = {"age", "patient_id"}
cols_allyears = [name for name in df.columns if name not in excluded_cols]

# Columns that are only counted when their value is recent.
cols_recent = ["preg", "pregdel"]

In [5]:
# Keep only rows whose sex is F or M and whose ageband is one of the defined
# bands. Filtering first and copying afterwards avoids duplicating the whole
# of df, and leaves df1 as an independent frame that can be modified without
# touching df.
valid_rows = df["sex"].isin(["F", "M"]) & df["ageband"].isin(agebands)
df1 = df.loc[valid_rows].copy()

### Calculate population denominators

In [6]:

# Number of distinct patients in each ageband/sex group, laid out as a single
# 'total_population' row with one column per group.
group_sizes = df1.groupby(["ageband", "sex"])[["patient_id"]].nunique()
out2 = group_sizes.rename(columns={"patient_id": "total_population"}).T

# Append the overall population: the sum across every ageband/sex column.
out2["total"] = out2.sum(axis=1)

out2

ageband,16-39,16-39,40-69,40-69,70+,70+,total
sex,F,M,F,M,F,M,Unnamed: 7_level_1
total_population,24,22,22,20,3,1,92


### Codelist counts

In [7]:
# For codes that are only relevant if recent (pregnancy/delivery), blank out
# values before 2020 in that column only.
# The earlier `df1[(df1[c] < 2020)] = np.nan` overwrote every column of the
# matching rows (including sex, ageband and all other codelists), which
# removed those patients from every count below.
for c in cols_recent:
    df1[c] = df1[c].where(df1[c] >= 2020)

# Summarise by age and gender: number of non-missing values of each codelist
# column per ageband/sex group, with one row per codelist.
out = df1[cols_allyears].groupby(["ageband", "sex"]).count().transpose()

# Suppress low numbers: any count from 0 to 4 is reported as 0.
out = out.replace([0, 1, 2, 3, 4], 0)

# Total count for each codelist across all ages and sexes. This is computed
# after suppression, so suppressed cells contribute 0 to the total.
out["total"] = out.sum(axis=1)

# Add the population denominators as a final row.
out = pd.concat([out, out2])

out.tail()

ageband,16-39,16-39,40-69,40-69,70+,70+,total
sex,F,M,F,M,F,M,Unnamed: 7_level_1
sev_obesity,0,0,0,0,0,0,0
shield,0,0,0,0,0,0,0
smhres,0,0,0,0,0,0,0
spln_cov,0,0,0,0,0,0,0
total_population,24,22,22,20,3,1,92


### Codelist prevalence rates

In [8]:
# Calculate rates per 1,000 population, rounded to one decimal place.
# The table is cast to float once up front so that writing the rates back does
# not depend on integer columns being implicitly upcast during assignment.
out = out.astype(float)
codelist_rows = out.index.drop("total_population")
# Dividing a frame by the 'total_population' row aligns on the column labels,
# so each ageband/sex column is divided by its own denominator.
out.loc[codelist_rows] = (
    1000 * out.loc[codelist_rows] / out.loc["total_population"]
).round(1)

# Export to csv; the filename carries the backend suffix.
out.to_csv(os.path.join("..", "safe-outputs", f"code-prevalence-by-age-and-sex{suffix}.csv"))

out

ageband,16-39,16-39,40-69,40-69,70+,70+,total
sex,F,M,F,M,F,M,Unnamed: 7_level_1
ast,0.0,0.0,0.0,0.0,0.0,0.0,0.0
astadm,208.3,0.0,0.0,250.0,0.0,0.0,108.7
astrx,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bmi,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bmi_stage,0.0,0.0,0.0,0.0,0.0,0.0,0.0
carehome,0.0,0.0,0.0,0.0,0.0,0.0,0.0
carer,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chd_cov,0.0,0.0,0.0,250.0,0.0,0.0,54.3
ckd15,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ckd35,0.0,0.0,0.0,0.0,0.0,0.0,0.0
