# Prevalence of PRIMIS codelists

In [1]:
import pandas as pd
import numpy as np
import os

suffix = f"_{os.environ.get('OPENSAFELY_BACKEND', 'tpp')}"
os.makedirs(os.path.join("..","safe-outputs"), exist_ok=True)

### Load data

In [2]:
# import first row to get col names
df_head = pd.read_csv(os.path.join("..","output","input.csv"), nrows=1)

# filter out columns not needed
cols_all = df_head.columns
cols_to_use = [c for c in cols_all if c not in ["hashed_organisation", "patient_id"]]

# import full dataset without columns not needed
df = pd.read_csv(os.path.join("..","output","input.csv"), usecols=cols_to_use)

for col in df.columns:
    if col in ["age", "sex"]:
        continue
    # Most columns only contain years or NaN so we can store them as
    # float16s, which saves a lot of memory
    df[col] = df[col].astype("float16")

### Create ageband

In [3]:
agebands = ['16-39', '40-69', '70+']
conditions = [
    (df['age'] >= 16) & (df['age'] < 40),
    (df['age'] >= 40) & (df['age'] < 70),
    (df['age'] >= 70) & (df['age'] < 120)]
choices = agebands
df['ageband'] = np.select(conditions, choices, default=np.nan)

# filter to largest sex groups
df['sex'] = df['sex'].replace(['I','U'], np.nan)


### Summarise data

In [4]:
# list columns of interest 
cols_allyears = [c for c in df.columns if c not in ["age","patient_id"]]
cols_recent = ["preg", "pregdel"]

In [5]:
# filter to valid sexes and agegroups only
df1 = df.copy().loc[(df["sex"].isin(["F","M"])) & (df["ageband"].isin(agebands))]

### Calculate population denominators

In [6]:

out2 = df1.groupby(["ageband", "sex"])[["registered"]].count().rename(columns={"registered":"total_population"}).transpose()

# calculate total population across all ages and sexes
out2["total"] = out2.sum(axis=1)

out2

ageband,16-39,16-39,40-69,40-69,70+,70+,total
sex,F,M,F,M,F,M,Unnamed: 7_level_1
total_population,1500,1475,1837,1857,629,632,7930


### Codelist counts

In [7]:
# for codes that are only relevant if recent (pregnancy/delivery), remove any older dates
for c in cols_recent:
    df1.loc[(df1[c]<2020), c] = np.nan

# summarise by age and gender
out = df1[cols_allyears].groupby(["ageband", "sex"]).count().transpose()

# suppress low numbers
out = out.replace([0,1,2,3,4],0)

# calculate total count for each codelist across all ages and sexes
out["total"] = out.sum(axis=1)

# add population denominators
out = pd.concat([out,out2])

display(out.tail())

# export to csv
out = out.replace([0,1,2,3,4], "<5")
out.to_csv(os.path.join("..","safe-outputs",f"code-counts-by-age-and-sex{suffix}.csv"))


ageband,16-39,16-39,40-69,40-69,70+,70+,total
sex,F,M,F,M,F,M,Unnamed: 7_level_1
shield,152,140,173,193,60,70,788
smhres,145,118,191,207,61,58,780
spln_cov,147,156,195,171,80,61,810
registered,1500,1475,1837,1857,629,632,7930
total_population,1500,1475,1837,1857,629,632,7930


### Codelist prevalence rates

In [8]:
# calculate rates
for i in out.index.drop("total_population"):
    out.loc[i] = (1000*out.loc[i]/out.loc["total_population"]).round(1)

# export to csv    
out.to_csv(os.path.join("..","safe-outputs",f"code-prevalence-by-age-and-sex{suffix}.csv"))

out

TypeError: loop of ufunc does not support argument 0 of type float which has no callable rint method