# Raw data checks for table `therapeutics`

### Methods
The therapeutics dataset has been linked to patients in OpenSAFELY-TPP, covering 40% of England's population.
All row/patient counts are rounded to the nearest 10, and counts <=7 are removed.
All analytical code and output are available for inspection at the [OpenSAFELY GitHub repository](https://github.com/opensafely).


In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from IPython.display import HTML
from IPython.display import Markdown as md
from IPython.core.display import HTML as Center
from IPython.display import Image, display
%matplotlib inline
import pyodbc
from datetime import date, datetime

sys.path.append('../analysis/')
from utilities import *
from sense_checking import *
from config import dbconn, dataset, columns_to_describe, duplicates

# Allow up to 250 characters per cell before pandas truncates displayed values.
pd.set_option('display.max_colwidth', 250)

# Get the server credentials from the environment. This rebinds `dbconn`,
# replacing the value imported from `config`.
_database_url = os.environ.get('FULL_DATABASE_URL')
if _database_url is None:
    # Fail with an actionable message instead of the opaque
    # "'NoneType' object has no attribute 'strip'" the bare lookup produced.
    raise RuntimeError(
        "Environment variable FULL_DATABASE_URL is not set; "
        "it must hold the database connection URL used by this notebook."
    )
# Drop any leading/trailing double quotes wrapped around the value.
dbconn = _database_url.strip('"')



In [None]:

# Tell the reader when the notebook was executed and which filter was applied.
# NOTE(review): `schema_filter` is not in the explicit `from config import ...`
# list; presumably it is supplied by one of the star imports - confirm.
run_date = date.today().strftime('%Y-%m-%d')
provenance_note = (
    f"This notebook was run on {run_date}\n"
    "    and reflects the dataset at this date, \n"
    f"    but has been filtered to `{schema_filter}`.\n"
    "    "
)
display(md(provenance_note))

In [None]:

# Render the section heading, then keep the schema lookup as the cell's final
# expression so that its return value (if any) is shown as the cell output.
schema_heading = md("### Schema")
display(schema_heading)
get_schema(
    dbconn,
    table=dataset,
    where=schema_filter,
)


In [None]:

# Render the section heading.
summaries_heading = md("### Column Summaries")
display(summaries_heading)

# Forward the settings held in `columns_to_describe` as keyword arguments;
# the call stays the cell's final expression so its result is displayed.
summary_options = {
    option: columns_to_describe[option]
    for option in ("columns", "threshold", "where", "include_counts")
}
counts_of_distinct_values(dbconn, table=dataset, **summary_options)
