# Part D - Identifying Obese Patients


In [0]:
# First off - load all the silly python libraries we are going to need
import pandas as pd
import numpy as np
import os

from google.colab import auth
from google.cloud import bigquery
from google.colab import files

In [0]:
# Interactive Colab sign-in. Run this cell before any cell that queries BigQuery.
auth.authenticate_user() # Authenticates this session so it can access BigQuery

In [0]:
# Google Cloud project that every BigQuery query in this notebook runs under.
project_id='hst-953-2018'
# Also export it through the environment so code that reads
# GOOGLE_CLOUD_PROJECT sees the same project.
os.environ["GOOGLE_CLOUD_PROJECT"]=project_id


def run_query(query):
  """Execute a SQL query on BigQuery and return the result as a pandas DataFrame.

  The query is run under the module-level ``project_id`` with legacy SQL
  disabled, i.e. it must be written in BigQuery standard SQL.

  Args:
    query: The SQL text to execute.

  Returns:
    Whatever ``read_gbq`` returns for the query (a pandas DataFrame).
  """
  # The old `verbose=False` argument is intentionally not passed: it was
  # deprecated by pandas-gbq and later removed from read_gbq, so passing it
  # raises a TypeError on current pandas versions.
  return pd.io.gbq.read_gbq(
      query,
      project_id=project_id,
      configuration={'query': {'useLegacySql': False}})

In [3]:
# Load the clinical notes from the `noteevents` table on BigQuery.
# Loading every note category would take several minutes, so for this example
# the query is restricted to the 'Discharge summary' category only.
# Offline alternative (presumably a local CSV copy of the same notes -- confirm):
#notes = pd.read_csv('D.csv')
notes = run_query('''
    SELECT *
    FROM `physionet-data.mimiciii_notes.noteevents`
    WHERE CATEGORY = 'Discharge summary'
    ''')


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=495642085510-k0tmvj2m941jhre2nbqka17vqpjfddtd.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=Q8SyKe2By0XPhn16vV3avizYNs9hNF&prompt=consent&access_type=offline
Enter the authorization code: 4/dwB5Fk17lT37MvxlGfyhzN7O1XIfrSRv2XlQNK2fRdUjQ93oxF47tjs


In [4]:
# Preview the first few rows to check which columns the query returned.
notes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,811,88360,130127,2149-11-20,,,Discharge summary,Report,,,Admission Date: [**2149-11-16**] ...
1,1495,32013,185178,2188-07-14,,,Discharge summary,Report,,,Admission Date: [**2188-7-2**] D...
2,2714,31260,191494,2107-08-11,,,Discharge summary,Report,,,Admission Date: [**2107-8-5**] D...
3,2615,58938,178153,2179-02-06,,,Discharge summary,Report,,,Admission Date: [**2179-1-31**] ...
4,3455,17552,175683,2150-04-06,,,Discharge summary,Report,,,Admission Date: [**2150-4-3**] Discharge ...


In [0]:
# Load the gold standard: the reference labels our predictions are scored against.
# Offline alternative (presumably a local CSV copy of the same table -- confirm):
#gold = pd.read_csv('obese-gold.csv')
gold = run_query('''
    SELECT *  
    FROM `hst-953-2018.NLP_workshop.obese_gold` 
    ''')


In [11]:
# Display the raw text of one note (index label 1) to see what the free text looks like.
notes['TEXT'][1]

'Admission Date:  [**2188-7-2**]              Discharge Date:   [**2188-7-14**]\n\n\nService: MEDICINE\n\nAllergies:\nPatient recorded as having No Known Allergies to Drugs\n\nAttending:[**First Name3 (LF) 800**]\nChief Complaint:\nDiarrhea, hypotension\n\nMajor Surgical or Invasive Procedure:\nnone\n\nHistory of Present Illness:\n87 male with history of untreated colon cancer, PAF (not on\nanticoagulation), recent hospitalization for hypoxia, weakness\nand a UTI who presents with ongoing weakness and diarrhea. He\nwas noted to be weak at home, and a friend urged him to activate\nhis lifeline. He states that he has been having lots of\ndiarrhea, roughly every two hours, since his last discharge. He\nhas not been having any fevers, chills, abdominal pain, nausea,\nvomiting, or bleeding.\n.\nHis last hospitalization concluded that his weakness was due to\ndeconditioning, and his shortness of breath was due to fluid\noverload. He was treated with flagyl for two days last\nadmission, and, 

In [0]:
# Search terms: a note containing any of these is taken as evidence that the
# patient is obese. Each term is looked up as a substring of the note text.
terms = ['obese','overweight','obesity','adipose']

In [23]:
# Scan all of the notes for the search terms and remember which patient each
# matching note belongs to.
#
# `matches` holds one SUBJECT_ID (patient id, not note id) per matching note,
# so a patient with several matching notes appears several times and
# len(matches) is the number of matching notes.
#
# Matching is a plain, case-sensitive substring test: "Obese" is not found by
# the term 'obese', while a word that merely contains a term is.

# Treat a missing note text as empty so it simply never matches instead of
# raising a TypeError in the substring test.
note_text = notes['TEXT'].fillna('')
has_term = note_text.map(lambda text: any(term in text for term in terms)).astype(bool)

matches = notes.loc[has_term, 'SUBJECT_ID'].tolist()

print("Found " + str(len(matches)) + " matching notes.")

Found 6301 matching notes.


In [0]:
# Build our predictions. Start from a copy of the gold-standard table so there
# is exactly one row per gold-standard patient, then overwrite the 'obese_1'
# column with our own labels.
myscores = gold.copy()
myscores['obese_1'] = 0  # 0 = no prediction made for this patient

# Mark every patient who has at least one matching note as obese (1).
# A single membership test replaces re-scanning the whole column once per
# matching note; the resulting labels are the same.
myscores.loc[myscores["subject_id"].isin(matches), 'obese_1'] = 1


In [25]:
# Score the predictions in `myscores` against the labels in `gold`.
#
# Label values in 'obese_1': 1 = positive, -1 = negative, 0 = no prediction.
# Rows with a prediction of 0 are counted as skipped and not scored. A scored
# row whose gold label is neither 1 nor -1 falls into none of the four buckets.
# SCORE = correct predictions minus incorrect predictions.

predicted = myscores['obese_1']
actual = gold['obese_1']  # compared with `predicted` row by row via the shared index

predicted_pos = predicted == 1
predicted_neg = predicted == -1
actual_pos = actual == 1
actual_neg = actual == -1

skipped = int((predicted == 0).sum())
truepositive = int((predicted_pos & actual_pos).sum())
truenegative = int((predicted_neg & actual_neg).sum())
falsepositive = int((predicted_pos & actual_neg).sum())
falsenegative = int((predicted_neg & actual_pos).sum())

report = (
    ("Skipped:\t", skipped),
    ("True Pos:\t", truepositive),
    ("True Neg:\t", truenegative),
    ("False Pos:\t", falsepositive),
    ("False Neg:\t", falsenegative),
    ("SCORE:\t\t", truepositive + truenegative - falsepositive - falsenegative),
)
for label, value in report:
    print(label + str(value))

Skipped:	76
True Pos:	21
True Neg:	0
False Pos:	3
False Neg:	0
SCORE:		18
