In [6]:
import pandas as pd

In [7]:
#Review Hop Teaming Dataset
hop_review = pd.read_csv('data/DocGraph_Hop_Teaming_2017.csv', nrows = 100)

In [8]:
hop_review.head()

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1003863580,1000000004,19,19,108.895,84.598
1,1043250400,1000000004,20,20,87.0,77.173
2,1033239413,1000000004,20,20,58.8,76.982
3,1033142146,1000000004,491,535,10.232,36.558
4,1013957562,1000000004,25,26,78.692,59.305


In [48]:
hop_review.shape

(100, 6)

In [47]:
#Review nppes Dataset 
nppes_review = pd.read_csv('data/npidata_pfile_20050523-20210207.csv', nrows = 100)
nppes_review

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Employer Identification Number (EIN),Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,...,Healthcare Provider Taxonomy Group_7,Healthcare Provider Taxonomy Group_8,Healthcare Provider Taxonomy Group_9,Healthcare Provider Taxonomy Group_10,Healthcare Provider Taxonomy Group_11,Healthcare Provider Taxonomy Group_12,Healthcare Provider Taxonomy Group_13,Healthcare Provider Taxonomy Group_14,Healthcare Provider Taxonomy Group_15,Certification Date
0,1679576722,1.0,,,,WIEBE,DAVID,A,,,...,,,,,,,,,,
1,1588667638,1.0,,,,PILCHER,WILLIAM,C,DR.,,...,,,,,,,,,,
2,1497758544,2.0,,<UNAVAIL>,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,...,,,,,,,,,,
3,1306849450,1.0,,,,SMITSON,HAROLD,LEROY,DR.,II,...,,,,,,,,,,
4,1215930367,1.0,,,,GRESSOT,LAURENT,,DR.,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1649273673,1.0,,,,KOPCZYNSKI,TODD,M,,,...,,,,,,,,,,09/29/2020
96,1083617013,2.0,,<UNAVAIL>,AMOSKEAG HEALTH,,,,,,...,,,,,,,,,,01/31/2020
97,1992708929,2.0,,<UNAVAIL>,NOVAMED MANAGEMENT SERVICES LLC,,,,,,...,,,,,,,,,,
98,1801899836,1.0,,,,ZICHELLA,SARAH,L,MRS.,,...,,,,,,,,,,


In [46]:
#Determine which columns have Taxonomy Switch as "Y".  Answer:  1,2,3,4,7
nppes_review['Healthcare Provider Primary Taxonomy Switch_4'].unique()

array([nan, 'X', 'Y', 'N'], dtype=object)

Let's say that our goal is to find all incidents where the Tencode Description was 'SHOTS FIRED'.

One thing we could try is to use the chunksize argument in our pd.read_csv call. What this does is to create an iterable which returns just the specified number of rows at a time.

Iterating through a file using chunks can look like this, but can also be structured differently (for example, using a list comprehension).

``` 
chunks = pd.read_csv('data/Metro_Nashville_Police_Department_Calls_for_Service.csv', chunksize = 10000)

for chunk in chunks:
    # Do something 
```

Here is what we need to do:

1. Create an iterable by using the chunksize argument.

2. For each chunk, filter to just the rows where the 'Tencode Description' column is 'SHOTS FIRED'. Store these rows.

3. Concatenate all the results together into a single dataframe.

In [9]:
#example
# shots_fired = pd.concat([chunk[chunk['Tencode Description'] == 'SHOTS FIRED'] 
#                          for chunk in pd.read_csv('data/Metro_Nashville_Police_Department_Calls_for_Service.csv', 
#                                                   chunksize = 10000)])


The above solution would work for one-off tasks. However, if you are going to be working extensively with a dataset or merging two large datasets, it would be a bit cumbersome to have to chunk through one or both datasets multiple times.

As an alternative to working in just Python, we can use a different tool which works better on large datasets - SQL.

In this notebook, we will make use of SQLite, which is a file-based relational database management system. We can interact with SQLite databases through the sqlite3 library.

In [49]:
import sqlite3

First, we need to connect to our database. The connect function will either create a new database if one does not already exist or connect to an existing one.

In [50]:
#create an empty df
db = sqlite3.connect('data/hop_teaming.sqlite')

Now, we can chunk through the data and for each row, add the rows to a table in our sqlite database. 
To keep track of how much progress has been made, we can use the `tqdm` library.

In [51]:
from tqdm.notebook import tqdm  
#tqdm is a function that helps with chunking by providing time bar 

In [52]:
for chunk in tqdm(pd.read_csv('data/DocGraph_Hop_Teaming_2017.csv', chunksize = 10000)):
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
    chunk.to_sql('team', db, if_exists = 'append', index = False)            # Append the chunk to a calls table
    
    #Q: on 3rd line, 'team' is the new table name within large db called hop_teaming.sqlite

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




To speed up queries which use a specific column, we can create an **index** on that column. This causes the database to store that column in a way that helps it to retrieve rows quicker.

In [53]:
#ignore error code below if I run 2+ times 
db.execute('CREATE INDEX from_npi ON team(from_npi)')

<sqlite3.Cursor at 0x1ef5c4ab8f0>

Finally, we should close our database connection.

In [54]:
db.close()

Now, let's see how long it takes to find all rows corresponding to 'SHOTS FIRED'.

In [14]:
#shots_sqlite.head()

Unnamed: 0,event_number,call_received,complaint_number,shift,tencode,tencode_description,tencode_suffix,tencode_suffix_description,disposition_code,disposition_description,block,street_name,unit_dispatched,sector,zone,rpa,latitude,longitude,mapped_location
0,PD202000845566,11/21/2020 09:24:37 AM,,,83,SHOTS FIRED,P,PROGRESS,10,,,,611A,TE,623K,3107.0,,,
1,PD202000712516,09/23/2020 09:47:43 PM,,,83,SHOTS FIRED,P,PROGRESS,11,,,,825B,N,617,4325.0,,,
2,PD202000711572,09/23/2020 01:42:22 PM,,,83,SHOTS FIRED,R,REPORT,10,,,,830A,MT,833,7109.0,,,
3,PD202000712380,09/23/2020 07:54:59 PM,,,83,SHOTS FIRED,P,PROGRESS,11,,,,821B,N,625,3173.0,,,
4,PD202000712444,09/23/2020 08:38:40 PM,,,83,SHOTS FIRED,P,PROGRESS,11,,,,724B,N,627,3227.0,,,


The Metro Police Department Incidents database 

Add a table named "incidents" to your police_calls.sqlite database and load in the Metro_Nashville_Police_Department_Incidents.csv file


In [None]:
db = sqlite3.connect('data/hop_teaming.sqlite')

for chunk in tqdm(pd.read_csv('data/npidata_pfile_20050523-20210207.csv', chunksize = 10000)):
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
    chunk.to_sql('providers', db, if_exists = 'append', index = False)            # Append the chunk to an incidents table
    
db.close()

# doing the same here as we did above with the 'team' table.  Calling this table 'providers' 
     

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)


In [59]:
# create a query on nppes which only calls in necessary columns 
db = sqlite3.connect('data/hop_teaming.sqlite')

query_nppes = '''
SELECT 
npi, entity_type_code AS entity_type, provider_organization_name_(legal_business_name) AS org_name, 
provider_last_name_(legal_name) AS prov_lastname, provider_first_name AS prov_firstname, provider_middle_name AS prov_midname, 
provider_name_suffix_text AS prov_suffix, provider_credential_text AS prov_credential, provider_other_last_name AS prov_otherlast, 
provider_first_line_business_practice_location_address AS prov_location1, provider_second_line_business_practice_location_address 
AS prov_location2, provider_business_practice_location_address_city_name AS prov_city, provider_business_practice_location_address_state_name
AS prov_state, provider_business_practice_location_address_postal_code AS prov_postal, 
healthcare_provider_taxonomy_code_1 AS tax_code1,
healthcare_provider_taxonomy_code_2 AS tax_code2,
healthcare_provider_taxonomy_code_3 AS tax_code3, 
healthcare_provider_taxonomy_code_4 AS tax_code4,
healthcare_provider_taxonomy_code_7 AS tax_code7,
healthcare_provider_primary_taxonomy_switch_1 AS tax_switch1,
healthcare_provider_primary_taxonomy_switch_2 AS tax_switch2,
healthcare_provider_primary_taxonomy_switch_3 AS tax_switch3, 
healthcare_provider_primary_taxonomy_switch_4 AS tax_switch4,
healthcare_provider_primary_taxonomy_switch_7 AS tax_switch7 

FROM providers

WHERE tax_switch1 = 'Y' OR tax_switch2 = 'Y' OR tax_switch3 = 'Y' OR tax_switch4 = 'Y' OR tax_switch7 = 'Y'

LIMIT 5; 
'''

nppes_sqlite = pd.read_sql(query_nppes, db)

db.close()

#taking the sqlite db and the specific 'nppes provider' table query and converting it to a python df 


DatabaseError: Execution failed on sql '
SELECT 
npi, entity_type_code AS entity_type, provider_organization_name_(legal_business_name) AS org_name, 
provider_last_name_(legal_name) AS prov_lastname, provider_first_name AS prov_firstname, provider_middle_name AS prov_midname, 
provider_name_suffix_text AS prov_suffix, provider_credential_text AS prov_credential, provider_other_last_name AS prov_otherlast, 
provider_first_line_business_practice_location_address AS prov_location1, provider_second_line_business_practice_location_address 
AS prov_location2, provider_business_practice_location_address_city_name AS prov_city, provider_business_practice_location_address_state_name
AS prov_state, provider_business_practice_location_address_postal_code AS prov_postal, 
healthcare_provider_taxonomy_code_1 AS tax_code1,
healthcare_provider_taxonomy_code_2 AS tax_code2,
healthcare_provider_taxonomy_code_3 AS tax_code3, 
healthcare_provider_taxonomy_code_4 AS tax_code4,
healthcare_provider_taxonomy_code_7 AS tax_code7,
healthcare_provider_primary_taxonomy_switch_1 AS tax_switch1,
healthcare_provider_primary_taxonomy_switch_2 AS tax_switch2,
healthcare_provider_primary_taxonomy_switch_3 AS tax_switch3, 
healthcare_provider_primary_taxonomy_switch_4 AS tax_switch4,
healthcare_provider_primary_taxonomy_switch_7 AS tax_switch7 

FROM providers

WHERE tax_switch1 = 'Y' OR tax_switch2 = 'Y' OR tax_switch3 = 'Y' OR tax_switch4 = 'Y' OR tax_switch7 = 'Y'

LIMIT 5; 
': no such table: providers

If we want to match calls to incidents, we can use the complaint_number column from the calls database and the incident_number column from the incidents database.

To speed up this process, we can created indexes on these two columns.

In [21]:
# db = sqlite3.connect('data/hop_teaming.sqlite')

# db.execute('CREATE INDEX complaint_number ON calls(complaint_number)')
# db.execute('CREATE INDEX incident_number ON incidents(incident_number)')

# db.close()

#Q:  why create indexes again?  in order to join?  what happened to the other column that was indexed?

OperationalError: index complaint_number already exists

Now, let's grab all SHOTS FIRED calls for which there is an associated incident and bring in the incident information.

In [22]:
# query = """
# SELECT * FROM calls AS c 
# JOIN incidents AS i 
# ON c.complaint_number = i.incident_number 
# WHERE tencode_description = 'SHOTS FIRED'
# """
 


In [23]:
with sqlite3.connect('data/police_calls.sqlite') as db: 
    shots_sqlite = pd.read_sql(query, db)

#Q does this query name duplicate the earlier query name?  Yes, overrode the previous "query".  If orig query needs to be
#kept, then call this query diff name

In [24]:
shots_sqlite.shape

(1744, 50)

In [25]:
shots_sqlite.head()

Unnamed: 0,event_number,call_received,complaint_number,shift,tencode,tencode_description,tencode_suffix,tencode_suffix_description,disposition_code,disposition_description,...,weapon_description,victim_number,domestic_related,victim_type,victim_description,victim_gender,victim_race,victim_ethnicity,victim_county_resident,mapped_location
0,PD202000711384,09/23/2020 12:11:55 PM,20200610000.0,,83,SHOTS FIRED,R,REPORT,1,,...,CLUB,1,0,I,INDIVIDUAL (18 AND OVER),M,B,Non-Hispanic,RESIDENT,POINT (-86.67 36.04)
1,PD202000711384,09/23/2020 12:11:55 PM,20200610000.0,,83,SHOTS FIRED,R,REPORT,1,,...,CLUB,1,0,I,INDIVIDUAL (18 AND OVER),M,B,Non-Hispanic,RESIDENT,POINT (-86.67 36.04)
2,PD202000712332,09/23/2020 07:27:22 PM,20200610000.0,,83,SHOTS FIRED,P,PROGRESS,10,,...,NONE,1,0,U,UNKNOWN,U,U,Non-Hispanic,,
3,PD202000712332,09/23/2020 07:27:22 PM,20200610000.0,,83,SHOTS FIRED,P,PROGRESS,10,,...,NONE,1,0,U,UNKNOWN,U,U,Non-Hispanic,,
4,PD202000714837,09/24/2020 09:27:30 PM,20200610000.0,,83,SHOTS FIRED,R,REPORT,1,,...,NONE,1,0,I,INDIVIDUAL (18 AND OVER),M,W,Hispanic,RESIDENT,POINT (-86.653 36.054)
