# PPP Loan Data > 150k

In [3]:
import pandas as pd
import pickle
import geocoder as geo
import folium as f
import requests
import re

In [29]:
df_loan = pd.read_csv('data/ppp_150.csv')

cols = ['LoanRange', 'BusinessName', 'Address', 'City', 'State', 'Zip', 'NAICSCode', 'BusinessType', 'JobsRetained']

In [30]:
df_loan_strip = df_loan[cols]

In [28]:
df_loan_strip.size

5950962

In [8]:
df_loan_strip.to_json('data/ppp_loan_strip.json')

In [1]:
df_loan_strip.to_csv('data/ppp_loan_strip.csv')

NameError: name 'df_loan_strip' is not defined

In [5]:
df_loan_strip = pd.read_csv("data/ppp_loan_strip.csv", index_col=0)

In [7]:
df_loan_strip.shape

(661218, 9)

In [17]:
df_loan_strip.isnull().sum()

LoanRange           0
BusinessName        8
Address            17
City               15
State               0
Zip                16
NAICSCode        6783
BusinessType     1429
JobsRetained    40506
dtype: int64

In [4]:
df_loan_strip.sort_values(by='LoanRange')

NameError: name 'df_loan_strip' is not defined

### Remove null and invalid states

In [15]:
df_addr = df_loan[['LoanRange', 'Address', 'City', 'State', 'Zip']].dropna()

In [16]:
df_addr[df_addr['State'] == 'XX']

Unnamed: 0,LoanRange,Address,City,State,Zip
661203,"d $350,000-1 million",1850 BOYER AVE E,SEATTLE,XX,98112.0
661204,"d $350,000-1 million",1661 MARTIN RANCH RD,SAN BERNARDINO,XX,92407.0
661205,"d $350,000-1 million",7684 Southrail Road,NORTH CHARLESTON,XX,29420.0
661206,"d $350,000-1 million",2410 Highway 15 South,SUMTER,XX,29150.0
661209,"e $150,000-350,000",501 State Road 13,SAINT JOHNS,XX,32259.0


In [17]:
df_addr = df_addr[df_addr['State'] != 'XX']

In [18]:
len(df_addr)

661195

# Geocoder lib

## Get address for A-Class loans

In [5]:
a_addr = pd.read_csv('data/loan_strip_catA.csv', index_col=0)

In [13]:
len(a_addr)

4840

In [46]:
with requests.Session() as session:
    a_addr['latlng'] = a_addr.apply(lambda row: geo.osm(row['full_addr'], session=session).latlng, axis=1)

In [None]:
pickle.dump(a_addr, open("a_adrr.pkd", "wb"))

In [18]:
a_addr = pickle.load(open("data/a_latlng.pkd", "rb"))

In [28]:
a_addr_na = a_addr[a_addr['latlng'].isna()]

In [34]:
a_addr = a_addr.dropna()

In [35]:
a_addr.shape, a_addr_na.shape

((2870, 11), (1770, 11))

## Get latlng for B-class loans

In [6]:
b_addr = pd.read_csv('data/loan_strip_catB.csv', index_col=0)

In [7]:
len(b_addr)

24838

In [12]:
b_addr['full_addr'] = b_addr.apply(lambda row: (row['Address'].split(',')[0] + ', ' 
                            + row['City'] + ', ' + row['State']), axis=1)

In [23]:
with requests.Session() as session:
    b_addr['latlng'] = b_addr.apply(lambda row: geo.osm(row['full_addr'], session=session).latlng, axis=1)

In [25]:
pickle.dump(b_addr, open("data/b_addr.pkd", "wb"))

In [5]:
b_addr = pickle.load(open("data/b_addr.pkd", "rb"))

In [9]:
b_addr_na = b_addr[b_addr['latlng'].isna()]

In [12]:
b_addr = b_addr.dropna()

In [20]:
pickle.dump(b_addr, open("data/b_latlng.pkd", "wb"))

In [22]:
pickle.dump(b_addr_na, open("data/b_latlng_ba.pkd", "wb"))

### Useful function for checking actual size of an object

https://stackoverflow.com/questions/449560/how-do-i-determine-the-size-of-an-object-in-python

In [61]:
import sys
from types import ModuleType, FunctionType
from gc import get_referents

# Custom objects know their class.
# Function objects seem to know way too much, including modules.
# Exclude modules as well.
BLACKLIST = type, ModuleType, FunctionType


def getsize(obj):
    """sum size of object & members."""
    if isinstance(obj, BLACKLIST):
        raise TypeError('getsize() does not take argument of type: '+ str(type(obj)))
    seen_ids = set()
    size = 0
    objects = [obj]
    while objects:
        need_referents = []
        for obj in objects:
            if not isinstance(obj, BLACKLIST) and id(obj) not in seen_ids:
                seen_ids.add(id(obj))
                size += sys.getsizeof(obj)
                need_referents.append(obj)
        objects = get_referents(*need_referents)
    return size

# Map

In [45]:
# create map
m = f.Map(location=[44.58, -103.46], zoom_start=3)

# global tooltip
tooltip = 'Click for more info'

# generate map
m.save('map.html')

In [46]:
def add_aClass_loans(latlng):
    f.CircleMarker(
        location=latlng,
        radius=7.5,  # use for loan size? scale to size of loan? normalize these?
        popup="$5-10 Million",  # loan value
        color='red',  # color according to some histogram?
        fill=False,
        fill_color='red'
    ).add_to(m)

In [None]:
def add_loans_to_map():
    for i in a_addr.itertuples():
        if i.latlng != None:
            add_aClass_loans(i.latlng)

In [None]:
add_loans_to_map()

In [49]:
def add_bClass_loans(latlng):
    f.CircleMarker(
        location=latlng,
        radius=3.5,  # use for loan size? scale to size of loan? normalize these?
        popup="$2-5 Million",  # loan value
        color='blue',  # color according to some histogram?
        fill=False,
        fill_color='blue'
    ).add_to(m)

In [50]:
def add_loans_to_map():
    for i in b_addr.itertuples():
        if i.latlng != None:
            add_bClass_loans(i.latlng)

In [51]:
add_loans_to_map()

In [52]:
m.save('map.html')

# Separate by loan range

In [75]:
dollars = re.compile(r"(?P<amt>\d+)|(?P<mag>million|thousand)")

In [84]:
re.search(dollars, loan)

<re.Match object; span=(3, 4), match='5'>

In [94]:
lr = loan_range = re.findall(dollars, loan)
lr = str(1.*(int(lr[1][0]) + int(lr[0][0]))/2) + ' ' + lr[2][1]

In [104]:
a = re.search(lr, r"\f")

In [106]:
lr

'7.5 million'

In [60]:
lat, long = geo[0]['geometry']['location']['lat'], geo[0]['geometry']['location']['lng']
m = f.Map([lat, long], zoom_start=12)

In [109]:
f.CircleMarker(
    location=[lat, long],
    radius=7.5,  # use for loan size? scale to size of loan? normalize these?
    popup=str(lr),  # loan value
    color='blue',  # color according to some histogram?
    fill=True,
    fill_color='blue'
).add_to(m)

<folium.vector_layers.CircleMarker at 0x2da094e71c8>

In [110]:
m.save('ppp_test_map.html')