# Full ETL of Data to MongoDB Instance

In [1]:
import os
import etl

import pymongo as mongo
import pandas as pd

## Set File Paths

In [28]:
# Directory holding the raw FFIEC call-report extracts and the MDRM code file.
# Set the FDIC_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset or empty the original location is used.
# os.path.join(path, '') guarantees a trailing separator, so plain string
# concatenation of DATADIR with a file name keeps working.
DATADIR = os.path.join(
    os.environ.get('FDIC_DATA_DIR')
    or '/Users/admin/notebooks/fdic_data/financial_data_raw/',
    '',
)
# FDIC certificate numbers of the institutions to load.
FDIC_CODES = [3510, 3511, 7213, 628, 32188]

In [29]:
# os.listdir() returns entries in arbitrary order; sorting keeps the ETL
# processing order (and the printed log) reproducible between runs.
data_files = sorted(os.listdir(DATADIR))

In [30]:
# Path of the file handed to etl.MDRMInfoETL. Built with os.path.join, the
# same way the report file paths are built, so it does not depend on DATADIR
# ending with a path separator.
mdrm_file = os.path.join(DATADIR, 'mdrm_codes')

## Create DB Connections

In [6]:
# Connect using MongoClient's default connection settings and select the
# database that all collections in this notebook live in.
# NOTE: the database name is spelled 'ffeic_bank' (not 'ffiec'); it is kept
# as-is because it is the name existing data is stored under.
client = mongo.MongoClient()
db = client.get_database('ffeic_bank')

In [7]:
# Handles to the three target collections, in order: financial reports,
# company information, and MDRM item information.
fdic_collection, company_collection, mdrm_collection = (
    db[collection_name]
    for collection_name in ('ffeic_reports', 'ffeic_company_info', 'mdrm_info')
)

## ETL Jobs

In [31]:
# Run both ETL jobs over every raw report file in DATADIR.
# NOTE(review): full_etl presumably inserts documents into the collection it
# is given, so re-running this cell may load the same data twice - confirm
# against the etl module before re-executing.
for item in data_files:
    # Skip the Jupyter checkpoint directory and the MDRM file; the latter is
    # loaded separately through etl.MDRMInfoETL.
    if item in ('.ipynb_checkpoints', 'mdrm_codes'):
        continue

    filepath = os.path.join(DATADIR, item)
    print(filepath)

    # Financial report data for the selected FDIC certificate numbers.
    fdic_etl = etl.FDICDataETL(filepath, FDIC_CODES)
    fdic_etl.full_etl(fdic_collection)

    # Company information for the same institutions, read from the same file.
    company_etl = etl.FDICCompanyInfoETL(filepath, FDIC_CODES)
    company_etl.full_etl(company_collection)

/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2013(1 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2013(2 of 3).txt


  
  # Remove the CWD from sys.path while we load stuff.


/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2013(3 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2014(1 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2014(2 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2014(3 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2015(1 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2015(2 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2015(3 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2016(1 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Call Subset of Schedules 2016(2 of 3).txt
/Users/admin/notebooks/fdic_data/financial_data_raw/FFIEC CDR Ca

In [32]:
# Load the MDRM file into its own collection.
# NOTE(review): the saved output of this cell is an AssertionError, while the
# review cells further down report documents in mdrm_collection - presumably
# from an earlier successful run. Confirm this cell passes on a fresh kernel.
mdrm_etl = etl.MDRMInfoETL(mdrm_file)
mdrm_etl.full_etl(mdrm_collection)

AssertionError: 

## Review

#### Collection Counts and Example Observations

In [33]:
# Number of documents in the MDRM collection. Collection.count() is
# deprecated (removed in PyMongo 4); count_documents({}) is the replacement.
mdrm_collection.count_documents({})

777

In [34]:
# Print only a handful of example documents; iterating the whole collection
# floods the notebook with hundreds of lines of output.
for obs in mdrm_collection.find().limit(5):
    print(obs)

{'_id': ObjectId('5946004ddbc5544250fb7f15'), 'mdrm_item': 'RIAD0093', 'description': 'SAVINGS DEPOSITS (INCLUDING MMDAS)', 'details': {'start_date': '2001-03-31', 'end_date': '9999-12-31', 'description': 'SAVINGS DEPOSITS (INCLUDING MMDAS)', 'confidential': 'No', 'reporting_forms': 'Multiple Forms'}}
{'_id': ObjectId('5946004ddbc5544250fb7f16'), 'mdrm_item': 'RIAD0497', 'description': 'MARKETING AND OTHER PROFESSIONAL SERVICES', 'details': {'start_date': '2001-03-31', 'end_date': '9999-12-31', 'description': 'MARKETING AND OTHER PROFESSIONAL SERVICES', 'confidential': 'No', 'reporting_forms': 'Multiple Forms'}}
{'_id': ObjectId('5946004ddbc5544250fb7f17'), 'mdrm_item': 'RIAD1244', 'description': 'TAXABLE EQUIVALENT ADJUSTMENT TO "INCOME (LOSS) BEFORE INCOME TAXES AND EXTRAORDINARY ITEMS AND OTHER ADJUSTMENTS"', 'details': {'start_date': '1987-03-31', 'end_date': '1995-12-31', 'description': 'TAXABLE EQUIVALENT ADJUSTMENT TO "INCOME (LOSS) BEFORE INCOME TAXES AND EXTRAORDINARY ITEMS AN

In [35]:
# Number of documents in the company-information collection.
# Collection.count() is deprecated (removed in PyMongo 4); count_documents({})
# is the replacement.
company_collection.count_documents({})

5

In [36]:
# Print every stored company record, one document per line.
for company_doc in company_collection.find():
    print(company_doc)

{'_id': ObjectId('59460039dbc5544250fb7edd'), 'record_updated': '2013-03-31', 'financial_institution_name': 'WELLS FARGO BANK, NATIONAL ASSOCIATION', 'fdic_certificate_number': 3511, 'location': {'financial_institution_address': '101 NORTH PHILLIPS AVENUE ', 'financial_institution_city': 'SIOUX FALLS', 'financial_institution_state': 'SD', 'financial_institution_zip_code': 57104}, 'identifiers': {'idrssd': 451965, 'occ_charter_number': 1, 'ots_docket_number': 0, 'primary_aba_routing_number': 121000248, 'financial_institution_filing_type': 31}}
{'_id': ObjectId('59460039dbc5544250fb7ede'), 'record_updated': '2013-03-31', 'financial_institution_name': 'CITIBANK, N.A.', 'fdic_certificate_number': 7213, 'location': {'financial_institution_address': '701 EAST 60TH STREET NORTH ', 'financial_institution_city': 'SIOUX FALLS', 'financial_institution_state': 'SD', 'financial_institution_zip_code': 57104}, 'identifiers': {'idrssd': 476810, 'occ_charter_number': 1461, 'ots_docket_number': 0, 'prim

In [37]:
# Number of documents in the financial-reports collection.
# Collection.count() is deprecated (removed in PyMongo 4); count_documents({})
# is the replacement.
fdic_collection.count_documents({})

85

In [38]:
# Display a single financial-report document as an example of the stored shape.
fdic_collection.find_one()

{'_id': ObjectId('59460038dbc5544250fb7ecd'),
 'fdic_certificate_number': 3511,
 'financial_institution_name': 'WELLS FARGO BANK, NATIONAL ASSOCIATION',
 'financials': {'RCFD0071': 187583000.0,
  'RCFD0081': 19641000.0,
  'RCFD0426': 22464000.0,
  'RCFD1248': 0.0,
  'RCFD1249': 78000.0,
  'RCFD1250': 2000.0,
  'RCFD1251': 220000.0,
  'RCFD1252': 7000.0,
  'RCFD1253': 560000.0,
  'RCFD1254': 1000.0,
  'RCFD1255': 0.0,
  'RCFD1256': 20000.0,
  'RCFD1583': 102000.0,
  'RCFD1594': 16000.0,
  'RCFD1597': 0.0,
  'RCFD1754': 12346000.0,
  'RCFD1773': 215837000.0,
  'RCFD2130': 662000.0,
  'RCFD2145': 7582000.0,
  'RCFD2150': 3838000.0,
  'RCFD2160': 54238000.0,
  'RCFD2170': 1373600000.0,
  'RCFD2930': 27989000.0,
  'RCFD2948': 1235756000.0,
  'RCFD3000': 207000.0,
  'RCFD3123': 12421000.0,
  'RCFD3163': 21549000.0,
  'RCFD3190': 55913000.0,
  'RCFD3200': 19642000.0,
  'RCFD3210': 137637000.0,
  'RCFD3230': 519000.0,
  'RCFD3300': 1373600000.0,
  'RCFD3505': 170000.0,
  'RCFD3506': 0.0,
  'RC

#### Review Distinct Fields

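Notice that $5 \times 17 = 85$: the number of companies (5) multiplied by the number of distinct reporting dates (17) equals the number of financial report observations (85), so the three counts are consistent with one report per company per reporting date.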

In [39]:
# Distinct values of the 'reporting_period_end_date' field across all report documents.
distinct_dates = fdic_collection.distinct('reporting_period_end_date')

In [40]:
# Number of distinct reporting period end dates in the reports collection.
len(distinct_dates)

17

In [41]:
# Distinct FDIC certificate numbers present in the reports collection.
distinct_ids = fdic_collection.distinct('fdic_certificate_number')

In [42]:
# Display the certificate numbers for comparison with FDIC_CODES.
distinct_ids

[3511, 7213, 3510, 628, 32188]

In [43]:
# Distinct institution names present in the reports collection.
distinct_names = fdic_collection.distinct('financial_institution_name')

In [44]:
# Display the institution names that were loaded.
distinct_names

['WELLS FARGO BANK, NATIONAL ASSOCIATION',
 'CITIBANK, N.A.',
 'BANK OF AMERICA, NATIONAL ASSOCIATION',
 'JPMORGAN CHASE BANK, NATIONAL ASSOCIATION',
 'USAA FEDERAL SAVINGS BANK']