In [1]:
import datetime as dt
import numpy as np 
import pandas as pd
import argparse
from pathlib import Path
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

In [2]:
# Location of the complaints CSV; this is the path handed to sc.textFile
# when the raw lines are loaded.
input_file = './input/complaints_sample.csv'
# Directory name for results. NOTE(review): not referenced by any of the
# cells visible here -- confirm it is used further down or remove it.
output_path = 'output/'

In [3]:
# Reuse the already-running SparkContext when this cell is executed again.
# A plain SparkContext() raises "ValueError: Cannot run multiple
# SparkContexts at once" on a second run in the same kernel, which forces a
# kernel restart just to re-execute the setup cell.
sc = SparkContext.getOrCreate()
# SQL/DataFrame entry point bound to the same context.
spark = SparkSession(sc)

In [29]:
# Read the CSV as an RDD of raw text lines and cache it, because more than
# one later cell starts a computation from this same RDD.
complaint_info = sc.textFile(input_file, use_unicode=True).cache()

# Show every header column next to its positional index so the indices used
# when parsing rows can be checked by eye.
print([
    (position, column)
    for position, column in enumerate(complaint_info.first().split(','))
])

# Cell result: how many partitions the text file was split into.
complaint_info.getNumPartitions()

[(0, 'Date received'), (1, 'Product'), (2, 'Sub-product'), (3, 'Issue'), (4, 'Sub-issue'), (5, 'Consumer complaint narrative'), (6, 'Company public response'), (7, 'Company'), (8, 'State'), (9, 'ZIP code'), (10, 'Tags'), (11, 'Consumer consent provided?'), (12, 'Submitted via'), (13, 'Date sent to company'), (14, 'Company response to consumer'), (15, 'Timely response?'), (16, 'Consumer disputed?'), (17, 'Complaint ID')]


2

In [30]:
def extract_complaints(partId, list_of_records):
    """Parse one partition of CSV lines into ``((product, year, company), 1)`` pairs.

    Written to be passed to ``RDD.mapPartitionsWithIndex``.

    Args:
        partId: Index of the partition being processed. Partition 0 is
            treated as the one that starts with the CSV header line, and
            that first line is discarded.
        list_of_records: Iterator over the raw text lines of the partition.

    Yields:
        ``((product, year_received, company), 1)`` where ``product`` and
        ``company`` are lower-cased, ``year_received`` is the integer year
        parsed from column 0 (``%m/%d/%Y``), and ``product`` is wrapped in
        double quotes when it contains a comma.

    Raises:
        ValueError: If column 0 of a row is not a ``%m/%d/%Y`` date.
        IndexError: If a row has fewer than eight columns.
    """
    # Imported inside the function, as in the original cell.
    import csv

    if partId == 0:
        # Drop the header line. The ``None`` default matters: on an empty
        # first partition a bare next() raises StopIteration, which inside a
        # generator is converted to RuntimeError (PEP 479) and fails the task.
        next(list_of_records, None)

    for row in csv.reader(list_of_records):
        year_received = dt.datetime.strptime(row[0], '%m/%d/%Y').year
        product = row[1].lower()
        company = row[7].lower()
        if ',' in product:
            # Re-quote so the product stays a single field if the key is
            # later joined with commas.
            product = '"' + product + '"'

        yield ((product, year_received, company), 1)

# Turn the raw text lines into ((product, year, company), 1) pairs using the
# per-partition parser defined above in this cell.
complaints = complaint_info.mapPartitionsWithIndex(extract_complaints)

# Sanity check: print the first ten parsed pairs, one per line.
for entry in complaints.take(10):
    print(entry)

(('"credit reporting, credit repair services, or other personal consumer reports"', 2020, 'transunion intermediate holdings, inc.'), 1)
(('debt collection', 2019, 'transworld systems inc'), 1)
(('"credit reporting, credit repair services, or other personal consumer reports"', 2019, 'experian information solutions inc.'), 1)
(('"credit reporting, credit repair services, or other personal consumer reports"', 2019, 'transunion intermediate holdings, inc.'), 1)
(('debt collection', 2019, 'diversified consultants, inc.'), 1)
(('vehicle loan or lease', 2019, 'hyundai capital america'), 1)
(('"credit reporting, credit repair services, or other personal consumer reports"', 2020, 'experian information solutions inc.'), 1)
(('"credit reporting, credit repair services, or other personal consumer reports"', 2020, 'experian information solutions inc.'), 1)
(('"credit reporting, credit repair services, or other personal consumer reports"', 2019, 'transunion intermediate holdings, inc.'), 1)
(('credi

In [76]:
# Record shape after each stage of the pipeline:
#   1. ((product, year, company), 1)                          from extract_complaints
#   2. ((product, year, company), total_company_complaints)   after the first reduceByKey
#   3. ((product, year), [(company, total_company_complaints)])   after the map
#   4. ((product, year), [(company, total_company_complaints), ...])   after the second reduceByKey
#
# Target that is not computed yet:
#   ((product, year), (total_complaints, total_companies, comp.with.highest.complaints.%))
temp = (
    complaint_info.mapPartitionsWithIndex(extract_complaints)
    # Count complaints per (product, year, company).
    .reduceByKey(lambda left, right: left + right)
    # Re-key by (product, year); wrap the company count in a one-element list.
    .map(lambda pair: (pair[0][:2], [(pair[0][2], pair[1])]))
    # Concatenate the per-company lists belonging to the same (product, year).
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

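# Earlier drafts, kept for reference only (never executed):
#     .map(lambda x: ( (x[0][0],x[0][1]), (x[0][2], x[1], 1) ) ) \
#     .reduceByKey(lambda a, b: (a[0]+b[0], a[1]+b[1])) \
#
# Python 2 form -- tuple-parameter lambdas are a syntax error in Python 3:
# .map(lambda (x,y): (x, [y])).reduceByKey(lambda p,q: p+q).collect()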

In [77]:
temp

[(('"credit reporting, credit repair services, or other personal consumer reports"',
   2020),
  ['transunion intermediate holdings, inc.',
   73,
   'experian information solutions inc.',
   25,
   'nissan motor acceptance corporation',
   1,
   'discover bank',
   1,
   'franklin collection service, inc.',
   1,
   'equifax, inc.',
   39,
   "conn's, inc.",
   1,
   'ocwen financial corporation',
   1,
   'jpmorgan chase & co.',
   1,
   'lexisnexis',
   1]),
 (('"money transfer, virtual currency, or money service"', 2019),
  ['paypal holdings, inc',
   29,
   'bank of america, national association',
   7,
   'united services automobile association',
   2,
   'ubs bank usa',
   1,
   'remitly, inc.',
   1,
   'worldremit corp.',
   1,
   'premier consumer credit counseling, inc.',
   1,
   'synovus bank',
   1,
   'wells fargo & company',
   3,
   'synchrony financial',
   5,
   'western union company, the',
   6,
   'fidelity national financial, inc',
   1,
   'goldman sachs bank us