# Solution: More! Pandas Exercise

## To get started we'll need __`numpy`__ and __`pandas`__

In [1]:
import numpy as np
import pandas as pd

## Read the Consumer Complaints data file named __`data/Consumer_Complaints.csv`__ into a DataFrame

In [2]:
data = pd.read_csv('data/Consumer_Complaints.csv')

## Determine the shape of the DataFrame

In [3]:
data.shape

(957642, 18)

## Investigate the first few rows of the DataFrame and find out what columns are present

In [4]:
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,03/17/2014,Closed with explanation,Yes,No,759217
1,10/01/2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10/05/2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,06/08/2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,06/10/2014,Closed with explanation,Yes,Yes,885638
4,09/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,09/13/2014,Closed with explanation,Yes,Yes,1027760


## How many consumer complaints are there per state?

In [5]:
data['State'].value_counts()

CA    135002
FL     91133
TX     76955
NY     64340
GA     47759
NJ     36870
IL     35988
PA     33399
VA     29547
MD     29070
OH     29029
NC     28458
MI     23307
AZ     21077
WA     19482
MA     17921
CO     16159
TN     15388
SC     13350
MO     12863
NV     11734
CT     11089
IN     10977
LA     10771
OR     10660
MN     10601
AL     10541
WI     10175
KY      6951
OK      6237
       ...  
MS      4957
DE      4841
KS      4739
NM      4512
AR      4347
NH      4035
IA      3992
ID      3076
HI      3038
ME      2983
RI      2903
NE      2901
PR      2453
WV      2424
MT      1529
VT      1433
SD      1268
AK      1056
ND       951
WY       869
AE       388
AP       271
VI       200
GU       152
FM        48
MP        31
MH        30
AS        25
AA        19
PW        13
Name: State, Length: 62, dtype: int64

## What ZIP code has the highest number of complaints?

In [6]:
data['ZIP code'].value_counts()[:1]

300XX    4887
Name: ZIP code, dtype: int64

## What are the top five products for complaints?

In [7]:
data['Product'].value_counts()[:5]

Mortgage                   247626
Debt collection            182186
Credit reporting           140433
Credit card                 89191
Bank account or service     86206
Name: Product, dtype: int64

## What are the top five states for consumer complaints?

In [8]:
data['State'].value_counts()[:5]

CA    135002
FL     91133
TX     76955
NY     64340
GA     47759
Name: State, dtype: int64

## What is potentially misleading about these results? 
* We can solve this problem by normalizing the results against population data.
* Our complaints database doesn't have this information though, so, let's read the information from the file __`data/states.csv`__ into a DataFrame

In [9]:
states = pd.read_csv('data/states.csv')
states

Unnamed: 0,State,Abbrev,Count,Population
0,Alabama,AL,129,4874747.0
1,Alaska,AK,35,739795.0
2,American Samoa,AS,1,51504.0
3,Arizona,AZ,155,7016270.0
4,Arkansas,AR,108,3004279.0
5,California,CA,1246,39536653.0
6,Colorado,CO,171,5607154.0
7,Connecticut,CT,114,3588184.0
8,Delaware,DE,23,961939.0
9,District of Columbia,DC,33,693972.0


## Investigate the first few rows of the DataFrame

In [10]:
states.head()

Unnamed: 0,State,Abbrev,Count,Population
0,Alabama,AL,129,4874747.0
1,Alaska,AK,35,739795.0
2,American Samoa,AS,1,51504.0
3,Arizona,AZ,155,7016270.0
4,Arkansas,AR,108,3004279.0


## The __`Count`__ column is irrelevant, so...
* Create a new DataFrame with only the 'Abbrev' and 'Population' columns

In [11]:
states = states[['Abbrev', 'Population']]
states

Unnamed: 0,Abbrev,Population
0,AL,4874747.0
1,AK,739795.0
2,AS,51504.0
3,AZ,7016270.0
4,AR,3004279.0
5,CA,39536653.0
6,CO,5607154.0
7,CT,3588184.0
8,DE,961939.0
9,DC,693972.0


## We're going to want to merge the DataFrames on the two-letter abbreviation
* This is called __`State`__ in the first data set so make sure the second dataset has the correct column names

In [12]:
states.columns = ['State', 'Population']
states

Unnamed: 0,State,Population
0,AL,4874747.0
1,AK,739795.0
2,AS,51504.0
3,AZ,7016270.0
4,AR,3004279.0
5,CA,39536653.0
6,CO,5607154.0
7,CT,3588184.0
8,DE,961939.0
9,DC,693972.0


In [13]:
data = pd.merge(data, states, on='State')
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,Population
0,03/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,03/17/2014,Closed with explanation,Yes,No,759217,9962311.0
1,07/25/2014,Mortgage,Conventional adjustable mortgage (ARM),"Loan modification,collection,foreclosure",,,,"BANK OF AMERICA, NATIONAL ASSOCIATION",MI,49441,,,Web,07/25/2014,Closed with explanation,Yes,No,953726,9962311.0
2,10/10/2014,Mortgage,Conventional adjustable mortgage (ARM),"Loan modification,collection,foreclosure",,,,JPMORGAN CHASE & CO.,MI,48154,,,Web,10/10/2014,Closed with explanation,Yes,Yes,1067722,9962311.0
3,04/05/2017,Other financial service,Credit repair,Fraud or scam,,i was tolled i would have a credit score of XX...,Company disputes the facts presented in the co...,Park View Credit,MI,488XX,Servicemember,Consent provided,Web,04/05/2017,Closed with explanation,Yes,No,2417763,9962311.0
4,02/10/2015,Money transfers,Domestic (US) money transfer,Fraud or scam,,,,MONEYGRAM PAYMENT SYSTEMS WORLDWIDE INC,MI,49022,Older American,,Web,02/10/2015,Closed with monetary relief,Yes,No,1232991,9962311.0


## Generate a new DataFrame that contains the number of  complaints per state and keeps track of those counts

In [14]:
by_state = pd.DataFrame(data['State'].value_counts().reset_index())
by_state.columns = 'State Count'.split()

In [15]:
by_state

Unnamed: 0,State,Count
0,CA,135002
1,FL,91133
2,TX,76955
3,NY,64340
4,GA,47759
5,NJ,36870
6,IL,35988
7,PA,33399
8,VA,29547
9,MD,29070


In [16]:
# We need to group by State, which will produce and then count the 
# number of complaints per state, which we can do with the size()
# method. We can use the reset_index() method to give a reasonable
# name to the column that was produced.
complaints_by_state = data.groupby(['State']).size().reset_index()
complaints_by_state

Unnamed: 0,State,0
0,AK,1056
1,AL,10541
2,AR,4347
3,AS,25
4,AZ,21077
5,CA,135002
6,CO,16159
7,CT,11089
8,DC,5415
9,DE,4841


## Merge the __`complaints_by_state`__ DataFrame and the states DataFrame on the __`State`__ column
* Since there are U.S. territories in one of the datasets and not the other, we have some NaN results–get rid of those

In [17]:
merged = pd.merge(by_state, states, on='State').dropna()

## Inspect the first few rows to ensure it worked properly

In [18]:
merged.head()

Unnamed: 0,State,Count,Population
0,CA,135002,39536653.0
1,FL,91133,20984400.0
2,TX,76955,28304596.0
3,NY,64340,19849399.0
4,GA,47759,10429379.0


## Normalize the Population
1. Divide the counts column by the __`Population`__ 
* Store the result in a new columnn called 'population_normalized'

In [19]:
merged['population_normalized'] = 1000 * merged['Count'] / merged['Population']

## Check out the first few rows of the data to make sure it looks like you expect

In [20]:
merged.head()

Unnamed: 0,State,Count,Population,population_normalized
0,CA,135002,39536653.0,3.414604
1,FL,91133,20984400.0,4.342893
2,TX,76955,28304596.0,2.718816
3,NY,64340,19849399.0,3.241408
4,GA,47759,10429379.0,4.579276


## What are the top 10 states for complaints based upon raw counts?

In [21]:
merged.sort_values('Count', ascending=False)[:10]

Unnamed: 0,State,Count,Population,population_normalized
0,CA,135002,39536653.0,3.414604
1,FL,91133,20984400.0,4.342893
2,TX,76955,28304596.0,2.718816
3,NY,64340,19849399.0,3.241408
4,GA,47759,10429379.0,4.579276
5,NJ,36870,9005644.0,4.094099
6,IL,35988,12802023.0,2.811118
7,PA,33399,12805537.0,2.608169
8,VA,29547,8470020.0,3.488422
9,MD,29070,6052177.0,4.80323


## What are the top 10 states for complaints normalized by population?

In [22]:
merged.sort_values('population_normalized', ascending=False)[:10]

Unnamed: 0,State,Count,Population,population_normalized
30,DC,5415,693972.0,7.802908
33,DE,4841,961939.0,5.032544
9,MD,29070,6052177.0,4.80323
4,GA,47759,10429379.0,4.579276
1,FL,91133,20984400.0,4.342893
5,NJ,36870,9005644.0,4.094099
20,NV,11734,2998039.0,3.913892
8,VA,29547,8470020.0,3.488422
0,CA,135002,39536653.0,3.414604
3,NY,64340,19849399.0,3.241408


In [23]:
merged

Unnamed: 0,State,Count,Population,population_normalized
0,CA,135002,39536653.0,3.414604
1,FL,91133,20984400.0,4.342893
2,TX,76955,28304596.0,2.718816
3,NY,64340,19849399.0,3.241408
4,GA,47759,10429379.0,4.579276
5,NJ,36870,9005644.0,4.094099
6,IL,35988,12802023.0,2.811118
7,PA,33399,12805537.0,2.608169
8,VA,29547,8470020.0,3.488422
9,MD,29070,6052177.0,4.80323


In [24]:
%store merged

Stored 'merged' (DataFrame)
