# Exercise: Pandas

## To get started we'll need __`numpy`__ and __`pandas`__

In [1]:
import numpy as np
import pandas as pd

## Read the Consumer Complaints data file named __`data/Consumer_Complaints.csv`__ into a DataFrame

In [2]:
# ZIP codes are identifiers, not numbers: the file mixes plain codes
# (e.g. 48382) with redacted ones (e.g. 352XX). Reading the column as text
# keeps every value a string, so per-ZIP counts are not split between a
# numeric and a string form of the same code and leading zeros survive.
data = pd.read_csv('data/Consumer_Complaints.csv', dtype={'ZIP code': str})

## Determine the shape of the DataFrame

In [3]:
# .shape is a (number of rows, number of columns) tuple
data.shape

(957642, 18)

## Investigate the first few rows of the DataFrame and find out what columns are present

In [4]:
# Preview the first five rows; the header shows every column name
data.head(n=5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,03/17/2014,Closed with explanation,Yes,No,759217
1,10/01/2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10/05/2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,06/08/2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,06/10/2014,Closed with explanation,Yes,Yes,885638
4,09/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,09/13/2014,Closed with explanation,Yes,Yes,1027760


## How many consumer complaints are there per state?

In [None]:
# value_counts() tallies each distinct State, most frequent first;
# rows with a missing State are not counted
data['State'].value_counts()

## What ZIP code has the highest number of complaints?

In [None]:
# value_counts() sorts by frequency (most common first), so the first
# entry is the ZIP code with the most complaints
data['ZIP code'].value_counts().head(1)

## What are the top five products for complaints?

In [None]:
# The five most frequently named products, most common first
data['Product'].value_counts().head(5)

## What are the top five states for consumer complaints?

In [None]:
# The five states with the most complaints (raw counts, not per capita)
data['State'].value_counts().head(5)

## What is potentially misleading about these results? 
* We can solve this problem by normalizing the results against population data.
* Our complaints database doesn't have this information though, so, let's read the information from the file __`data/states.csv`__ into a DataFrame

In [None]:
# Load the state population data and display the whole table
states = pd.read_csv('data/states.csv')
states

## Investigate the first few rows of the DataFrame

In [None]:
# Preview the first five rows of the states table
states.head(n=5)

## The __`Count`__ column is irrelevant, so...
* Create a new DataFrame with only the 'Abbrev' and 'Population' columns

In [None]:
# Keep only the two-letter abbreviation and the population of each state
states = states.loc[:, ['Abbrev', 'Population']]
states

## We're going to want to merge the DataFrames on the two-letter abbreviation
* This column is called __`State`__ in the first dataset, so make sure the second dataset uses the same column name

In [None]:
# Rename by column name rather than by position, so the labels stay correct
# regardless of how many columns `states` has or what order they are in
states = states.rename(columns={'Abbrev': 'State'})
states

In [None]:
# Attach each state's population to every complaint. how='left' keeps
# complaints whose State has no match in `states` (for example a territory
# or a missing State) instead of silently discarding them; those rows
# simply get NaN for Population.
data = pd.merge(data, states, how='left', on='State')
data.head()

## Generate a new DataFrame that contains the number of complaints per state and keeps track of those counts

In [None]:
# reset_index() turns the value_counts() Series into a two-column DataFrame:
# the state abbreviations and the number of complaints for each
by_state = data['State'].value_counts().reset_index()
by_state.columns = ['State', 'Count']

In [None]:
# Display the per-state complaint counts
by_state

In [None]:
# Group the complaints by State and use size() to count the rows in each
# group. size() returns a Series indexed by State, so
# reset_index(name='Count') turns it back into a DataFrame and gives the
# counts column a readable name.
complaints_by_state = (
    data.groupby('State')
    .size()
    .reset_index(name='Count')
)
complaints_by_state

## Merge the __`complaints_by_state`__ DataFrame and the states DataFrame on the __`State`__ column
* Since there are U.S. territories in one of the datasets and not the other, we have some NaN results—get rid of those

In [None]:
# The default inner join keeps only states present in both frames;
# dropna() then removes any remaining row that has a missing value
merged = complaints_by_state.merge(states, on='State').dropna()

## Inspect the first few rows to ensure it worked properly

In [None]:
# Expect one row per state with State, Count and Population columns
merged.head(n=5)

## Normalize the complaint counts by population
1. Divide the __`Count`__ column by the __`Population`__ column
2. Store the result in a new column called __`population_normalized`__

In [None]:
# Complaints per resident: element-wise Count / Population for each state
merged['population_normalized'] = merged['Count'].div(merged['Population'])

## Check out the first few rows of the data to make sure it looks like you expect

In [None]:
# The new population_normalized column should now appear on the right
merged.head(n=5)

## What are the top 10 states for complaints based upon raw counts?

In [None]:
# Largest raw complaint counts first, then keep the first ten rows
merged.sort_values(by='Count', ascending=False).head(10)

## What are the top 10 states for complaints normalized by population?

In [None]:
# Highest complaints-per-resident first, then keep the first ten rows
merged.sort_values(by='population_normalized', ascending=False).head(10)

In [None]:
# IPython %store magic: persists `merged` so it can be restored later
# (for example in another notebook) with `%store -r merged`
%store merged