# Chapter 6 - Case Study \#2: Diagnosing Errors

In [1]:
import pandas as pd

import numpy as np
from dateutil.parser import ParserError
import pandas.api.types as ptypes

Please perform calculations on this dataframe about US states (`states`) for the exercises in this chapter.

In [2]:
# TODO: this CSV is temporarily hosted in a personal GitHub repository --
# please switch to a Metis-owned S3 bucket and update the URL below.
STATES_CSV_URL = 'https://github.com/kimfetti/Projects/blob/master/Etc/scraped_wiki_states_edit.csv?raw=True'

# Raw, uncleaned data about US states used by every exercise in this chapter.
states = pd.read_csv(STATES_CSV_URL)

In [3]:
# Preview the first rows of the raw data to see which columns were loaded.
states.head()

Unnamed: 0,state,date_admitted,population,area_sq_mi,median_household_income
0,Alabama,"December 14, 1819",4903185,52419,"$48,123"
1,Alaska,"January 3, 1959",710249,663268,"$73,181"
2,Arizona,"February 14, 1912",7278717,113990,"$56,581"
3,Arkansas,"June 15, 1836",3017804,53179,"$45,869"
4,California,"September 9, 1850",39512223,163696,"$71,228"


## Exercise 1 - Messy Rows

The information in `states` is organized as one row per US state.  While there are only 50 US states, this dataframe currently has 55 rows.  What row issues can you find in `states`?  Check for
- Missing states
- Inconsistent states
- Duplicate states

Once you have filtered out the messy rows, save your results as a new dataframe `states_48`.  

_Hint_: You will eventually find information for only 48 states.  

In [4]:
# Row and column counts of the raw data, before any cleaning.
states.shape

(55, 5)

In [11]:
# Exercise 1: build the cleaned frame from the raw `states` data.
# The raw frame is deliberately left untouched (no `inplace=True`), so this
# cell produces the same result no matter how many times it is re-run and
# earlier displays of `states` stay accurate.
states_48 = (
    states
    .drop_duplicates()  # remove rows that exactly repeat an earlier row
    .dropna()           # remove rows with a missing value in any column
    .copy()             # independent frame, so columns can be added to it later
)

states_48.head()

Unnamed: 0,state,date_admitted,population,area_sq_mi,median_household_income
0,Alabama,"December 14, 1819",4903185,52419,"$48,123"
1,Alaska,"January 3, 1959",710249,663268,"$73,181"
2,Arizona,"February 14, 1912",7278717,113990,"$56,581"
3,Arkansas,"June 15, 1836",3017804,53179,"$45,869"
4,California,"September 9, 1850",39512223,163696,"$71,228"


In [12]:
# Row and column counts after cleaning; the exercise expects 48 rows.
states_48.shape

(48, 5)

In [13]:
# Validation checks for Exercise 1: `states_48` must be a dataframe with no
# missing state names and exactly one row for each of 48 distinct states.
assert isinstance(states_48, pd.DataFrame), "Be sure states_48 is a pandas dataframe."
assert states_48.state.isna().sum() == 0, "Drop rows that are missing state information from states_48."
assert states_48.shape == (48, 5), "states_48 should only have 48 rows and should still have 5 columns."
assert states_48.state.nunique() == 48, "You should find information for 48 unique states."

## Exercise 2 - Two-word States

How many two-word states are there in your cleaned up dataframe of 48 states?  Create two new variables for this exercise:
- `ser_two_word`: a pandas series that contains either
  1. the state's name for one-word states or
  2. the string "TWO WORDS" for all two-word states
- `count_two_word`: an integer of the number of two-word states in this dataset

Make a custom function to map onto the "state" column of `states_48` to create `ser_two_word`.  You may also want to print out the name of each two-word state you find to verify your code is working.  Once `ser_two_word` is created, you may find it easier to then make `count_two_word` based off of it.

In [None]:
def label_two_word(state_name):
    """Return "TWO WORDS" for a two-word state name, otherwise the name itself.

    Parameters
    ----------
    state_name : str
        A state name; words are counted by splitting on whitespace.

    Returns
    -------
    str
        The literal string "TWO WORDS" when the name has exactly two words,
        else `state_name` unchanged.
    """
    if len(str(state_name).split()) == 2:
        return "TWO WORDS"
    return state_name


# Map over the cleaned frame (not the raw `states`) so there is exactly one
# value per row of `states_48`, in the same order.
ser_two_word = states_48.state.map(label_two_word)

# Count the replaced entries; int() turns the NumPy sum into a plain integer.
count_two_word = int((ser_two_word == "TWO WORDS").sum())

In [None]:
# Inspect the last five entries of the series.
ser_two_word[-5:]

In [None]:
# Display the number of two-word states that were found.
count_two_word

In [None]:
# Validation checks for Exercise 2.
assert isinstance(ser_two_word, pd.Series), "Be sure ser_two_word is a pandas series."
# np.integer covers every NumPy integer width, not only int32 and int64.
assert isinstance(count_two_word, (int, np.integer)), "Be sure count_two_word is an integer."
assert len(ser_two_word) == 48, "ser_two_word should have one value for every state in states_48."
assert 5 < count_two_word < 15, "You should find at least 5 two-word states but fewer than 15."
# value_counts() sorts by frequency, so index[0] is the most common value.
assert ser_two_word.value_counts().index[0] == "TWO WORDS", "You should replace each two-word state with the string 'TWO WORDS' in ser_two_word."
# First and last entries are one-word states and must keep their names and positions.
assert ser_two_word.iloc[0] == "Alabama" and ser_two_word.iloc[-1] == 'Wyoming', 'One-word states should retain their names in ser_two_word, and all states should remain in the same order.'

## Exercise 3 - Messy Dates

Create a new column in `states_48` called "date_time_admitted" to hold datetime values for each state's admission date.  

You will encounter a data error if you try to directly convert the "date_admitted" column.  Diagnose this error and use a strategy to correctly convert all admission dates.

In [None]:
# YOUR CODE HERE
# TODO: not yet implemented. Replace the `raise` below with code that adds a
# "date_time_admitted" datetime column to `states_48`. Until then this cell
# stops with NotImplementedError and the preview underneath is never reached.
raise NotImplementedError()

states_48.head()

In [None]:
# List each column's dtype; "date_time_admitted" is expected to be a datetime type.
states_48.dtypes

In [None]:
# Validation checks for Exercise 3: the new column must exist, hold datetime
# values, and contain no missing entries.
assert 'date_time_admitted' in states_48.columns, "Please add a column called 'date_time_admitted' to the states_48 dataframe."

admitted = states_48['date_time_admitted']
assert ptypes.is_datetime64_any_dtype(admitted), "Be sure the date_time_admitted column contains the datetime data type."
assert not admitted.isna().any(), "Please convert ALL admission dates -- even those with inconsistencies!"