###  美国各州人口数据分析

首先导入文件，并查看数据样本

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### 加载数据
- state-abbrevs.csv
- state-areas.csv
- state-population.csv

In [2]:
abb = pd.read_csv('./data/12_美国人口数据分析项目/state-abbrevs.csv')
areas = pd.read_csv('./data/12_美国人口数据分析项目/state-areas.csv')
pop = pd.read_csv('./data/12_美国人口数据分析项目/state-population.csv')

In [3]:
display(areas.head(), areas.shape)
display(abb.head(), abb.shape)
display(pop.head(), pop.shape)

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


(52, 2)

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


(51, 2)

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


(2544, 4)

#### 合并population与abbrevs两个DataFrame
- 分别依据state/region列和abbreviation列来合并。
- 为了保留所有信息，使用外合并。merge()

In [6]:
pop_abb = pop.merge(abb,how='outer',left_on='state/region',right_on='abbreviation')
pop_abb.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AK,total,1990,553290.0,Alaska,AK
1,AK,under18,1990,177502.0,Alaska,AK
2,AK,total,1992,588736.0,Alaska,AK
3,AK,under18,1991,182180.0,Alaska,AK
4,AK,under18,1992,184878.0,Alaska,AK


#### 去除abbreviation的那一列（axis=1）

In [7]:
pop_abb.drop(columns=['abbreviation'], inplace=True)
pop_abb.head()

Unnamed: 0,state/region,ages,year,population,state
0,AK,total,1990,553290.0,Alaska
1,AK,under18,1990,177502.0,Alaska
2,AK,total,1992,588736.0,Alaska
3,AK,under18,1991,182180.0,Alaska
4,AK,under18,1992,184878.0,Alaska


#### 查看存在缺失数据的列。
- 使用.isnull().any()

In [9]:
pop_abb.isnull().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

#### 查看缺失数据
- 根据数据是否缺失情况显示数据，如果缺失为True，那么显示

In [12]:
# 存在空值的行
cond = pop_abb.isnull().any(axis=1)
# 查看缺失的数据
pop_abb.loc[cond]

Unnamed: 0,state/region,ages,year,population,state
1872,PR,under18,1990,,
1873,PR,total,1990,,
1874,PR,total,1991,,
1875,PR,under18,1991,,
1876,PR,total,1993,,
...,...,...,...,...,...
2203,USA,total,2010,309326295.0,
2204,USA,under18,2011,73902222.0,
2205,USA,total,2011,311582564.0,
2206,USA,under18,2012,73708179.0,


#### 找到有哪些state/region使得state的值为NaN
- 使用unique()查看非重复值

In [15]:
cond2 = pop_abb['state'].isnull()
pop_abb.loc[cond2]['state/region'].unique()

array(['PR', 'USA'], dtype=object)

#### 填充state这一列的所有NaN
- 找到的这些state/region的state项补上正确的值

> PR  => Puerto Rico 
>
> USA => United State

In [16]:
pop_abb.head()

Unnamed: 0,state/region,ages,year,population,state
0,AK,total,1990,553290.0,Alaska
1,AK,under18,1990,177502.0,Alaska
2,AK,total,1992,588736.0,Alaska
3,AK,under18,1991,182180.0,Alaska
4,AK,under18,1992,184878.0,Alaska


In [29]:
cond3 = pop_abb['state/region'] == 'PR'
pop_abb.loc[cond3,'state'] = 'Puerto Rico'

In [30]:
cond4 = pop_abb['state/region'] == 'USA'
pop_abb.loc[cond4,'state'] = ' United State'

In [32]:
# 再次查看存在state为空的行数据
cond5 = pop_abb['state'].isnull()
pop_abb.loc[cond5]['state/region'].unique()

array([], dtype=object)

#### 合并各州面积数据areas，使用左合并。

In [33]:
display(areas.head(),pop_abb.head())

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


Unnamed: 0,state/region,ages,year,population,state
0,AK,total,1990,553290.0,Alaska
1,AK,under18,1990,177502.0,Alaska
2,AK,total,1992,588736.0,Alaska
3,AK,under18,1991,182180.0,Alaska
4,AK,under18,1992,184878.0,Alaska


In [35]:
pop_add_areas = pop_abb.merge(areas,how='left')
pop_add_areas.sample(5)

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
152,AZ,total,2009,6343154.0,Arizona,114006.0
802,KS,under18,1996,696298.0,Kansas,82282.0
953,MA,total,2013,6692824.0,Massachusetts,10555.0
252,CO,total,1996,3919972.0,Colorado,104100.0
358,DC,total,2001,574504.0,District of Columbia,68.0


#### 继续寻找存在缺失数据的列

In [36]:
pop_add_areas.isnull().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

#### 我们会发现area(sq.mi)这一列有缺失数据，为了找出是哪一行，我们需要找出是哪个state没有数据

In [43]:
cond6 = pop_add_areas['area (sq. mi)'].isnull()
pop_add_areas.loc[cond6,'state'].unique()

array([' United State'], dtype=object)

#### 去除含有缺失数据的行

In [49]:
cond7 = pop_add_areas.notnull().all(axis=1)
pop_add_areas2 = pop_add_areas.loc[cond7]
pop_add_areas2

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0
...,...,...,...,...,...,...
2539,WY,under18,1993,137458.0,Wyoming,97818.0
2540,WY,total,1991,459260.0,Wyoming,97818.0
2541,WY,under18,1991,136720.0,Wyoming,97818.0
2542,WY,under18,1990,136078.0,Wyoming,97818.0


#### 再查看数据是否缺失

In [50]:
pop_add_areas2.isnull().any()

state/region     False
ages             False
year             False
population       False
state            False
area (sq. mi)    False
dtype: bool

#### 找出2010年的全民人口数据
- df.query(查询语句)

In [51]:
pop_add_areas2.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


In [54]:
df2010 = pop_add_areas2.query('year==2010 and ages=="total"')
df2010

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
43,AK,total,2010,713868.0,Alaska,656425.0
51,AL,total,2010,4785570.0,Alabama,52423.0
141,AR,total,2010,2922280.0,Arkansas,53182.0
149,AZ,total,2010,6408790.0,Arizona,114006.0
197,CA,total,2010,37333601.0,California,163707.0
283,CO,total,2010,5048196.0,Colorado,104100.0
293,CT,total,2010,3579210.0,Connecticut,5544.0
341,DC,total,2010,605125.0,District of Columbia,68.0
427,DE,total,2010,899711.0,Delaware,1954.0
475,FL,total,2010,18846054.0,Florida,65758.0


#### 以state列作为新的行索引
- 使用set_index

In [56]:
df2010.set_index('state',inplace=True)
df2010

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi)
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alaska,AK,total,2010,713868.0,656425.0
Alabama,AL,total,2010,4785570.0,52423.0
Arkansas,AR,total,2010,2922280.0,53182.0
Arizona,AZ,total,2010,6408790.0,114006.0
California,CA,total,2010,37333601.0,163707.0
Colorado,CO,total,2010,5048196.0,104100.0
Connecticut,CT,total,2010,3579210.0,5544.0
District of Columbia,DC,total,2010,605125.0,68.0
Delaware,DE,total,2010,899711.0,1954.0
Florida,FL,total,2010,18846054.0,65758.0


#### 计算人口密度density

In [57]:
df2010.head()

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi)
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alaska,AK,total,2010,713868.0,656425.0
Alabama,AL,total,2010,4785570.0,52423.0
Arkansas,AR,total,2010,2922280.0,53182.0
Arizona,AZ,total,2010,6408790.0,114006.0
California,CA,total,2010,37333601.0,163707.0


In [60]:
df2010.loc[:,'density'] = df2010['population'] / df2010['area (sq. mi)']
df2010.head()

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi),density
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alaska,AK,total,2010,713868.0,656425.0,1.087509
Alabama,AL,total,2010,4785570.0,52423.0,91.287603
Arkansas,AR,total,2010,2922280.0,53182.0,54.948667
Arizona,AZ,total,2010,6408790.0,114006.0,56.214497
California,CA,total,2010,37333601.0,163707.0,228.051342


#### 排序，并找出人口密度最高的五个州sort_values()

In [65]:
# 降序排序，并输出前五条数据
df2010.sort_values('density',ascending=False).head()

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi),density
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
District of Columbia,DC,total,2010,605125.0,68.0,8898.897059
Puerto Rico,PR,total,2010,3721208.0,3515.0,1058.665149
New Jersey,NJ,total,2010,8802707.0,8722.0,1009.253268
Rhode Island,RI,total,2010,1052669.0,1545.0,681.339159
Connecticut,CT,total,2010,3579210.0,5544.0,645.600649


#### 排序，找出人口密度最低的五个州

In [64]:
# 升序排序，并输出前五条数据
df2010.sort_values('density',ascending=True).head()

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi),density
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alaska,AK,total,2010,713868.0,656425.0,1.087509
Wyoming,WY,total,2010,564222.0,97818.0,5.768079
Montana,MT,total,2010,990527.0,147046.0,6.736171
North Dakota,ND,total,2010,674344.0,70704.0,9.537565
South Dakota,SD,total,2010,816211.0,77121.0,10.583512
