#### Script used to clean and sort data

### 02- Data Wrangling for USA population data

In [1]:
# Use this cell to set up import statements for all of the packages 
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# 'magic word' so that your visualizations are plotted
%matplotlib inline

In [2]:
# Read the raw US census population table from the CSV file into a DataFrame.
population_df = pd.read_csv('us_census_population.csv')
# Report the table dimensions as (rows, columns), then preview the first rows.
print('- census population [2010-2019] size : ', population_df.shape)
population_df.head()

- census population [2010-2019] size :  (536, 13)


Unnamed: 0,state,year,State Code,Region,population,population ratio,population region ratio,area,area density,land,land density,water,water density
0,Alabama,2010,AL,South,4785437.0,0.015471,0.041661,135767.0,35.2474239,131171.0,34.05422407,4597.0,1.193459439
1,Alaska,2010,AK,West,713910.0,0.002308,0.009902,1723337.0,0.41426024,1477953.0,0.355274195,245384.0,0.058986046
2,Arizona,2010,AZ,West,6407172.0,0.020714,0.088865,295234.0,21.70201264,294207.0,21.62652009,1026.0,0.07541904
3,Arkansas,2010,AR,South,2921964.0,0.009446,0.025438,137732.0,21.21485203,134771.0,20.75876937,2961.0,0.45608266
4,California,2010,CA,West,37319502.0,0.120649,0.517604,423967.0,88.02454436,403466.0,83.7681018,20501.0,4.256442563


In [3]:
# Summarize the frame with info(): the non-null count per column shows which
# columns have missing values, and the dtype shows how each column was parsed.
population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 13 columns):
state                       536 non-null object
year                        536 non-null int64
State Code                  510 non-null object
Region                      510 non-null object
population                  521 non-null float64
population ratio            531 non-null float64
population  region ratio    531 non-null float64
area                        528 non-null float64
area density                531 non-null object
land                        528 non-null float64
land density                531 non-null object
water                       528 non-null float64
water density               531 non-null object
dtypes: float64(6), int64(1), object(6)
memory usage: 54.5+ KB


In [4]:
# Count rows that are exact duplicates of an earlier row (vectorized sum
# over the boolean mask returned by duplicated()).
print('- Number of duplicated rows:', population_df.duplicated().sum(), '\n')
# Count the missing values in each column.
print('- The number of missing data in each columns:')
population_df.isna().sum()

- Number of duplicated rows: 0 

- The number of missing data in each columns:


state                        0
year                         0
State Code                  26
Region                      26
population                  15
population ratio             5
population  region ratio     5
area                         8
area density                 5
land                         8
land density                 5
water                        8
water density                5
dtype: int64

#### Drop the years before 2010 and after 2019: not enough population data exists for them

In [5]:
# List the distinct years present in the table to check its coverage.
population_df['year'].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

#### Drop the District of Columbia and the U.S. territories: not enough data.
Excluded non-state areas: Guam, Puerto Rico, District of Columbia, Virgin Islands, and Mariana Islands

In [6]:
# Non-state areas to exclude: the U.S. territories and the District of Columbia.
colonies = ['Puerto Rico', 'Mariana Islands', 'Virgin Islands','Guam' , 'District of Columbia']
# Keep only the rows whose state is not in the exclusion list. The explicit
# .copy() makes the result an independent frame, so the column assignments in
# later cells modify it directly instead of raising SettingWithCopyWarning on
# a slice of the original table.
population_df = population_df[~population_df['state'].isin(colonies)].copy()
# Number of distinct states left after the filter.
population_df['state'].nunique()

50

In [9]:
# count the number of missing values in each column

In [7]:
# Re-count the missing values per column after removing the non-state rows.
population_df.isnull().sum()

state                       0
year                        0
State Code                  0
Region                      0
population                  0
population ratio            0
population  region ratio    0
area                        0
area density                0
land                        0
land density                0
water                       0
water density               0
dtype: int64

#### Convert columns to the appropriate data types

In [8]:
# Cast each column to its intended dtype, addressing columns by name rather
# than by position so the cell keeps working if the column order changes.
#
# astype() returns a new frame, so the dtype change always takes effect.
# Assigning through .iloc[:, i] writes into the existing column and, on
# pandas >= 2.0, keeps that column's original dtype (float64 / object).
#
# The count and area columns were parsed as float64 and the density columns
# as object (see the first info() output). No missing values remain at this
# point, so the integer casts cannot fail on NaN.
population_df = population_df.astype({
    'year': int,
    'population': int,
    'area': int,
    'area density': float,
    'land': int,
    'land density': float,
    'water': int,
    'water density': float,
})

In [9]:
# Convert the state names to str and strip surrounding whitespace, so names
# that differ only by stray spaces are not counted as distinct states.
# assign() builds a new frame (column order is preserved) instead of writing
# into a frame that may be a filtered slice of the original table.
population_df = population_df.assign(
    state=population_df['state'].astype(str).str.strip()
)
# Number of distinct states after the cleanup.
population_df['state'].nunique()

50

#### Confirm the corrections with info()

In [10]:
# Re-run info() to check the dtypes and non-null counts after the conversions.
population_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 535
Data columns (total 13 columns):
state                       500 non-null object
year                        500 non-null int64
State Code                  500 non-null object
Region                      500 non-null object
population                  500 non-null int64
population ratio            500 non-null float64
population  region ratio    500 non-null float64
area                        500 non-null int64
area density                500 non-null float64
land                        500 non-null int64
land density                500 non-null float64
water                       500 non-null int64
water density               500 non-null float64
dtypes: float64(5), int64(5), object(3)
memory usage: 54.7+ KB


In [11]:
# Preview the first rows of the cleaned table.
population_df.head()

Unnamed: 0,state,year,State Code,Region,population,population ratio,population region ratio,area,area density,land,land density,water,water density
0,Alabama,2010,AL,South,4785437,0.015471,0.041661,135767,35.247424,131171,34.054224,4597,1.193459
1,Alaska,2010,AK,West,713910,0.002308,0.009902,1723337,0.41426,1477953,0.355274,245384,0.058986
2,Arizona,2010,AZ,West,6407172,0.020714,0.088865,295234,21.702013,294207,21.62652,1026,0.075419
3,Arkansas,2010,AR,South,2921964,0.009446,0.025438,137732,21.214852,134771,20.758769,2961,0.456083
4,California,2010,CA,West,37319502,0.120649,0.517604,423967,88.024544,403466,83.768102,20501,4.256443


In [12]:
# Write the cleaned table to population.csv; index = False leaves the row
# index out of the file.
population_df.to_csv('population.csv', index = False)