# Scope analysis

In this notebook we will scrape Wikipedia to select the five most populous states in the USA, so that we can base our project on those states.

In [68]:
import pandas as pd 
import requests
from bs4 import BeautifulSoup
from io import StringIO

In [69]:
# Wikipedia URL
url = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population"

# Identify the client and bound the wait: a request without a timeout can hang the
# kernel indefinitely, and Wikimedia asks automated clients to send a descriptive
# User-Agent (generic library defaults may be rejected).
headers = {"User-Agent": "scope-analysis-notebook/1.0 (state population lookup)"}
response = requests.get(url, headers=headers, timeout=30)

# Fail here on a 4xx/5xx response instead of trying to parse an error page below.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# after checking the web, we can see we need the wikitable class to call the table
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    # find() returns None when nothing matches; without this guard the string
    # "None" would be handed to read_html and fail with a misleading error.
    raise ValueError(f"No table with class 'wikitable' found at {url}")

# lets make a stringIO object to read the html
# (read_html takes a file-like object; passing the markup as a bare string is
# deprecated in recent pandas versions)
html_string = str(table)
html_file = StringIO(html_string)

# read_html returns a list of DataFrames; only one <table> was passed in
table_df = pd.read_html(html_file)[0]

table_df

Unnamed: 0_level_0,State or territory,Census population[8][a],Census population[8][a],"Change, 2010–2020[8][a]","Change, 2010–2020[8][a]",House seats[b],House seats[b],"Pop. per elec. vote, 2020[c]",Pop. per seat (2020)[a],% US (2020),% EC (2020)
Unnamed: 0_level_1,State or territory,"July 1, 2023 (est.)","April 1, 2020",%,Abs.,Seats,%,"Pop. per elec. vote, 2020[c]",Pop. per seat (2020)[a],% US (2020),% EC (2020)
0,,,,,,,,,,,
1,California,38965193.0,39538223.0,6.13%,2284267,52,11.95%,732189,760350,11.800%,10.04%
2,Texas,30503301.0,29145505.0,15.91%,3999944,38,8.74%,728638,766987,8.698%,7.43%
3,Florida,22610726.0,21538187.0,14.56%,2736877,28,6.44%,717940,769221,6.428%,5.58%
4,New York,19571216.0,20201249.0,4.25%,823147,26,5.98%,721473,776971,6.029%,5.20%
...,...,...,...,...,...,...,...,...,...,...,...
56,Northern Mariana Islands[13],,47329.0,−12.16%,"−6,554",1*,—,—,—,0.014%,—
57,Contiguous United States,332746351.0,329260619.0,7.36%,22585613,432 (+1*),99.31%,627163,757745,98.265%,98.70%
58,The 50 states,334235923.0,330759736.0,7.34%,22615921,435,100%,621729,755796,98.713%,99.44%
59,The 50 states and D.C.,334914895.0,331449281.0,7.35%,22703743,435 (+1*),100%,619531,—,98.918%,100%


In [70]:
# Selecting table columns

# managing multilevel columns
# The scraped header has two levels. The top level carries footnote markers
# ('Census population[8][a]') that change whenever the article is edited, so the
# columns are addressed by their second-level labels instead. The isinstance
# guard keeps this step harmless if the cell is executed a second time.
if isinstance(table_df.columns, pd.MultiIndex):
    table_df = table_df.droplevel(0, axis=1)

# Row 0 of the scraped table is empty and the rows after 56 are aggregate totals
# ("Contiguous United States", "The 50 states", ...), so keep index labels 1-56.
# .loc slices by label (end inclusive), which selects the same rows even when
# the cell is re-run; a positional slice would drop the first row each time.
# Picking the 2020 census column directly avoids naming the estimate column,
# whose label changes every time a new yearly estimate is published.
table_df = table_df.loc[1:56, ['State or territory', 'April 1, 2020']].copy()

table_df.head()

2,State or territory,"April 1, 2020"
1,California,39538223.0
2,Texas,29145505.0
3,Florida,21538187.0
4,New York,20201249.0
5,Pennsylvania,13002700.0


In [72]:
# renaming columns and sorting
table_df.columns = ['State','2020 Population']

# sort_values returns a new frame rather than sorting in place; assign it back so
# that the following cells really work on rows ordered by population instead of
# relying on the order in which the web page happened to list them.
table_df = table_df.sort_values('2020 Population', ascending=False)

table_df.head()

Unnamed: 0,State,2020 Population
1,California,39538223.0
2,Texas,29145505.0
3,Florida,21538187.0
4,New York,20201249.0
5,Pennsylvania,13002700.0


In [76]:
# Share of the summed population held by each row, to judge how
# representative a small selection of states would be.
population = table_df['2020 Population']
total_population = population.sum()

# (row population / total) * 100, rounded to one decimal place
table_df['% population'] = population.div(total_population).mul(100).round(1)

table_df.head(10)

Unnamed: 0,State,2020 Population,% population
1,California,39538223.0,11.8
2,Texas,29145505.0,8.7
3,Florida,21538187.0,6.4
4,New York,20201249.0,6.0
5,Pennsylvania,13002700.0,3.9
6,Illinois,12812508.0,3.8
7,Ohio,11799448.0,3.5
8,Georgia,10711908.0,3.2
9,North Carolina,10439388.0,3.1
10,Michigan,10077331.0,3.0


In [77]:
# selecting top 5
# nlargest chooses rows by the population value itself, so the result does not
# depend on the frame's current row order or on its index labels (a label slice
# such as .loc[:5] only works while the index starts at 1 and is already sorted).
selected = table_df.nlargest(5, '2020 Population')
selected

Unnamed: 0,State,2020 Population,% population
1,California,39538223.0,11.8
2,Texas,29145505.0,8.7
3,Florida,21538187.0,6.4
4,New York,20201249.0,6.0
5,Pennsylvania,13002700.0,3.9


In [85]:
# Population covered by the selected states.
tots = selected['2020 Population'].sum()

# Compute the share from the raw totals and round once. Adding up the per-row
# percentages would accumulate the error of values already rounded to one
# decimal and can surface floating-point noise in the printed figure.
per = round(tots / total_population * 100, 1)

print(f'Total population covered with the selected states: {tots:,.0f}\n Percentage of USA population: {per}%')

Total population covered with the selected states: 123,425,864
 Percentage of USA population: 36.8%
