## Parts 1, 2



In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

response = requests.get("https://worldpopulationreview.com/state-rankings/starbucks-stores-by-state")
soup = BeautifulSoup(response.content, "html.parser")

table = soup.find("table")
headers = [header.text.strip() for header in table.find_all("th")]
headers = headers[:3]
rows = []
state_names = []

for row in table.find_all("tr")[1:]:
    state_name = row.find("th").find("a").text.strip()
    state_names.append(state_name)
    cells = [cell.text.strip() for cell in row.find_all("td")]
    rows.append(cells)

df_s = pd.DataFrame(rows, columns=headers)
df_s.index = state_names

In [2]:
df_s = df_s.rename(columns={'State': 'starbucks_locations'})
df_s = df_s.drop(['Starbucks Stores 2023', 'Starbucks Stores 2021'], axis=1)

In [3]:
df_s['State'] = df_s.index
df_s = df_s.reset_index(drop=True)

In [4]:
df_s

Unnamed: 0,starbucks_locations,State
0,3080,California
1,1346,Texas
2,844,Florida
3,741,Washington
4,692,New York
5,677,Illinois
6,548,Arizona
7,495,Colorado
8,491,Ohio
9,489,Virginia


In [5]:
response1 = requests.get("https://worldpopulationreview.com/state-rankings/dunkin-donuts-by-state")
soup1 = BeautifulSoup(response1.content, "html.parser")

table1 = soup1.find("table")
headers1 = [header.text.strip() for header in table.find_all("th")]
headers1 = headers1[:2]
rows1 = []
state_names1 = []

for row in table1.find_all("tr")[1:]:
    state_name1 = row.find("th").find("a").text.strip()
    state_names1.append(state_name1)
    cells1 = [cell.text.strip() for cell in row.find_all("td")]
    rows1.append(cells1)

df_d = pd.DataFrame(rows1, columns=headers1)
df_d.index = state_names1

In [6]:
df_d = df_d.rename(columns={'Starbucks Stores 2023': 'dunkin_locations'})
df_d = df_d.drop(['State'], axis=1)

In [7]:
df_d['State'] = df_d.index
df_d = df_d.reset_index(drop=True)

In [8]:
df = pd.merge(df_s, df_d, on='State', how='outer')

In [9]:
df_d

Unnamed: 0,dunkin_locations,State
0,1414.0,New York
1,1068.0,Massachusetts
2,883.0,Florida
3,866.0,New Jersey
4,692.0,Illinois
5,629.0,Pennsylvania
6,480.0,Connecticut
7,303.0,Maryland
8,265.0,Georgia
9,245.0,Ohio


## Parts 4, 5, 6

In [10]:
response2 = requests.get("https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population")
soup2 = BeautifulSoup(response2.content, "html.parser")
table2 = soup2.find("table", attrs={
    "class": ["wikitable", "sortable"],
    "style": "width:100%; text-align:center;"
})
headers2 = [th.text.strip() for th in table2.find_all("th")]
rows2 = []

for tr in table2.find_all("tr")[1:]:
    cells2 = []
    for td in tr.find_all("td"):
        if td.find("a"):
            cells2.append(td.find("a").text.strip())
        else:
            cells2.append(td.text.strip())
    rows2.append(cells2)

df_table2 = pd.DataFrame(rows2, columns=headers2)

In [11]:
df_combined = pd.merge(df, df_table2, on='State', how='outer')
df_combined = df_combined.loc[:, ['State', 'starbucks_locations', 'dunkin_locations', 'Census population, April 1, 2020[1][2]']]

In [12]:
df_combined = df_combined.rename(columns={'Census population, April 1, 2020[1][2]': 'population'})

In [13]:
df_combined['Dunkin_stock_price'] = 106.48
df_combined['starbucks_stock_price'] = 98.02

In [14]:
regions = {
    'West': ['California', 'Nevada', 'Oregon', 'Washington', 'Hawaii', 'Alaska', 'Arizona', 'Colorado', 'Idaho', 'Montana', 'New Mexico', 'Utah', 'Wyoming'],
    'South': ['Texas', 'Florida', 'Georgia', 'Alabama', 'Delware', 'Maryland', 'North Carolina', 'South Carolina', 'Virginia', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma'],
    'Northeast': ['New York', 'New Jersey', 'Pennsylvania', 'Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Isalnd', 'Vermont'],
    'Midwest': ['Illinois,' 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota']
}

def get_region(state):
    for region, states in regions.items():
        if state in states:
            return region
    return 'Other'

df_combined['region'] = df_combined['State'].apply(get_region)

In [15]:
df_combined

Unnamed: 0,State,starbucks_locations,dunkin_locations,population,Dunkin_stock_price,starbucks_stock_price,region
0,Alabama,85.0,59.0,5024279,106.48,98.02,South
1,Alaska,49.0,0.0,733391,106.48,98.02,West
2,American Samoa,,,49710,106.48,98.02,Other
3,Arizona,548.0,102.0,7151502,106.48,98.02,West
4,Arkansas,55.0,9.0,3011524,106.48,98.02,South
5,California,3080.0,134.0,39538223,106.48,98.02,West
6,Colorado,495.0,43.0,5773714,106.48,98.02,West
7,Connecticut,123.0,480.0,3605944,106.48,98.02,Northeast
8,Contiguous United States,,,329260619,106.48,98.02,Other
9,Delaware,25.0,66.0,989948,106.48,98.02,Other


## Part 7 (code below)
- Are some of these chains more prevalent in certain states than others? Possibly despite having less stores overall? Same questions for regions instead of states.

Almost always there are more Dunkin Donuts in the Northeast states, and more Starbucks in the West and South. The Midwest was less consistent. States that stood out were California with 3,080 Starbucks locations and 134 Dunkin Donuts locations, and New York and Massechusetts with 1,414 and 1,068 Dunkins respectively, and 692 and 273 Starbucks respectively.

- Does the distribution of each chain’s stores match population distribution, by both state/region?

Total locations match population distribution, but regionality is important when it comes to distribution for each chain. For example, California has 40 million people and over 3,000 total locations and New York has over 20 millions people and over 2,000 total locations, while Wyoming has 600,000 people and 24 total locations. But even though California has so many people, they only have 134 Dunkin locations.

- Do the financial data match what you’d expect based on the number and locations of the stores? Why or why not?

I chose stock price. This data does not give enough information to make conclusions from number of stores.

## Part 8

In [16]:
def scrape_company_locations(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table")
    headers = [header.text.strip() for header in table.find_all("th")]
    headers = headers[:2]
    rows = []
    state_names = []

    for row in table.find_all("tr")[1:]:
        state_name = row.find("th").find("a").text.strip()
        state_names.append(state_name)
        cells = [cell.text.strip() for cell in row.find_all("td")]
        rows.append(cells)

    df = pd.DataFrame(rows, columns=headers)
    df.index = state_names
    df = df.rename(columns={'State': 'locations'})
    df['State'] = df.index
    df = df.reset_index(drop=True)

    return df

In [17]:
scrape_company_locations("https://worldpopulationreview.com/state-rankings/in-n-out-by-state")

Unnamed: 0,locations,In-N-Out Locations 2024,State
0,276,267.0,California
1,43,41.0,Texas
2,35,34.0,Arizona
3,23,22.0,Nevada
4,13,12.0,Utah
5,12,7.0,Colorado
6,4,4.0,Oregon
7,1,,Idaho
8,0,,Alabama
9,0,,Alaska
