In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Part 1

##### 1. Scrape data from menuism.com


In [2]:
# Scrape the menuism restaurant-locations index and total the location counts
# listed for each chain of interest.
url = "https://www.menuism.com/restaurant-locations/"
# A timeout keeps the cell from hanging indefinitely if the site never answers.
response = requests.get(url, timeout=30)

chains = [
    "Starbucks",
    "Dunkin Donuts",
    "Peet's Coffee and Tea",
    "Tim Horton's",
    "Panera Bread",
    "Caribou Coffee",
    "Au Bon Pain",
    "The Coffee Bean and Tea Leaf",
    "McDonald's",
]

# NOTE(review): the "State Name" column actually receives chain names below.
# The label is left unchanged in case other cells rely on it - confirm and rename.
data_dict = {"State Name": [], "Number of Locations": []}

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")

    for chain in chains:
        # `string=` is the current name of the deprecated `text=` filter.
        elements = soup.find_all("span", {"class": "chainname"}, string=chain)
        total_locations = 0

        for element in elements:
            location_count = element.find_next("span", {"class": "num-locations"})
            if location_count is not None:
                # Drop thousands separators so values such as "1,234" parse.
                total_locations += int(location_count.text.strip().replace(",", ""))

        data_dict["State Name"].append(chain)
        data_dict["Number of Locations"].append(total_locations)
else:
    print("Failed to retrieve the webpage")

# Built outside the branch so `state_locations` always exists; it is an empty
# frame with the expected columns when the request failed.
state_locations = pd.DataFrame(data_dict)

Failed to retrieve the webpage


##### 2. `stateabb()` function

In [None]:
# Full state name -> two-letter postal abbreviation for the 50 U.S. states.
_STATE_ABBREVIATIONS = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
}


def stateabb(state_name):
    """Return the two-letter abbreviation for a U.S. state name.

    The previous version called ``pd.read_csv()`` without a file path, which
    raises ``TypeError`` on every call. The lookup now uses a built-in table,
    so the function has no dependency on an external file.

    Parameters:
    - state_name (str): Full state name, e.g. "California". Surrounding
      whitespace is ignored; the match is case-sensitive.

    Returns:
    - str: The abbreviation (e.g. "CA"), or "Not a State" when the name is
      not one of the 50 states or is not a string.
    """
    if not isinstance(state_name, str):
        return "Not a State"
    return _STATE_ABBREVIATIONS.get(state_name.strip(), "Not a State")

##### 3. Parse, Merge, and Tidy

In [None]:
# Illustrative per-chain location counts keyed by state name. The original
# cell used `...` placeholders inside the dict literals, which is a syntax
# error; real scraped counts should replace these sample values.
data_example = {
    'Starbucks': {'California': 500, 'Texas': 300},
    'Dunkin Donuts': {'California': 100, 'Texas': 200},
}

# One single-column frame per chain, indexed by state name. Building each from
# a named Series avoids mixing a dict with dict_keys in one constructor call,
# which pandas rejects.
dfs = [
    pd.Series(locations, name=chain).rename_axis('State Name').to_frame()
    for chain, locations in data_example.items()
]

# Align the chains side by side on the shared state index (one column per chain).
df_merged = pd.concat(dfs, axis=1)


### Part 2

##### 4. Scrape state names and populations from Wikipedia

In [3]:
# Scrape state names and populations from the first "wikitable" on the page.
url = "https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population"
# A timeout keeps the cell from hanging indefinitely if the site never answers.
response = requests.get(url, timeout=30)

# Stop here with a clear error: the clean-up steps below need `state_pop`,
# so continuing after a failed download would only end in a NameError.
if response.status_code != 200:
    raise RuntimeError("Failed to retrieve the webpage")

soup = BeautifulSoup(response.content, "html.parser")

table = soup.find("table", {"class": "wikitable"})
if table is None:
    raise RuntimeError("No table with class 'wikitable' found on the page")

state_names = []
populations = []

# Skip the first <tr>, which is treated as the header row.
for row in table.find_all("tr")[1:]:
    columns = row.find_all("td")
    # Cells 2 and 3 are read below, so a row needs at least four <td> cells.
    if len(columns) >= 4:
        state_names.append(columns[2].text.strip())
        populations.append(columns[3].text.strip())

data = {"State Name": state_names, "Population": populations}
state_pop = pd.DataFrame(data)

# NOTE(review): rows 56-59 are dropped by hard-coded position - presumably
# summary/total rows at the bottom of the table; verify against the live page.
state_pop = state_pop.drop([56, 57, 58, 59])
# The comma is a literal thousands separator, so no regex is needed.
state_pop['Population'] = state_pop['Population'].str.replace(",", "", regex=False).astype(int)
state_pop


In [None]:
# The financial data is made up (illustrative figures; see the column label for units).
# `financial_data` was previously never defined, so this cell raised NameError;
# the values below are the ones shown in the recorded result table.
financial_data = {
    "Chain": [
        "Starbucks",
        "Dunkin Donuts",
        "Peet's Coffee and Tea",
        "Tim Horton's",
        "Panera Bread",
        "Caribou Coffee",
        "Au Bon Pain",
        "The Coffee Bean and Tea Leaf",
        "McDonald's",
    ],
    "Revenue ($M)": [26800, 1200, 700, 3000, 2500, 350, 400, 310, 21000],
}
financial_df = pd.DataFrame(financial_data)

financial_df
RESULT

#                          Chain  Revenue ($M)
#0                     Starbucks         26800
#1                 Dunkin Donuts          1200
#2         Peet's Coffee and Tea           700
#3                  Tim Horton's          3000
#4                  Panera Bread          2500
#5                Caribou Coffee           350
#6                   Au Bon Pain           400
#7  The Coffee Bean and Tea Leaf           310
#8                    McDonald's         21000

###

In [None]:
# Region name -> list of the state names assigned to that region.
region_mapping = {
    "Northeast": [
        "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island",
        "Vermont", "New Jersey", "New York", "Pennsylvania",
    ],
    "Midwest": [
        "Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin", "Iowa", "Kansas",
        "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota",
    ],
    "South": [
        "Delaware", "Florida", "Georgia", "Maryland", "North Carolina", "South Carolina",
        "Virginia", "West Virginia", "Alabama", "Kentucky", "Mississippi", "Tennessee",
        "Arkansas", "Louisiana", "Oklahoma", "Texas",
    ],
    "West": [
        "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah",
        "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington",
    ],
}


def get_region(state):
    """Return the name of the region whose list contains `state`.

    Regions are checked in the order they appear in `region_mapping`;
    "Unknown" is returned when no region lists the given name.
    """
    matching_regions = (
        region_name
        for region_name, member_states in region_mapping.items()
        if state in member_states
    )
    return next(matching_regions, "Unknown")

# Label every row with its region, looked up from the "State" column.
# NOTE(review): `states_data` is not created in any visible cell - confirm
# where it is defined so the notebook survives Restart & Run All.
region_labels = states_data["State"].apply(get_region)
states_data["Region"] = region_labels

states_data.loc[:, ["State", "Region"]].head()

In [None]:
# Cross join: give both frames the same constant "key" column, merge on it so
# every state row is paired with every chain row, then discard the helper column.
# NOTE(review): `states_data_cleaned` is not created in any visible cell - confirm its source.
states_keyed = states_data_cleaned.assign(key=1)
chains_keyed = financial_df.assign(key=1)
merged_df = states_keyed.merge(chains_keyed, on='key').drop(columns='key')

# Attach the region for each row's state name.
merged_df["Region"] = merged_df["State Name"].apply(get_region)

merged_df.head()

#### Part 3 

##### Analyze

a. Are some of these chains more prevalent in certain states than others, possibly despite having fewer stores overall? Do the same questions hold for regions instead of states?

In [3]:
# I am assuming this is how I would approach it:
# NOTE(review): if `merged_df` is the state x chain cross join, each
# (state, chain) pair occurs once, so these are row counts rather than
# store counts - confirm the intended input.
def _chain_counts(frame, level):
    """Count rows per (level, Chain) pair, with chains as columns and gaps filled with 0."""
    pair_sizes = frame.groupby([level, 'Chain']).size()
    return pair_sizes.unstack().fillna(0)


statewise_counts = _chain_counts(merged_df, 'State Abbreviation')
regionwise_counts = _chain_counts(merged_df, 'Region')

# Number of rows per chain across the whole frame.
overall_counts = merged_df['Chain'].value_counts()

statewise_counts, regionwise_counts, overall_counts

NameError: name 'merged_df' is not defined

b. How does your chosen financial metric change by state and region for each chain? For example, having 5 stores in California is very different from having 5 stores in Wyoming.

In [None]:
# I am assuming this is how I would approach it:
# NOTE(review): if `merged_df` is the state x chain cross join, every state
# sums the same chain revenues, so the per-state totals would all be equal and
# the correlation undefined - confirm the intended input.
revenue_column = 'Revenue ($M)'

# Total revenue per state and per region.
statewise_revenue = merged_df.groupby('State Abbreviation')[revenue_column].sum()
regionwise_revenue = merged_df.groupby('Region')[revenue_column].sum()

# Bring in each state's population next to its revenue total.
population_by_state = states_data_cleaned[['State Abbreviation', 'Population']]
statewise_data = pd.merge(
    statewise_revenue,
    population_by_state,
    on="State Abbreviation",
    how="left",
)

# Correlation between state population and state revenue (pandas default method).
correlation = statewise_data['Population'].corr(statewise_data[revenue_column])

statewise_revenue, regionwise_revenue, correlation

#### Part 4

##### Automate: 

In [4]:
# My best effort in creating this mega function: 

def scrape_menuism_data(url):
    """
    Scrape the menuism webpage for state names and corresponding number of store locations.

    The company name is derived from the last path segment of the URL
    (hyphens become spaces, words are title-cased) and used to select the
    matching "chainname" spans on the page.

    Parameters:
    - url (str): The menuism URL for a specific coffee chain, with or without
      a trailing slash.

    Returns:
    - DataFrame: A DataFrame containing state abbreviation, location count, and
      company name. It is empty (columns only) when the page cannot be
      retrieved or nothing matches.
    """
    data_dict = {"State Abbreviation": [], "Location Count": [], "Company Name": []}
    # A timeout keeps the call from hanging indefinitely if the site never answers.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print("Failed to retrieve the webpage")
        return pd.DataFrame(data_dict)

    soup = BeautifulSoup(response.content, "html.parser")

    # Extracting company name from the URL. Stripping trailing slashes first
    # makes ".../dunkin-donuts" and ".../dunkin-donuts/" resolve to the same slug.
    slug = url.rstrip("/").split("/")[-1]
    # NOTE(review): title-casing a slug cannot restore apostrophes or lowercase
    # words (e.g. "Peet's Coffee and Tea"), so such chains may match nothing -
    # verify against the page's actual chain names.
    company_name = slug.replace("-", " ").title()

    # `string=` is the current name of the deprecated `text=` filter.
    elements = soup.find_all("span", {"class": "chainname"}, string=company_name)

    for element in elements:
        count_tag = element.find_next("span", {"class": "num-locations"})
        state_tag = element.find_next("span", {"class": "statename"})

        # find_next returns None when no such tag follows; check before
        # touching `.text` to avoid an AttributeError.
        if count_tag is None or state_tag is None:
            continue

        state_name = state_tag.text.strip()
        if not state_name:
            continue

        # Parse the count before appending anything so the three lists stay
        # the same length; commas are dropped so "1,234" parses.
        location_total = int(count_tag.text.strip().replace(",", ""))

        data_dict["State Abbreviation"].append(stateabb(state_name))
        data_dict["Location Count"].append(location_total)
        data_dict["Company Name"].append(company_name)

    return pd.DataFrame(data_dict)
