In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Datasets/yelp_academic_dataset_business.csv')

# Display the first few rows of the dataset
df.head()

AttributeError: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import)

In [2]:

# Filter only the required columns: postal_code (zip code) and name (business name)
df_zip_business = df[['postal_code', 'name']]

# Drop rows with missing values in these columns
df_zip_business = df_zip_business.dropna(subset=['postal_code', 'name'])

# Display the first few rows of the updated dataset
df_zip_business.head()


Unnamed: 0,postal_code,name
0,93101,"Abby Rappoport, LAC, CMQ"
1,63123,The UPS Store
2,85711,Target
3,19107,St Honore Pastries
4,18054,Perkiomen Valley Brewery


In [3]:

# List of high-end businesses commonly found in gentrified areas
luxury_businesses = [
    "Starbucks", "Whole Foods", "Trader Joe's", "Sephora", "Apple Store",
    "Lululemon", "Urban Outfitters", "Pottery Barn", "West Elm", "Nordstrom",
    "Anthropologie", "Crate & Barrel", "Williams-Sonoma", "Tesla", "Neiman Marcus",
    "Saks Fifth Avenue", "Bloomingdale's", "Restoration Hardware", "J.Crew", "Madewell", 
    "Chipotle", "Shake Shack", "Sweetgreen", "Warby Parker", "Equinox Gym", "SoulCycle", 
    "Peloton Showrooms", "Blue Bottle Coffee", "Aesop", "Barnes & Noble", "Pressed Juicery", 
    "Sprinkles Cupcakes", "Le Pain Quotidien", "Joe & The Juice", "Bonobos", "Away Luggage", 
    "Allbirds", "Everlane", "Rothy's", "Vineyard Vines", "Flywheel Sports", "Tory Burch", 
    "Chanel", "Glossier", "Drybar", "M.A.C Cosmetics", "Fenty Beauty", "Gucci", "Louis Vuitton", 
    "Coach", "Kate Spade", "Michael Kors", "Burberry", "Tommy Hilfiger", "Ralph Lauren", 
    "Fjällräven", "The North Face", "Patagonia", "Tiffany & Co.", "Bose", "Bang & Olufsen", 
    "Tumi", "L'Occitane", "Kiehl's", "Birchbox", "Eileen Fisher", "Free People", "H&M", "Zara", 
    "Uniqlo", "CB2", "Sur La Table", "Boll & Branch", "Casper", "The Wing", "WeWork", "Blu Dot",
    "Philz Coffee", "Stumptown Coffee Roasters", "La Colombe Coffee Roasters", "Cuyana", 
    "Reformation", "Outdoor Voices", "Fabletics", "Gorjana", "Moleskine", "Muji", "Rag & Bone", 
    "Theory", "Rent the Runway", "Parachute Home", "Brooklinen", "Dr. Martens", "Veja", "Aritzia", 
    "BaubleBar", "Mejuri", "Vrai & Oro", "Dyson", "Bose", "Sonos", "Rituals", "Origins", 
    "Bumble and Bumble", "Bliss Spa", "Fresh", "Bobbi Brown", "Smashbox", "Tatcha", "Too Faced", 
    "Estée Lauder", "Caudalie", "Dermalogica", "Hourglass", "Oribe", "Jo Malone London", "Byredo", 
    "Le Labo"
]

# Create a filter to select rows with luxury businesses
luxury_filter = df_zip_business['name'].str.contains('|'.join(luxury_businesses), case=False, na=False)

# Filter the dataset to get only luxury businesses
df_luxury = df_zip_business[luxury_filter]

df_luxury.head()


Unnamed: 0,postal_code,name
21,46240,Barnes & Noble Booksellers
26,93101,H&M
71,37215,Pottery Barn Kids
88,8102,Fresh Fruits & Salads
101,46240,Saks Fifth Avenue


In [4]:

# Count the number of luxury businesses in each zip code
luxury_counts = df_luxury.groupby('postal_code').size().reset_index(name='luxury_business_count')

# Display the first few rows of the counts
luxury_counts.head()


Unnamed: 0,postal_code,luxury_business_count
0,8002,22
1,8003,1
2,8012,3
3,8016,2
4,8021,3


In [5]:

# Merge the luxury counts with the original dataset of zip codes to get all zip codes (even those with zero luxury businesses)
final_df = df_zip_business[['postal_code']].drop_duplicates().merge(luxury_counts, on='postal_code', how='left')

# Fill NaN values with 0 (representing zip codes with no luxury businesses)
final_df['luxury_business_count'] = final_df['luxury_business_count'].fillna(0).astype(int)

# Filter out zip codes that don't start with a number - gets rid of Canadian zips in dataset
final_df = final_df[final_df['postal_code'].str[0].str.isnumeric()]

# Display the first few rows of the final dataset
final_df.head()


Unnamed: 0,postal_code,luxury_business_count
0,93101,26
1,63123,3
2,85711,6
3,19107,22
4,18054,0


In [6]:

# Save the final dataframe to a CSV file
final_df.to_csv("Cleaned_Datasets/luxury_businesses_by_zip.csv", index=False)
