In [1]:
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
import re 

In [2]:
BASE_DIR = Path().resolve().parent.parent

In [3]:
path = '/Users/ericklopez/desktop/django_gun/empirical/data/processed/starbucks_location_processed3.csv'

In [4]:
df = pd.read_csv(path)

In [5]:
def extract_first_last(name: str) -> tuple[str, str]:
    parts = name.strip().split()
    if len(parts) >= 2:
        first_name = parts[0].title()
        last_name = parts[-1].title()
        return first_name, last_name
    elif len(parts) == 1:
        return parts[0].title(), None
    else:
        return None, None

In [6]:
df[['first_name', 'last_name']] = df['review_author'].apply(extract_first_last).apply(pd.Series)

In [7]:
def parse_subcategory_ratings(s):
    if not isinstance(s, str):
        return pd.Series([None, None, None], index=['food_rating', 'service_rating', 'atmosphere_rating'])
    
    food = re.search(r'Food:(\d+)', s)
    service = re.search(r'Service:(\d+)', s)
    atmosphere = re.search(r'Atmosphere:(\d+)', s)

    return pd.Series([
        int(food.group(1)) if food else None,
        int(service.group(1)) if service else None,
        int(atmosphere.group(1)) if atmosphere else None
    ], index=['food_rating', 'service_rating', 'atmosphere_rating'])

In [8]:
ratings = df['category_ratings'].apply(parse_subcategory_ratings)


In [9]:
df[['food_rating', 'service_rating', 'atmosphere_rating']] = ratings.astype('Int64')

In [10]:
def extract_address_parts(address: str) -> pd.Series:
    pattern = r'^(.*),\s*(.*),\s*([A-Z]{2})\s*(\d{5})$'
    match = re.match(pattern, address.strip()) if isinstance(address, str) else None
    if match:
        street, city, state, zip_code = match.groups()
        return pd.Series({
            'street': street.strip().upper(),
            'city': city.strip().upper(),
            'state': state.strip().upper(),
            'zip': zip_code.strip()
        })
    else:
        return pd.Series({'street': None, 'city': None, 'state': None, 'zip': None})

In [11]:
df[['street', 'city', 'state', 'zip']] = df['business_address'].apply(extract_address_parts)


In [12]:
df.drop(columns= ['review_author', 'business_address', 'category_ratings'],inplace = True)

In [None]:
processed_file_path = "/Users/ericklopez/Desktop/django_gun/empirical/data/final/starbucks_location_final3.csv"


In [None]:
df.to_csv(path_or_buf=processed_file_path)

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,review_date,review_rating,review_content,first_name,last_name,food_rating,service_rating,atmosphere_rating,street,city,state,zip
0,0,2025-01-10,4,"We were traveling from Miami to Tampa, and dur...",Nishant,Narula,4.0,4,4.0,3036 SW MARTIN DOWNS BLVD,PALM CITY,FL,34990
1,1,2025-03-11,1,The only reason why I am leaving a one star re...,Jessica,Morgan,,1,,3036 SW MARTIN DOWNS BLVD,PALM CITY,FL,34990
2,2,2025-02-09,1,"The worst customer service I ever experienced,...",Alvorous,,2.0,1,3.0,3036 SW MARTIN DOWNS BLVD,PALM CITY,FL,34990
3,3,2025-01-10,5,By far the best syarbucks in the area. I go ou...,Diana,Pitts,5.0,5,5.0,3036 SW MARTIN DOWNS BLVD,PALM CITY,FL,34990
4,4,2025-01-10,2,I ordered a Water and a cake pop and it took 2...,Drew,Weil,3.0,2,2.0,3036 SW MARTIN DOWNS BLVD,PALM CITY,FL,34990


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         380 non-null    int64 
 1   review_date        380 non-null    object
 2   review_rating      380 non-null    int64 
 3   review_content     260 non-null    object
 4   first_name         380 non-null    object
 5   last_name          359 non-null    object
 6   food_rating        58 non-null     Int64 
 7   service_rating     73 non-null     Int64 
 8   atmosphere_rating  68 non-null     Int64 
 9   street             380 non-null    object
 10  city               380 non-null    object
 11  state              380 non-null    object
 12  zip                380 non-null    object
dtypes: Int64(3), int64(2), object(8)
memory usage: 39.8+ KB
