### Handle missing/invalid data (drop or impute)

In [1]:
import pandas as pd

df = pd.read_csv("./Dataset/zomato_df_final_data.csv", header=0)
df.head(5)

Unnamed: 0,address,cost,cuisine,lat,link,lng,phone,rating_number,rating_text,subzone,title,type,votes,groupon,color,cost_2,cuisine_color
0,"371A Pitt Street, CBD, Sydney",50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,https://www.zomato.com/sydney/sydney-madang-cbd,151.207605,02 8318 0406,4.0,Very Good,CBD,Sydney Madang,['Casual Dining'],1311.0,False,#e15307,5.243902,#6f706b
1,"Shop 7A, 2 Huntley Street, Alexandria, Sydney",80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,https://www.zomato.com/sydney/the-grounds-of-a...,151.193793,02 9699 2225,4.6,Excellent,"The Grounds of Alexandria, Alexandria",The Grounds of Alexandria Cafe,['Café'],3236.0,False,#9c3203,7.560976,#6f706b
2,"Level G, The Darling at the Star, 80 Pyrmont ...",120.0,['Japanese'],-33.867971,https://www.zomato.com/sydney/sokyo-pyrmont,151.19521,1800 700 700,4.9,Excellent,"The Star, Pyrmont",Sokyo,['Fine Dining'],1227.0,False,#7f2704,10.650407,#6f706b
3,"Sydney Opera House, Bennelong Point, Circular...",270.0,['Modern Australian'],-33.856784,https://www.zomato.com/sydney/bennelong-restau...,151.215297,02 9240 8000,4.9,Excellent,Circular Quay,Bennelong Restaurant,"['Fine Dining', 'Bar']",278.0,False,#7f2704,22.235772,#4186f4
4,"20 Campbell Street, Chinatown, Sydney",55.0,"['Thai', 'Salad']",-33.879035,https://www.zomato.com/sydney/chat-thai-chinatown,151.206409,02 8317 4811,4.5,Excellent,Chinatown,Chat Thai,['Casual Dining'],2150.0,False,#a83703,5.630081,#6f706b


## 1. Feature Engineering
### Handle missing/invalid data (drop or impute)

In [2]:
# Check missing values
missing_count = df.isnull().sum().sort_values(ascending=False)
missing_count

rating_text      3316
rating_number    3316
votes            3316
cost_2            346
cost              346
lat               192
lng               192
type               48
color               0
groupon             0
address             0
title               0
subzone             0
phone               0
link                0
cuisine             0
cuisine_color       0
dtype: int64

In [3]:
import numpy as np

# Ensure numeric dtypes first (coerce bad strings to NaN)
df["rating_number"] = pd.to_numeric(df["rating_number"], errors="coerce")
df["votes"] = pd.to_numeric(df["votes"], errors="coerce")

df["rating_text"] = df["rating_text"].fillna("Unrated")

df["votes"] = (
    df["votes"]
      .fillna(0)
      .clip(lower=0)         # just in case there are negatives
      .astype(int)
)

df["rating_number"] = (
    df["rating_number"]
      .fillna(0.0)           # unknown/unrated
      .astype(float)
)

#print("Unrated rows (flag=True):", int(df["is_unrated"].sum()))
print(df[["rating_text","rating_number","votes"]].tail())
print(df[["rating_text","rating_number","votes"]].isna().sum())  # should be all zeros

      rating_text  rating_number  votes
10495     Unrated            0.0      0
10496     Unrated            0.0      0
10497     Unrated            0.0      0
10498     Unrated            0.0      0
10499     Unrated            0.0      0
rating_text      0
rating_number    0
votes            0
dtype: int64


In [4]:
# Ensure numeric dtype (in case of stray strings)
df["cost"] = pd.to_numeric(df["cost"], errors="coerce")

# Median-by-type imputation with global fallback
group_median = df.groupby("type")["cost"].transform("median")
global_median = df["cost"].median()

df["cost"] = df["cost"].fillna(group_median).fillna(global_median)

if "cost_2" in df.columns:
    df.drop(columns=["cost_2"], inplace=True)

print("Remaining NaNs in cost:", int(df["cost"].isna().sum()))

Remaining NaNs in cost: 0


In [5]:
df = df.dropna(subset=["type"])

In [6]:
df["type"].isna().sum()

0

In [7]:
df["lng"].isna().sum()

192

In [8]:
subzones_both_na = (
    df.loc[df["lat"].isna() & df["lng"].isna(), "subzone"]
      .dropna()
      .unique()
    .tolist()
)

In [9]:
len(subzones_both_na)

100

In [10]:
valid = df.loc[df["lat"].notna() & df["lng"].notna(), ["subzone", "lat", "lng"]]
first_by_subzone = (
    valid.groupby("subzone", as_index=True)[["lat", "lng"]]
         .first()   # first occurrence in current row order
)

# Limit to subzones that have missing coords somewhere and also have a valid example
matching_subzones = sorted(set(subzones_both_na) & set(first_by_subzone.index))

print("Matching subzones (both NA somewhere, but have a valid example elsewhere):", matching_subzones)

Matching subzones (both NA somewhere, but have a valid example elsewhere): ['Avalon', 'Bangor Shopping Centre, Bangor', 'Bankstown', 'Barangaroo', 'Belmore', 'Berala', 'Beverly Hills', 'Bondi Beach', 'Bondi Junction', 'Bonnyrigg', 'Brookvale', 'Burwood', 'CBD', 'Cabramatta', 'Camden', 'Campbelltown', 'Canley Vale', 'Carlingford', 'Castle Hill', 'Castle Towers, Castle Hill', 'Casula', 'Central Station, Chinatown', 'Chatswood', 'Chester Hill', 'Chinatown', 'Chippendale', 'Circular Quay', 'Coogee', 'Cosmopolitan Centre, Double Bay', 'Cronulla', 'Crows Nest', 'Danks Street Shopping Plaza, Waterloo', 'Darling Harbour', 'Darlinghurst', 'Darlington', 'Dee Why', 'Erskineville', 'Five Dock', 'Frenchs Forest', 'GPO Building, Gordon', 'Girraween', 'Glebe', 'Gordon', 'Grace Hotel ,CBD', 'Greystanes', 'Holsworthy', 'Hotel Steyne, Manly', 'Ingleburn', 'Kareela', 'Kensington', 'Kensington Street, Chippendale', 'Leura', 'Lithgow', 'Liverpool', 'Lugarno', 'MLC Building, North Sydney', 'MLC Tower, CBD',

In [11]:
print(first_by_subzone.loc[matching_subzones].head())


                                      lat         lng
subzone                                              
Avalon                         -33.634904  151.312135
Bangor Shopping Centre, Bangor -34.018483  151.030061
Bankstown                      -33.919187  151.032733
Barangaroo                     -33.864908  151.201519
Belmore                        -33.918575  151.087607


In [12]:
print(first_by_subzone.loc[matching_subzones].head())

# Create simple dict maps for fast filling
lat_map = first_by_subzone["lat"].to_dict()
lng_map = first_by_subzone["lng"].to_dict()

# Impute ONLY rows where both coords are NaN, and only if the subzone has a mapped value
mask_both_na = df["lat"].isna() & df["lng"].isna()
df.loc[mask_both_na, "lat"] = df.loc[mask_both_na, "subzone"].map(lat_map)
df.loc[mask_both_na, "lng"] = df.loc[mask_both_na, "subzone"].map(lng_map)

# Check result
print("Remaining NaNs after fill (lat, lng):",
      int(df["lat"].isna().sum()), int(df["lng"].isna().sum()))

                                      lat         lng
subzone                                              
Avalon                         -33.634904  151.312135
Bangor Shopping Centre, Bangor -34.018483  151.030061
Bankstown                      -33.919187  151.032733
Barangaroo                     -33.864908  151.201519
Belmore                        -33.918575  151.087607
Remaining NaNs after fill (lat, lng): 21 21


In [13]:
df = df.dropna(subset=["lat"])

In [14]:
df["lng"].isna().sum()

0

In [15]:
df.isnull().sum().sort_values(ascending=False)

address          0
cost             0
cuisine          0
lat              0
link             0
lng              0
phone            0
rating_number    0
rating_text      0
subzone          0
title            0
type             0
votes            0
groupon          0
color            0
cuisine_color    0
dtype: int64

In [16]:
# How many duplicate rows
df.duplicated().sum()


1

In [17]:
dups_all = df[df.duplicated(keep=False)]
dups_all.head(20)

Unnamed: 0,address,cost,cuisine,lat,link,lng,phone,rating_number,rating_text,subzone,title,type,votes,groupon,color,cuisine_color
599,"Level 3, Westfield Chatswood, 1 Anderson Stre...",45.0,['Burger'],-33.796892,https://www.zomato.com/sydney/royal-stacks-cha...,151.184111,02 9419 2354,3.7,Good,"Westfield Chatswood, Chatswood",Royal Stacks,['Fast Food'],94,False,#f26d17,#6f706b
600,"Level 3, Westfield Chatswood, 1 Anderson Stre...",45.0,['Burger'],-33.796892,https://www.zomato.com/sydney/royal-stacks-cha...,151.184111,02 9419 2354,3.7,Good,"Westfield Chatswood, Chatswood",Royal Stacks,['Fast Food'],94,False,#f26d17,#6f706b


In [18]:
df = df.drop_duplicates()

### Encode categorical features properly (Label Encoding, One-Hot, etc.).

In [19]:
cat_cols = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
X_cat = df[cat_cols].copy()
cat_cols

['address',
 'cuisine',
 'link',
 'phone',
 'rating_text',
 'subzone',
 'title',
 'type',
 'groupon',
 'color',
 'cuisine_color']

In [20]:
# Drop a specific list of columns 
cols_to_drop = ["address", "link", "phone", "title", "color", "cuisine_color"]
df = df.drop(columns=cols_to_drop, errors="ignore")

In [21]:
df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

['cuisine', 'rating_text', 'subzone', 'type', 'groupon']

In [22]:
# One-hot (multi-hot) encode 'cuisine' with Top-K + "Other"

import pandas as pd

# Clean to a normalized, comma-separated string of cuisines 
cuis = (
    df["cuisine"].fillna("").astype(str)
      .str.replace(r"[\[\]\u2018\u2019\u201C\u201D'\"']", "", regex=True)  # remove quotes/brackets
      .str.lower()
      .str.replace(r"\s+", " ", regex=True)  # collapse spaces
      .str.strip()
)

# Split/ strip tokens -> dedupe per row -> rejoin (so get_dummies works cleanly)
cuis_list = cuis.str.split(",")
cuis_list = cuis_list.apply(lambda lst: [t.strip() for t in lst if t and t.strip()])
cuis_norm = cuis_list.apply(lambda lst: ",".join(sorted(set(lst))))  # unique & sorted tokens per row

In [23]:
cuis_norm

0         bbq,hot pot,korean,korean bbq
1        cafe,coffee and tea,poké,salad
2                              japanese
3                     modern australian
4                            salad,thai
                      ...              
10495                           chinese
10496                         beverages
10497                             sushi
10498                             sushi
10499                              thai
Name: cuisine, Length: 10430, dtype: object

In [24]:
dummies_full = cuis_norm.str.get_dummies(sep=",")
K = 30  
topK_cols = dummies_full.sum().sort_values(ascending=False).head(K).index.tolist()

topK_feats = dummies_full[topK_cols].copy().add_prefix("cuisine__")

# "Other" = has at least one cuisine token outside Top-K
other_flag = (dummies_full.drop(columns=topK_cols, errors="ignore").sum(axis=1) > 0).astype(int)
topK_feats["cuisine__Other"] = other_flag

# oin back to df 
df = pd.concat([df, topK_feats], axis=1) 

# check
print("Created cuisine feature columns:", [c for c in df.columns if c.startswith("cuisine__")][:10], "...")
print("Total cuisine feature columns:", len([c for c in df.columns if c.startswith("cuisine__")]))


Created cuisine feature columns: ['cuisine__cafe', 'cuisine__modern australian', 'cuisine__chinese', 'cuisine__italian', 'cuisine__pizza', 'cuisine__thai', 'cuisine__japanese', 'cuisine__asian', 'cuisine__burger', 'cuisine__indian'] ...
Total cuisine feature columns: 31


In [25]:
# df.drop(columns=["cuisine"], inplace=True) 
df.head()

Unnamed: 0,cost,cuisine,lat,lng,rating_number,rating_text,subzone,type,votes,groupon,...,cuisine__fast food,cuisine__vegetarian,cuisine__korean,cuisine__mediterranean,cuisine__tapas,cuisine__lebanese,cuisine__middle eastern,cuisine__malaysian,cuisine__turkish,cuisine__Other
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,151.207605,4.0,Very Good,CBD,['Casual Dining'],1311,False,...,0,0,1,0,0,0,0,0,0,1
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,151.193793,4.6,Excellent,"The Grounds of Alexandria, Alexandria",['Café'],3236,False,...,0,0,0,0,0,0,0,0,0,1
2,120.0,['Japanese'],-33.867971,151.19521,4.9,Excellent,"The Star, Pyrmont",['Fine Dining'],1227,False,...,0,0,0,0,0,0,0,0,0,0
3,270.0,['Modern Australian'],-33.856784,151.215297,4.9,Excellent,Circular Quay,"['Fine Dining', 'Bar']",278,False,...,0,0,0,0,0,0,0,0,0,0
4,55.0,"['Thai', 'Salad']",-33.879035,151.206409,4.5,Excellent,Chinatown,['Casual Dining'],2150,False,...,0,0,0,0,0,0,0,0,0,1


In [26]:
# One numeric feature reflecting how common the subzone is
freq = df["subzone"].value_counts()
df["subzone_freq"] = df["subzone"].map(freq).fillna(0).astype(int)

df["subzone_freq_norm"] = df["subzone_freq"] / len(df)


In [27]:
#df.drop(columns=["subzone", "subzone_freq"], inplace=True) 
df.head()

Unnamed: 0,cost,cuisine,lat,lng,rating_number,rating_text,subzone,type,votes,groupon,...,cuisine__korean,cuisine__mediterranean,cuisine__tapas,cuisine__lebanese,cuisine__middle eastern,cuisine__malaysian,cuisine__turkish,cuisine__Other,subzone_freq,subzone_freq_norm
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,151.207605,4.0,Very Good,CBD,['Casual Dining'],1311,False,...,1,0,0,0,0,0,0,1,475,0.045542
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,151.193793,4.6,Excellent,"The Grounds of Alexandria, Alexandria",['Café'],3236,False,...,0,0,0,0,0,0,0,1,2,0.000192
2,120.0,['Japanese'],-33.867971,151.19521,4.9,Excellent,"The Star, Pyrmont",['Fine Dining'],1227,False,...,0,0,0,0,0,0,0,0,17,0.00163
3,270.0,['Modern Australian'],-33.856784,151.215297,4.9,Excellent,Circular Quay,"['Fine Dining', 'Bar']",278,False,...,0,0,0,0,0,0,0,0,40,0.003835
4,55.0,"['Thai', 'Salad']",-33.879035,151.206409,4.5,Excellent,Chinatown,['Casual Dining'],2150,False,...,0,0,0,0,0,0,0,1,173,0.016587


In [28]:
df["groupon"] = (
    df["groupon"]
      .replace({True: 1, False: 0, "True": 1, "False": 0})
      .pipe(pd.to_numeric, errors="coerce")
      .fillna(0)
      .astype(int)
)
df.head()

  .replace({True: 1, False: 0, "True": 1, "False": 0})


Unnamed: 0,cost,cuisine,lat,lng,rating_number,rating_text,subzone,type,votes,groupon,...,cuisine__korean,cuisine__mediterranean,cuisine__tapas,cuisine__lebanese,cuisine__middle eastern,cuisine__malaysian,cuisine__turkish,cuisine__Other,subzone_freq,subzone_freq_norm
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,151.207605,4.0,Very Good,CBD,['Casual Dining'],1311,0,...,1,0,0,0,0,0,0,1,475,0.045542
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,151.193793,4.6,Excellent,"The Grounds of Alexandria, Alexandria",['Café'],3236,0,...,0,0,0,0,0,0,0,1,2,0.000192
2,120.0,['Japanese'],-33.867971,151.19521,4.9,Excellent,"The Star, Pyrmont",['Fine Dining'],1227,0,...,0,0,0,0,0,0,0,0,17,0.00163
3,270.0,['Modern Australian'],-33.856784,151.215297,4.9,Excellent,Circular Quay,"['Fine Dining', 'Bar']",278,0,...,0,0,0,0,0,0,0,0,40,0.003835
4,55.0,"['Thai', 'Salad']",-33.879035,151.206409,4.5,Excellent,Chinatown,['Casual Dining'],2150,0,...,0,0,0,0,0,0,0,1,173,0.016587


In [29]:
# Clean to comma-separated tokens from strings like "['Fine Dining', 'Bar']" or "Unknown"
type_str = (
    df["type"].astype(str)
      .str.replace(r"^\s*\[\s*", "", regex=True)   # drop leading '['
      .str.replace(r"\s*\]\s*$", "", regex=True)   # drop trailing ']'
      .str.replace(r"[\"“”‘’']", "", regex=True)   # remove quotes
      .str.replace(r"\s*,\s*", ",", regex=True)    # normalize commas
      .str.replace(r"\s+", " ", regex=True)        # collapse spaces
      .str.strip()
)

# Tokenize -> strip -> title-case -> drop empties
type_tokens = type_str.str.split(",").apply(
    lambda lst: [t.strip().title() for t in lst if isinstance(t, str) and t.strip()]
)

# Ensure at least one token (e.g., if it was "[]")
empty_mask = type_tokens.apply(len).eq(0)
if empty_mask.any():
    type_tokens.loc[empty_mask] = [["Unknown"]] * int(empty_mask.sum())

# Single-category variant (first token)
df["type_primary"] = type_tokens.str[0]

# Multi-hot one-hot across all tokens in the row
type_join = type_tokens.apply(lambda lst: ",".join(sorted(set(lst))))
type_dummies = type_join.str.get_dummies(sep=",").add_prefix("type__")
df = pd.concat([df, type_dummies], axis=1)

In [30]:
#df.drop(columns=["type"], inplace=True) 
df.head()

Unnamed: 0,cost,cuisine,lat,lng,rating_number,rating_text,subzone,type,votes,groupon,...,type__Casual Dining,type__Club,type__Dessert Parlour,type__Fast Food,type__Fine Dining,type__Food Court,type__Food Stall,type__Food Truck,type__Pub,type__Wine Bar
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,151.207605,4.0,Very Good,CBD,['Casual Dining'],1311,0,...,1,0,0,0,0,0,0,0,0,0
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,151.193793,4.6,Excellent,"The Grounds of Alexandria, Alexandria",['Café'],3236,0,...,0,0,0,0,0,0,0,0,0,0
2,120.0,['Japanese'],-33.867971,151.19521,4.9,Excellent,"The Star, Pyrmont",['Fine Dining'],1227,0,...,0,0,0,0,1,0,0,0,0,0
3,270.0,['Modern Australian'],-33.856784,151.215297,4.9,Excellent,Circular Quay,"['Fine Dining', 'Bar']",278,0,...,0,0,0,0,1,0,0,0,0,0
4,55.0,"['Thai', 'Salad']",-33.879035,151.206409,4.5,Excellent,Chinatown,['Casual Dining'],2150,0,...,1,0,0,0,0,0,0,0,0,0


### Create useful features (e.g., cuisine diversity, cost bins).
#### 1) Cuisine features (diversity per restaurant, primary cuisine, subzone diversity)

In [31]:
# Per-restaurant features
df["cuisine_count"] = cuis_list.apply(len)                          # diversity (# cuisines listed)
df["cuisine_primary"] = cuis_list.apply(lambda lst: (lst[0] if lst else "unknown")).str.title()

# Subzone-level cuisine diversity (unique cuisines per subzone)
tok = pd.DataFrame({"subzone": df["subzone"], "cuisine_list": cuis_list})
tok = tok.explode("cuisine_list")
tok["cuisine_list"] = tok["cuisine_list"].fillna("").astype(str).str.strip()
tok = tok[tok["cuisine_list"] != ""]

subzone_cuisine_div = (
    tok.groupby("subzone")["cuisine_list"].nunique().rename("subzone_cuisine_unique")
)
df = df.merge(subzone_cuisine_div, on="subzone", how="left")


In [32]:
df.head()

Unnamed: 0,cost,cuisine,lat,lng,rating_number,rating_text,subzone,type,votes,groupon,...,type__Fast Food,type__Fine Dining,type__Food Court,type__Food Stall,type__Food Truck,type__Pub,type__Wine Bar,cuisine_count,cuisine_primary,subzone_cuisine_unique
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,151.207605,4.0,Very Good,CBD,['Casual Dining'],1311,0,...,0,0,0,0,0,0,0,4,Hot Pot,73
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,151.193793,4.6,Excellent,"The Grounds of Alexandria, Alexandria",['Café'],3236,0,...,0,0,0,0,0,0,0,4,Cafe,5
2,120.0,['Japanese'],-33.867971,151.19521,4.9,Excellent,"The Star, Pyrmont",['Fine Dining'],1227,0,...,0,1,0,0,0,0,0,1,Japanese,19
3,270.0,['Modern Australian'],-33.856784,151.215297,4.9,Excellent,Circular Quay,"['Fine Dining', 'Bar']",278,0,...,0,1,0,0,0,0,0,1,Modern Australian,28
4,55.0,"['Thai', 'Salad']",-33.879035,151.206409,4.5,Excellent,Chinatown,['Casual Dining'],2150,0,...,0,0,0,0,0,0,0,2,Thai,53


#### 2) Cost features (bins, log transform, price levels)

In [33]:
# Ensure numeric
df["cost"] = pd.to_numeric(df["cost"], errors="coerce")

# Cost bins (choose edges that match your EDA)
cost_edges = [0, 20, 40, 60, 80, 120, 200, np.inf]
cost_labels = ["0–20", "20–40", "40–60", "60–80", "80–120", "120–200", "200+"]

df["cost_bin"] = pd.cut(df["cost"], bins=cost_edges, labels=cost_labels, right=False, include_lowest=True)

# Compact price levels (3 buckets)
price_edges = [0, 40, 80, np.inf]
price_labels = ["Budget", "Mid", "Premium"]
df["price_level"] = pd.cut(df["cost"], bins=price_edges, labels=price_labels, right=False, include_lowest=True)

# Log transform (helps with skew)
df["log_cost"] = np.log1p(df["cost"])

In [34]:
df.head()

Unnamed: 0,cost,cuisine,lat,lng,rating_number,rating_text,subzone,type,votes,groupon,...,type__Food Stall,type__Food Truck,type__Pub,type__Wine Bar,cuisine_count,cuisine_primary,subzone_cuisine_unique,cost_bin,price_level,log_cost
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,151.207605,4.0,Very Good,CBD,['Casual Dining'],1311,0,...,0,0,0,0,4,Hot Pot,73,40–60,Mid,3.931826
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,151.193793,4.6,Excellent,"The Grounds of Alexandria, Alexandria",['Café'],3236,0,...,0,0,0,0,4,Cafe,5,80–120,Premium,4.394449
2,120.0,['Japanese'],-33.867971,151.19521,4.9,Excellent,"The Star, Pyrmont",['Fine Dining'],1227,0,...,0,0,0,0,1,Japanese,19,120–200,Premium,4.795791
3,270.0,['Modern Australian'],-33.856784,151.215297,4.9,Excellent,Circular Quay,"['Fine Dining', 'Bar']",278,0,...,0,0,0,0,1,Modern Australian,28,200+,Premium,5.602119
4,55.0,"['Thai', 'Salad']",-33.879035,151.206409,4.5,Excellent,Chinatown,['Casual Dining'],2150,0,...,0,0,0,0,2,Thai,53,40–60,Mid,4.025352


#### 3) Votes features (skew handling + bins)

In [35]:
df["votes"] = pd.to_numeric(df["votes"], errors="coerce").fillna(0).clip(lower=0).astype(int)
df["log_votes"] = np.log1p(df["votes"])

vote_edges = [0, 10, 50, 200, 1000, np.inf]
vote_labels = ["0–9", "10–49", "50–199", "200–999", "1000+"]
df["votes_bin"] = pd.cut(df["votes"], bins=vote_edges, labels=vote_labels, right=False, include_lowest=True)

#### 4) Type features (count of types listed; keep if you already cleaned type)

In [36]:
type_str = (
    df["type"].astype(str)
      .str.replace(r"^\s*\[\s*", "", regex=True)
      .str.replace(r"\s*\]\s*$", "", regex=True)
      .str.replace(r"[\"“”‘’']", "", regex=True)
      .str.replace(r"\s*,\s*", ",", regex=True)
      .str.strip()
)
type_tokens = type_str.str.split(",").apply(lambda lst: [t.strip() for t in lst if t and t.strip()])
df["type_count"] = type_tokens.apply(len).astype(int)   # e.g., ['Fine Dining','Bar'] -> 2


In [37]:
df.head()

Unnamed: 0,cost,cuisine,lat,lng,rating_number,rating_text,subzone,type,votes,groupon,...,type__Wine Bar,cuisine_count,cuisine_primary,subzone_cuisine_unique,cost_bin,price_level,log_cost,log_votes,votes_bin,type_count
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,151.207605,4.0,Very Good,CBD,['Casual Dining'],1311,0,...,0,4,Hot Pot,73,40–60,Mid,3.931826,7.179308,1000+,1
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,151.193793,4.6,Excellent,"The Grounds of Alexandria, Alexandria",['Café'],3236,0,...,0,4,Cafe,5,80–120,Premium,4.394449,8.082402,1000+,1
2,120.0,['Japanese'],-33.867971,151.19521,4.9,Excellent,"The Star, Pyrmont",['Fine Dining'],1227,0,...,0,1,Japanese,19,120–200,Premium,4.795791,7.113142,1000+,1
3,270.0,['Modern Australian'],-33.856784,151.215297,4.9,Excellent,Circular Quay,"['Fine Dining', 'Bar']",278,0,...,0,1,Modern Australian,28,200+,Premium,5.602119,5.631212,200–999,2
4,55.0,"['Thai', 'Salad']",-33.879035,151.206409,4.5,Excellent,Chinatown,['Casual Dining'],2150,0,...,0,2,Thai,53,40–60,Mid,4.025352,7.673688,1000+,1


#### 5) Geography features (distance to CBD in km)

In [38]:
# Requires lat/lng present (you already imputed/validated earlier)
# Sydney CBD reference
CBD_LAT, CBD_LNG = -33.8688, 151.2093

lat1 = np.radians(df["lat"].astype(float))
lng1 = np.radians(df["lng"].astype(float))
lat2 = np.radians(CBD_LAT)
lng2 = np.radians(CBD_LNG)

dlat = lat1 - lat2
dlng = lng1 - lng2
a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlng/2)**2
df["dist_cbd_km"] = 6371 * 2 * np.arcsin(np.sqrt(a))   # Haversine distance

### 6) Subzone aggregate signals (density, typical price)

In [39]:
# Restaurant density per subzone
subzone_counts = df.groupby("subzone").size().rename("subzone_rest_count")
# Typical price (median) per subzone
subzone_cost_median = df.groupby("subzone")["cost"].median().rename("subzone_cost_median")

df = df.merge(subzone_counts, on="subzone", how="left")
df = df.merge(subzone_cost_median, on="subzone", how="left")

In [40]:
df.head()

Unnamed: 0,cost,cuisine,lat,lng,rating_number,rating_text,subzone,type,votes,groupon,...,subzone_cuisine_unique,cost_bin,price_level,log_cost,log_votes,votes_bin,type_count,dist_cbd_km,subzone_rest_count,subzone_cost_median
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,151.207605,4.0,Very Good,CBD,['Casual Dining'],1311,0,...,73,40–60,Mid,3.931826,7.179308,1000+,1,0.822155,475,45.0
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,151.193793,4.6,Excellent,"The Grounds of Alexandria, Alexandria",['Café'],3236,0,...,5,80–120,Premium,4.394449,8.082402,1000+,1,4.905778,2,72.5
2,120.0,['Japanese'],-33.867971,151.19521,4.9,Excellent,"The Star, Pyrmont",['Fine Dining'],1227,0,...,19,120–200,Premium,4.795791,7.113142,1000+,1,1.304138,17,65.0
3,270.0,['Modern Australian'],-33.856784,151.215297,4.9,Excellent,Circular Quay,"['Fine Dining', 'Bar']",278,0,...,28,200+,Premium,5.602119,5.631212,200–999,2,1.446261,40,80.0
4,55.0,"['Thai', 'Salad']",-33.879035,151.206409,4.5,Excellent,Chinatown,['Casual Dining'],2150,0,...,53,40–60,Mid,4.025352,7.673688,1000+,1,1.168966,173,50.0


In [41]:
list(df.columns)

['cost',
 'cuisine',
 'lat',
 'lng',
 'rating_number',
 'rating_text',
 'subzone',
 'type',
 'votes',
 'groupon',
 'cuisine__cafe',
 'cuisine__modern australian',
 'cuisine__chinese',
 'cuisine__italian',
 'cuisine__pizza',
 'cuisine__thai',
 'cuisine__japanese',
 'cuisine__asian',
 'cuisine__burger',
 'cuisine__indian',
 'cuisine__seafood',
 'cuisine__vietnamese',
 'cuisine__pub food',
 'cuisine__sushi',
 'cuisine__bar food',
 'cuisine__sandwich',
 'cuisine__coffee and tea',
 'cuisine__australian',
 'cuisine__bakery',
 'cuisine__healthy food',
 'cuisine__desserts',
 'cuisine__fast food',
 'cuisine__vegetarian',
 'cuisine__korean',
 'cuisine__mediterranean',
 'cuisine__tapas',
 'cuisine__lebanese',
 'cuisine__middle eastern',
 'cuisine__malaysian',
 'cuisine__turkish',
 'cuisine__Other',
 'subzone_freq',
 'subzone_freq_norm',
 'type_primary',
 'type__Bakery',
 'type__Bar',
 'type__Beverage Shop',
 'type__Café',
 'type__Casual Dining',
 'type__Club',
 'type__Dessert Parlour',
 'type__Fa

In [42]:
# Drop raw or redundant columns; keep engineered features
cols_to_drop = [
    # raw text / IDs
    "cuisine", "subzone", "type",
    # raw geocoords (dist_cbd_km instead)
    "lat", "lng",
    # raw + binned versions where we keep logs
    "cost", "cost_bin", "price_level",
    "votes", "votes_bin",
    # duplicate encodings
    "subzone_freq_norm",   # keep 'subzone_freq'
    "cuisine_primary",     # keep cuisine multi-hot + cuisine_count
    "type_primary"         # keep type__* + type_count
]

df_feat = df.drop(columns=cols_to_drop, errors="ignore").copy()

print("Final feature columns:", len(df_feat.columns))
print(df_feat.columns.tolist())


Final feature columns: 57
['rating_number', 'rating_text', 'groupon', 'cuisine__cafe', 'cuisine__modern australian', 'cuisine__chinese', 'cuisine__italian', 'cuisine__pizza', 'cuisine__thai', 'cuisine__japanese', 'cuisine__asian', 'cuisine__burger', 'cuisine__indian', 'cuisine__seafood', 'cuisine__vietnamese', 'cuisine__pub food', 'cuisine__sushi', 'cuisine__bar food', 'cuisine__sandwich', 'cuisine__coffee and tea', 'cuisine__australian', 'cuisine__bakery', 'cuisine__healthy food', 'cuisine__desserts', 'cuisine__fast food', 'cuisine__vegetarian', 'cuisine__korean', 'cuisine__mediterranean', 'cuisine__tapas', 'cuisine__lebanese', 'cuisine__middle eastern', 'cuisine__malaysian', 'cuisine__turkish', 'cuisine__Other', 'subzone_freq', 'type__Bakery', 'type__Bar', 'type__Beverage Shop', 'type__Café', 'type__Casual Dining', 'type__Club', 'type__Dessert Parlour', 'type__Fast Food', 'type__Fine Dining', 'type__Food Court', 'type__Food Stall', 'type__Food Truck', 'type__Pub', 'type__Wine Bar', '

## 2. Regression Models

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [44]:
X = df_feat.drop(['rating_number'], axis=1)
y = df_feat['rating_number']
# Keep only numeric features
X = X.select_dtypes(include=[np.number])
print("Final shapes -> X:", X.shape, " y:", y.shape)

Final shapes -> X: (10430, 55)  y: (10430,)


In [45]:
# Train/test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

In [46]:
import time
# Linear Regression (scikit-learn)
t0 = time.time()
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
t1 = time.time()
print(f"Model A (LinearRegression) MSE: {mse_lr:.4f}")
print(f"(time taken: {t1 - t0:.2f}s)")

Model A (LinearRegression) MSE: 0.3737
(time taken: 0.05s)


In [47]:
# Linear Regression via Gradient Descent

# Standardize X and y (helps GD converge)
scaler_X = StandardScaler(with_mean=True, with_std=True)
scaler_y = StandardScaler(with_mean=True, with_std=True)

X_train_std = scaler_X.fit_transform(X_train)
X_test_std  = scaler_X.transform(X_test)

y_train_std = scaler_y.fit_transform(y_train.reshape(-1,1)).ravel()

# Add bias term (column of ones)
Xb_train = np.hstack([np.ones((X_train_std.shape[0], 1)), X_train_std])
Xb_test  = np.hstack([np.ones((X_test_std.shape[0], 1)),  X_test_std])

# Initialize weights
rng = np.random.default_rng(42)
w = np.zeros(Xb_train.shape[1])  # [bias, w1, ..., wd]

# Hyperparameters
alpha = 0.05      # learning rate
epochs = 2000     # iterations

# Gradient descent loop
m = Xb_train.shape[0]
for epoch in range(epochs):
    yhat = Xb_train @ w
    err = yhat - y_train_std
    grad = (Xb_train.T @ err) / m
    w -= alpha * grad

# Predict on test (invert the standardization of y)
y_pred_std = Xb_test @ w
y_pred_gd = scaler_y.inverse_transform(y_pred_std.reshape(-1,1)).ravel()

mse_gd = mean_squared_error(y_test, y_pred_gd)
print(f"Model B (Gradient Descent) MSE: {mse_gd:.4f}")


Model B (Gradient Descent) MSE: 0.3737


## 3. Classification Models

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [49]:
# Clean label text
df_feat["rating_text_clean"] = (
    df_feat["rating_text"].astype(str)
      .str.replace(r"[\[\]\u2018\u2019\u201C\u201D'\"']", "", regex=True)
      .str.strip().str.replace(r"\s+", " ", regex=True)
      .str.title().replace({"Not Rated": "Unrated"})
)

In [50]:
df_feat["rating_text_clean"]

0        Very Good
1        Excellent
2        Excellent
3        Excellent
4        Excellent
           ...    
10425      Unrated
10426      Unrated
10427      Unrated
10428      Unrated
10429      Unrated
Name: rating_text_clean, Length: 10430, dtype: object

In [51]:
mask_not_rated = (
    df_feat["rating_text"].astype(str)
      .str.replace(r"[\[\]\u2018\u2019\u201C\u201D'\"']", "", regex=True)  # strip quotes/brackets
      .str.strip().str.replace(r"\s+", " ", regex=True)
      .str.lower()
      .isin(["not rated", "unrated"])
)

# Only 'not rated' rows
df_not_rated = df_feat[mask_not_rated].copy()
df_rated = df_feat[~mask_not_rated].copy()

In [52]:
# Map to binary classes; drop Unrated/others
class1 = {"Poor", "Average"}
class2 = {"Good", "Very Good", "Excellent"}

mask_keep = df_rated["rating_text_clean"].isin(class1 | class2)
df_bin = df_rated.loc[mask_keep].copy()

y = df_bin["rating_text_clean"].isin(class2).astype(int)  # Class2=1, Class1=0


In [53]:
X = df_bin.select_dtypes(include=[np.number]).copy()

In [54]:
leak_cols = ["rating_number", "rating_text_ord"]  # add others if present
X = X.drop(columns=[c for c in leak_cols if c in X.columns], errors="ignore").select_dtypes(include=[np.number])

In [55]:
X.columns

Index(['groupon', 'cuisine__cafe', 'cuisine__modern australian',
       'cuisine__chinese', 'cuisine__italian', 'cuisine__pizza',
       'cuisine__thai', 'cuisine__japanese', 'cuisine__asian',
       'cuisine__burger', 'cuisine__indian', 'cuisine__seafood',
       'cuisine__vietnamese', 'cuisine__pub food', 'cuisine__sushi',
       'cuisine__bar food', 'cuisine__sandwich', 'cuisine__coffee and tea',
       'cuisine__australian', 'cuisine__bakery', 'cuisine__healthy food',
       'cuisine__desserts', 'cuisine__fast food', 'cuisine__vegetarian',
       'cuisine__korean', 'cuisine__mediterranean', 'cuisine__tapas',
       'cuisine__lebanese', 'cuisine__middle eastern', 'cuisine__malaysian',
       'cuisine__turkish', 'cuisine__Other', 'subzone_freq', 'type__Bakery',
       'type__Bar', 'type__Beverage Shop', 'type__Café', 'type__Casual Dining',
       'type__Club', 'type__Dessert Parlour', 'type__Fast Food',
       'type__Fine Dining', 'type__Food Court', 'type__Food Stall',
       'type_

In [56]:
# Remove rows with missing features just in case
mask_nan = X.isna().any(axis=1)
if mask_nan.any():
    X = X.loc[~mask_nan]
    y = y.loc[X.index]

print("Shapes after filtering -> X:", X.shape, " y:", y.shape, "  (positive rate =", y.mean().round(3), ")")

Shapes after filtering -> X: (7148, 55)  y: (7148,)   (positive rate = 0.345 )


In [57]:
# Train / Test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define models
models = {
    "LogisticRegression": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000, random_state=42))
    ]),
    "RandomForest": RandomForestClassifier(
        n_estimators=300, max_depth=None, random_state=42, n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "SVM_RBF": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel="rbf", probability=False, random_state=42))
    ])
}

# rain, predict, evaluate
rows = []
cms = {}

for name, model in models.items():
    t0 = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Metrics (binary: positive class = 1)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    cm   = confusion_matrix(y_test, y_pred, labels=[0,1])
    t1 = time.time()

    rows.append({
        "model": name,
        "precision": round(prec, 4),
        "recall": round(rec, 4),
        "f1": round(f1, 4),
        "time taken":t1 - t0
    })
    cms[name] = pd.DataFrame(cm, index=["Actual_0","Actual_1"], columns=["Pred_0","Pred_1"])

# Results table
results_cls = pd.DataFrame(rows).sort_values("f1", ascending=False).reset_index(drop=True)
print("\nClassification results (sorted by F1):")
print(results_cls)

# Confusion matrices (print one-by-one; example shows Logistic Regression)
print("\nConfusion Matrix — Logistic Regression:")
print(cms["LogisticRegression"])

# To see others:
print("\nConfusion Matrix — Random Forest:\n", cms["RandomForest"])
print("\nConfusion Matrix — Gradient Boosting:\n", cms["GradientBoosting"])
print("\nConfusion Matrix — SVM_RBF:\n", cms["SVM_RBF"])


Classification results (sorted by F1):
                model  precision  recall      f1  time taken
0    GradientBoosting     0.7922  0.8195  0.8056    1.163146
1        RandomForest     0.8108  0.7911  0.8008    1.117154
2  LogisticRegression     0.8174  0.7809  0.7988    0.075113
3             SVM_RBF     0.7832  0.7546  0.7686    1.327969

Confusion Matrix — Logistic Regression:
          Pred_0  Pred_1
Actual_0     851      86
Actual_1     108     385

Confusion Matrix — Random Forest:
           Pred_0  Pred_1
Actual_0     846      91
Actual_1     103     390

Confusion Matrix — Gradient Boosting:
           Pred_0  Pred_1
Actual_0     831     106
Actual_1      89     404

Confusion Matrix — SVM_RBF:
           Pred_0  Pred_1
Actual_0     834     103
Actual_1     121     372


### PySpark

In [58]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/29 13:58:38 WARN Utils: Your hostname, Farihas-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.33.90.205 instead (on interface en0)
25/09/29 13:58:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/29 13:58:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [59]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [60]:
X = df_feat.copy()
y_reg = pd.to_numeric(df["rating_number"], errors="coerce")
y_cls = df_bin["rating_text_clean"].isin(class2).astype(int)

In [62]:
import re
# Assemble one pandas frame
df_model = X.copy()
df_model["rating_number"] = y_reg
df_model["label"] = y_cls  # for classification

# Keep only rows that have the needed targets
df_reg = df_model.dropna(subset=["rating_number"]).copy()
df_cls = df_model.dropna(subset=["label"]).copy()

# 1) Sanitize column names (no spaces/specials for Spark)
def sanitize_cols(cols):
    out = []
    for c in cols:
        c2 = re.sub(r"[^A-Za-z0-9_]", "_", c)   # non-alnum -> _
        c2 = re.sub(r"_+", "_", c2).strip("_")  # collapse underscores
        if c2 == "":
            c2 = "col"
        out.append(c2)
    return out

reg_cols_old = df_reg.columns.tolist()
reg_cols_new = sanitize_cols(reg_cols_old)
df_reg.columns = reg_cols_new

cls_cols_old = df_cls.columns.tolist()
cls_cols_new = sanitize_cols(cls_cols_old)
df_cls.columns = cls_cols_new

# Identify final feature columns = all numeric except targets
reg_feature_cols = df_reg.select_dtypes(include=[np.number]).columns.difference(["rating_number","label"]).tolist()
cls_feature_cols = df_cls.select_dtypes(include=[np.number]).columns.difference(["rating_number","label"]).tolist()

# Convert to Spark
sdf_reg = spark.createDataFrame(df_reg)
sdf_cls = spark.createDataFrame(df_cls)

print("Regression features:", len(reg_feature_cols))
print("Classification features:", len(cls_feature_cols))

Regression features: 55
Classification features: 55


#### Regression in PySpark (Linear Regression)

In [63]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
import time

# Assembler -> (optional) scaler -> LinearRegression
reg_assembler = VectorAssembler(inputCols=reg_feature_cols, outputCol="features_raw")
reg_scaler = StandardScaler(inputCol="features_raw", outputCol="features", withMean=True, withStd=True)
reg_model = LinearRegression(featuresCol="features", labelCol="rating_number", maxIter=200)

reg_pipeline = Pipeline(stages=[reg_assembler, reg_scaler, reg_model])

# Split 80/20
train_reg, test_reg = sdf_reg.randomSplit([0.8, 0.2], seed=42)

t0 = time.time()
reg_fit = reg_pipeline.fit(train_reg)
pred_reg = reg_fit.transform(test_reg)
t1 = time.time()
# MSE
reg_eval = RegressionEvaluator(labelCol="rating_number", predictionCol="prediction", metricName="mse")
mse_reg_spark = reg_eval.evaluate(pred_reg)

print(f"[PySpark] Regression MSE: {mse_reg_spark:.4f}")
print(f"(time: {t1 - t0:.2f}s)")


25/09/29 14:00:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/09/29 14:00:54 WARN Instrumentation: [ec4aac6a] regParam is zero, which might cause numerical instability and overfitting.
25/09/29 14:00:55 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/09/29 14:00:56 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
25/09/29 14:00:56 WARN Instrumentation: [ec4aac6a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
[Stage 7:>                                                          (0 + 8) / 8]

[PySpark] Regression MSE: 0.3746
(time: 11.24s)


                                                                                

#### Classification in PySpark (Logistic Regression)

In [64]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as F
import time

# Ensure label is integer type
sdf_cls = sdf_cls.withColumn("label", F.col("label").cast("int"))

# Assembler -> (optional) scaler -> LogisticRegression
cls_assembler = VectorAssembler(inputCols=cls_feature_cols, outputCol="features_raw")
cls_scaler = StandardScaler(inputCol="features_raw", outputCol="features", withMean=True, withStd=True)
cls_model = LogisticRegression(featuresCol="features", labelCol="label", maxIter=200)

cls_pipeline = Pipeline(stages=[cls_assembler, cls_scaler, cls_model])

# Split 80/20 (stratification isn't native; randomSplit is fine at this scale)
train_cls, test_cls = sdf_cls.randomSplit([0.8, 0.2], seed=42)

t0 = time.time()
cls_fit = cls_pipeline.fit(train_cls)
pred_cls = cls_fit.transform(test_cls)
t1 = time.time()
# AUC (binary)
auc = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC").evaluate(pred_cls)

# Confusion matrix & metrics (precision/recall/F1) computed manually
cm = (pred_cls
      .groupBy("label", "prediction")
      .count()
      .toPandas()
      .pivot(index="label", columns="prediction", values="count")
      .fillna(0)
      .astype(int))

tn = cm.at[0,0] if (0 in cm.index and 0 in cm.columns) else 0
fp = cm.at[0,1] if (0 in cm.index and 1 in cm.columns) else 0
fn = cm.at[1,0] if (1 in cm.index and 0 in cm.columns) else 0
tp = cm.at[1,1] if (1 in cm.index and 1 in cm.columns) else 0

precision = tp / (tp + fp) if (tp + fp) else 0.0
recall    = tp / (tp + fn) if (tp + fn) else 0.0
f1        = 2*precision*recall / (precision + recall) if (precision + recall) else 0.0

print(f"[PySpark] Classification AUC: {auc:.4f}  Precision: {precision:.4f}  Recall: {recall:.4f}  F1: {f1:.4f} ) (time: {t1 - t0:.2f}s)")
print("\nConfusion matrix (rows=Actual 0/1, cols=Pred 0/1):\n", cm)


                                                                                

[PySpark] Classification AUC: 0.9303  Precision: 0.8206  Recall: 0.7546  F1: 0.7863 ) (time: 7.66s)

Confusion matrix (rows=Actual 0/1, cols=Pred 0/1):
 prediction  0.0  1.0
label               
0           860   80
1           119  366


In [65]:
import os, json, joblib, datetime as dt
os.makedirs("models", exist_ok=True)
joblib.dump(lr, "models/sklearn_linear_regression.pkl")

gd_bundle = {
    "weights": w.tolist(),                          # [bias, w1..wd]
    "feature_names": X.columns.tolist(),
    "scaler_X_mean": getattr(scaler_X, "mean_", []).tolist(),
    "scaler_X_scale": getattr(scaler_X, "scale_", []).tolist(),
    "scaler_y_mean": float(getattr(scaler_y, "mean_", [0])[0]),
    "scaler_y_scale": float(getattr(scaler_y, "scale_", [1])[0]),
    "alpha": alpha,
    "epochs": epochs,
    "saved_at": dt.datetime.utcnow().isoformat() + "Z"
}
joblib.dump(gd_bundle, "models/linear_regression_gd.pkl")

# Classification
# models dict from earlier step already fitted in-place
joblib.dump(models["LogisticRegression"], "models/cls_logreg.pkl")
joblib.dump(models["RandomForest"],       "models/cls_random_forest.pkl")
joblib.dump(models["GradientBoosting"],   "models/cls_gradient_boosting.pkl")
joblib.dump(models["SVM_RBF"],            "models/cls_svm_rbf.pkl")

print("Saved scikit-learn models to 'models/'")

# Save fitted Spark PipelineModels (reg_fit, cls_fit)
reg_fit.write().overwrite().save("models/spark_regression_pipeline")
cls_fit.write().overwrite().save("models/spark_classification_pipeline")

print("Saved Spark PipelineModels under 'models/'")


  "saved_at": dt.datetime.utcnow().isoformat() + "Z"


Saved scikit-learn models to 'models/'


                                                                                

Saved Spark PipelineModels under 'models/'
