In [22]:
import json
from pathlib import Path

import requests
from bs4 import BeautifulSoup

In [23]:
# Download the Hay Day Wiki "Goods List" page and parse it with BeautifulSoup.
url = 'https://hayday.fandom.com/wiki/Goods_List'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)
# Stop on 4xx/5xx responses instead of silently parsing an error page.
page.raise_for_status()
# NOTE(review): 'html' is not a concrete parser name; consider pinning
# 'html.parser' or 'lxml' so every environment builds the same tree - confirm
# which parser produced the outputs saved in this notebook before changing it.
soup = BeautifulSoup(page.text, 'html')

In [24]:
# Print the parsed page to confirm the download and parse produced HTML.
print(soup)

<!DOCTYPE html>
<html class="client-nojs sse-other l2u-other" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Goods List | Hay Day Wiki | Fandom</title>
<script>document.documentElement.className="client-js sse-other l2u-other";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9eaae2338c670600c86527255d1be554","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Goods_List","wgTitle":"Goods List","wgCurRevisionId":108842,"wgRevisionId":108842,"wgArticleId":18954,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Note","Animal Goods","Crops","Products","Guides"],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRel

In [25]:
# Find the targeted table in the wiki page: the first <table> element.
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; stop here with a clear
    # message rather than with an AttributeError in a later cell.
    raise ValueError("No <table> element found in the downloaded page")
print(table)

<table border="0" cellpadding="1" cellspacing="1" class="sortable wikitable" style="max-width: 100%; text-align: center; margin: 1em 0;">
<tbody><tr>
<th scope="col">Name
</th>
<th scope="col"><a href="/wiki/Experience" title="Experience">Level</a>
</th>
<th scope="col">Max. price
</th>
<th class="unsortable" scope="col">Time
</th>
<th scope="col">XP
</th>
<th class="unsortable" scope="col">Needs
</th>
<th scope="col">Source
</th>
<th scope="col">Per boat crate
</th></tr>
<tr>
<td><b><a href="/wiki/Wheat" title="Wheat">Wheat</a></b><br/><span typeof="mw:File"><a class="mw-file-description image" href="https://static.wikia.nocookie.net/hayday/images/e/e2/Wheat.png/revision/latest?cb=20240218150024"><img alt="Wheat" class="mw-file-element" data-image-key="Wheat.png" data-image-name="Wheat.png" data-relevant="1" decoding="async" height="100" loading="lazy" src="https://static.wikia.nocookie.net/hayday/images/e/e2/Wheat.png/revision/latest/scale-to-width-down/100?cb=20240218150024" width="

In [26]:
# Extract the column titles for the DataFrame.
# Only header cells inside the goods table are used, matching the row
# extraction (which also reads from `table`), so <th> cells that belong to any
# other table on the page cannot end up in the column list.
goods_titles = table.find_all('th')
goods_table_titles = [title.text.strip() for title in goods_titles]
print(goods_table_titles)

['Name', 'Level', 'Max. price', 'Time', 'XP', 'Needs', 'Source', 'Per boat crate']


In [27]:
# Create an empty DataFrame whose columns are the scraped header titles,
# then display it to confirm the column layout.
import pandas as pd
import re

df = pd.DataFrame(columns=goods_table_titles)
df

Unnamed: 0,Name,Level,Max. price,Time,XP,Needs,Source,Per boat crate


In [28]:
# Extract the text of every data row and build the DataFrame in a single step.
# Collecting the rows first avoids growing the frame one df.loc[len(df)] at a
# time (which re-allocates per row and only works when df starts out empty),
# so this part of the cell gives the same result every time it is run.
column_row = table.find_all('tr')
table_records = []
for row in column_row[1:]:  # the first <tr> holds the <th> header cells
    cell_texts = [cell.text.strip() for cell in row.find_all('td')]
    if not cell_texts:
        # A row without any <td> cell carries no data.
        continue
    if len(cell_texts) != len(goods_table_titles):
        raise ValueError(
            f"Expected {len(goods_table_titles)} cells per row, "
            f"got {len(cell_texts)}: {cell_texts!r}"
        )
    table_records.append(cell_texts)

df = pd.DataFrame(table_records, columns=goods_table_titles)

# Column clean-up, applied in one pass:
#   Name  - trim whitespace and turn non-breaking spaces into ordinary spaces
#   Time  - drop everything from the first "★" to the end of the line, then trim
#   Needs - put "), " wherever a ")" is followed (optionally across whitespace)
#           by a capital letter
# Afterwards "Blueberries" is rewritten to "Blueberry" in every column.
df = df.assign(
    Name=df['Name'].str.strip().str.replace('\xa0', ' ', regex=False),
    Time=df['Time'].str.replace(r'★.*', '', regex=True).str.strip(),
    Needs=df['Needs'].str.replace(r'\)\s*(?=[A-Z])', '), ', regex=True),
).replace(r'Blueberries', 'Blueberry', regex=True)

# Manual "Needs" overrides: for each listed item the scraped text is replaced
# by the value given here.
NEEDS_OVERRIDES = {
    'Red lure': 'N/A',
    'Green lure': 'N/A',
    'Blue lure': 'N/A',
    'Purple lure': 'N/A',
    'Gold lure': 'N/A',
    'Fish fillet': 'Gold lure (1)',
    'Fishing net': 'N/A',
    'Mystery net': 'N/A',
    'Lobster tail': 'Lobster trap (1)',
    'Duck feather': 'Duck trap (1)',
    # Diamond items
    'Diamond ring': 'Gold bar (2), Platinum bar (2)',
}
for item_name, needs_text in NEEDS_OVERRIDES.items():
    df.loc[df['Name'] == item_name, 'Needs'] = needs_text

# Yield_qty is not assigned here: the "Yield Mapping" step further down in this
# cell resets the column to 1.0 and re-applies the Field/Tree/Bush and Feed
# rules, so a value assigned at this point would simply be overwritten.

# Manual time corrections for Peanuts and Honeycomb
df.loc[df['Name'] == 'Peanuts', 'Time'] = '5 hr'
df.loc[df['Name'] == 'Honeycomb', 'Time'] = '35 min'

# Parsers & Helpers
def norm(s: str) -> str:
    """Return a canonical lower-case form of ``s`` for use as a lookup key.

    Missing values (anything ``pd.isna`` reports as missing) become ``""``.
    Otherwise the value is converted to ``str``, non-breaking spaces become
    plain spaces, en/em dashes become ``-``, runs of whitespace collapse to a
    single space, and the result is stripped and lower-cased.
    """
    if pd.isna(s):
        return ""
    text = str(s)
    for old, new in (("\u00a0", " "), ("–", "-"), ("—", "-")):
        text = text.replace(old, new)
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip().lower()

def parse_first_int(x, default=0) -> int:
    """Return the first run of digits found in ``str(x)`` as an ``int``.

    ``default`` is returned when ``x`` is missing (per ``pd.isna``) or when its
    text contains no digit at all.
    """
    if pd.isna(x):
        return default
    digits = re.search(r"\d+", str(x))
    if digits is None:
        return default
    return int(digits.group())

def parse_time_to_minutes(s: str) -> float:
    """Convert a duration such as ``"1 d 2 h 30 min"`` into minutes.

    Recognised units are days (``d``), hours (``h``, ``hr``, ``hrs``, ``hour``,
    ``hours``) and minutes (``min``, ``minute``, ``minutes``); each amount may
    be an integer or a decimal (``"1.5 h"`` -> ``90.0``). When no positive
    unit-based duration is found, the whole text is tried as a plain number of
    minutes.

    Returns:
        The duration in minutes, always as a ``float``. ``0.0`` is returned for
        ``None``, NaN, empty or unparseable input, and for text that only
        parses to a non-finite number (e.g. the ``"nan"`` produced by
        ``astype(str)`` on a missing value).
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return 0.0

    s = str(s).replace("\u00a0", " ").lower().strip()
    # Collapse the spelled-out unit names onto "h" / "min"; longer spellings
    # are replaced first so that e.g. "hours" does not become "hs".
    s = (
        s.replace("hours", "h").replace("hour", "h").replace("hrs", "h").replace("hr", "h")
        .replace("minutes", "min").replace("minute", "min")
    )

    # The optional fractional part keeps "1.5 h" from being read as "5 h".
    amount = r"(\d+(?:\.\d+)?)"
    d = re.search(amount + r"\s*d", s)
    h = re.search(amount + r"\s*h", s)
    m = re.search(amount + r"\s*min", s)

    total_min = (
        (float(d.group(1)) * 1440 if d else 0.0)
        + (float(h.group(1)) * 60 if h else 0.0)
        + (float(m.group(1)) if m else 0.0)
    )
    if total_min > 0:
        return total_min

    # Fallback: a bare number is taken as minutes.
    try:
        value = float(s)
    except ValueError:
        return 0.0
    # float() also accepts "nan" / "inf"; those are not usable durations.
    if value != value or abs(value) == float("inf"):
        return 0.0
    return value

def parse_needs(cell: str) -> dict:
    """Parse a "Needs" cell into ``{normalised ingredient name: quantity}``.

    Two input forms are understood:

    * a JSON object such as ``{"wheat": 3}`` - keys are passed through
      ``norm`` and values converted to ``float``;
    * comma-separated ``Name (qty)`` items such as ``Corn (1), Wheat (2)`` -
      quantities of repeated names are summed, and items that do not end in a
      parenthesised number are ignored.

    ``None``, NaN, an empty string and ``"{}"`` all give ``{}``.
    """
    if cell is None or (isinstance(cell, float) and pd.isna(cell)):
        return {}
    raw = str(cell).replace("\u00a0", " ").strip()
    if raw == "" or raw == "{}":
        return {}

    # JSON form first. Only the errors that mean "this is not a usable JSON
    # mapping" are handled; anything else is a real problem and should surface.
    try:
        decoded = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        decoded = None
    if isinstance(decoded, dict):
        try:
            return {norm(k): float(v) for k, v in decoded.items()}
        except (TypeError, ValueError):
            pass  # a quantity is not numeric: fall back to the text form

    out = {}
    for part in re.split(r",\s*", raw):
        m = re.match(r"(.+?)\s*\(\s*([0-9]+(?:\.[0-9]+)?)\s*\)\s*$", part)
        if m:
            key = norm(m.group(1))
            out[key] = out.get(key, 0.0) + float(m.group(2))
    return out

def clear_self_need(row):
    """Return a copy of ``row["needs_raw"]``, emptied when it only lists the item itself.

    If the needs mapping has exactly one entry, that entry's key equals
    ``row["name_norm"]`` and its quantity is 1, ``{}`` is returned. In every
    other case the (shallow) copy of the mapping is returned unchanged.
    """
    own_name = row["name_norm"]
    needs = dict(row["needs_raw"])
    only_needs_itself = needs.get(own_name, 0) == 1 and len(needs) == 1
    return {} if only_needs_itself else needs

def extract_building(source: str) -> str:
    """Return the normalised building name at the start of a "Source" cell.

    Everything from the first ``(`` onwards is dropped, then everything from
    the first ``,`` onwards, and the remainder is passed through ``norm``
    (so ``"Field (1st crop)"`` gives ``"field"``).

    A missing source falls back to the Field building; the fallback goes
    through ``norm`` as well so it has the same casing as every other value
    this function returns.
    """
    if pd.isna(source):
        return norm("Field")
    s = str(source)
    # maxsplit is passed by keyword: the positional form is deprecated in
    # recent Python versions.
    match = re.split(r"[\(]", s, maxsplit=1)[0].strip()
    match = re.split(r"[,]", match, maxsplit=1)[0].strip()
    return norm(match)

def get_production_type(source: str) -> str:
    """Classify a "Source" cell as ``'Raw/Resource'`` or ``'Machine/Processed'``.

    The source text is lower-cased and searched for a fixed set of keywords
    (plain substring matches). Any hit means ``'Raw/Resource'``; everything
    else is treated as ``'Machine/Processed'``.
    """
    lowered = str(source).lower()
    raw_keywords = ('mine', 'instant', 'field', 'tree', 'bush', 'fishing', 'animal', 'pet')
    if any(keyword in lowered for keyword in raw_keywords):
        return 'Raw/Resource'
    return 'Machine/Processed'
        
# Data Processing & Correction
# 1. Yield mapping: 1.0 by default, 2.0 for Field/Tree/Bush sources and 3.0 for
#    Feed sources (custom rule). The rules run in order, so a later match wins.
df['Yield_qty'] = 1.0
for source_pattern, yield_qty in ((r'Field|Tree|Bush', 2.0), (r'Feed', 3.0)):
    source_matches = df['Source'].astype(str).str.contains(source_pattern, case=False, na=False)
    df.loc[source_matches, 'Yield_qty'] = yield_qty

# 2. Core normalisation: lookup key, numeric level, numeric XP, minutes.
df["name_norm"] = df["Name"].apply(norm)
df["Level_num"] = df["Level"].apply(parse_first_int)
df["XP"] = pd.to_numeric(df["XP"], errors="coerce").fillna(0.0)
df["Time"] = df["Time"].astype(str).str.strip()
df["time_min"] = df["Time"].apply(parse_time_to_minutes)

# 3. XP and time overrides for base resources.
#    XP is forced to 0 (to prevent exploitation) and the time to a minimum
#    non-zero value of 1 minute (for LP stability).
BASE_RESOURCES_XP_ZERO = ['Silver ore', 'Gold ore', 'Platinum ore', 'Coal', 'Iron ore']
NORMALIZED_ZERO_XP = [norm(n) for n in BASE_RESOURCES_XP_ZERO]  # same form as name_norm

is_base_resource = df['name_norm'].isin(NORMALIZED_ZERO_XP)
df.loc[is_base_resource, 'XP'] = 0.0
df.loc[is_base_resource, 'time_min'] = 1.0

# 4. Needs and final maps: parsed ingredients, self-references removed,
#    building name and production type derived from the Source text.
df["needs_raw"] = df["Needs"].apply(parse_needs)
df["needs_norm"] = df.apply(clear_self_need, axis=1)
df['Building'] = df['Source'].apply(extract_building)
df['Production_Type'] = df['Source'].apply(get_production_type)


# --- FINAL EXPORT ---

# Columns kept in the exported table, in output order.
FINAL_COLUMNS = [
    "Name",
    "name_norm",
    "Level",
    "Level_num",
    "Time",
    "time_min",
    "XP",
    "needs_norm",
    "Source",
    "Building",
    "Production_Type",
    "Yield_qty",
]

# .copy() makes df_final an independent frame rather than a view onto df.
df_final = df.loc[:, FINAL_COLUMNS].copy()

In [29]:
# Preview the first 10 rows of the export table.
df_final.head(10)

Unnamed: 0,Name,name_norm,Level,Level_num,Time,time_min,XP,needs_norm,Source,Building,Production_Type,Yield_qty
0,Wheat,wheat,1,1,2 min,2.0,1.0,{},Field (1st crop),field,Raw/Resource,2.0
1,Egg,egg,1,1,20 min,20.0,2.0,{'chicken feed': 1.0},Chicken (1st animal product),chicken,Raw/Resource,1.0
2,Corn,corn,2,2,5 min,5.0,1.0,{},Field (2nd crop),field,Raw/Resource,2.0
3,Bread,bread,2,2,5 min,5.0,3.0,{'wheat': 3.0},Bakery,bakery,Machine/Processed,1.0
4,Chicken feed,chicken feed,3,3,5 min,5.0,1.0,"{'corn': 1.0, 'wheat': 2.0}",Feed Mill,feed mill,Machine/Processed,3.0
5,Soybean,soybean,5,5,20 min,20.0,2.0,{},Field (3rd crop),field,Raw/Resource,2.0
6,Cow feed,cow feed,6,6,10 min,10.0,2.0,"{'corn': 1.0, 'soybean': 2.0}",Feed Mill,feed mill,Machine/Processed,3.0
7,Milk,milk,6,6,1 h,60.0,3.0,{'cow feed': 1.0},Cow (2nd animal product),cow,Raw/Resource,1.0
8,Cream,cream,6,6,20 min,20.0,6.0,{'milk': 1.0},Dairy,dairy,Machine/Processed,1.0
9,Sugarcane,sugarcane,7,7,30 min,30.0,3.0,{},Field (4th crop),field,Raw/Resource,2.0


In [30]:
# Convert Level and XP to plain int: take the first run of digits from each
# value's text and use 0 where there is none.
# NOTE(review): this changes `df` only. `df_final` was copied from `df` in an
# earlier cell, so the exported CSV does not pick up these conversions -
# confirm whether that is intended.
df["Level"] = (
    df["Level"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .astype("Int64")
    .fillna(0)
    .astype(int)
)
df["XP"] = (
    df["XP"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .astype("Int64")
    .fillna(0)
    .astype(int)
)
df.dtypes

Name                object
Level                int64
Max. price          object
Time                object
XP                   int64
Needs               object
Source              object
Per boat crate      object
Yield_qty          float64
name_norm           object
Level_num            int64
time_min           float64
needs_raw           object
needs_norm          object
Building            object
Production_Type     object
dtype: object

In [31]:
# Export the final table as CSV.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so the export also works on other machines.
OUTPUT_CSV = Path(r'/Users/madelinec/Downloads/Hay Day Project/goods.csv')
# Create the target folder when it is missing; to_csv does not create it.
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(OUTPUT_CSV, index=False)