In [138]:
# ===============================
# IMPORTS
# ===============================
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import json, time, re
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [121]:
# ===============================
# CATEGORY PAGES
# ===============================
CATEGORY_PAGES = [
    "https://applepakistan.com.pk/product-category/iphone/",
    "https://applepakistan.com.pk/product-category/ipad/",
    "https://applepakistan.com.pk/product-category/mac/",
    "https://applepakistan.com.pk/product-category/watch/",
]

In [122]:
# ===============================
# HELPERS
# ===============================
def infer_category(name):
    name = name.lower()
    if "iphone" in name:
        return "iPhone"
    elif "ipad" in name:
        return "iPad"
    elif "macbook" in name or "imac" in name or "mac " in name:
        return "Mac"
    elif "watch" in name:
        return "Watch"
    return None

def parse_storage(text):
    if not text:
        return 0
    text = text.lower()
    nums = re.findall(r"\d+", text)
    if not nums:
        return 0
    value = int(nums[0])
    return value * 1024 if "tb" in text else value

def extract_chip(name):
    match = re.search(r"(A\d+|M\d+)", name)
    return match.group(1) if match else "Unknown"

In [123]:
# ===============================
# START SELENIUM
# ===============================
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
all_links = set()

In [124]:
# ===============================
# COLLECT PRODUCT LINKS
# ===============================
for url in CATEGORY_PAGES:
    driver.get(url)
    time.sleep(6)
    for a in driver.find_elements(By.XPATH, "//a[contains(@href, '/product/')]"):
        href = a.get_attribute("href")
        if href:
            all_links.add(href)

rows = []

# ===============================
# SCRAPE PRODUCT PAGES
# ===============================
for link in all_links:
    driver.get(link)
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    title = soup.find("h1")
    if not title:
        continue

    product = (
        title.text.replace("Buy", "")
        .replace("in Pakistan", "")
        .strip()
    )

    category = infer_category(product)
    if not category:
        continue

    chip = extract_chip(product)

    form = soup.find("form", class_="variations_form")
    if not form:
        continue

    variations = json.loads(form.get("data-product_variations", "[]"))

    for v in variations:
        price = v.get("display_price")
        if not price or int(price) == 0:
            continue

        attrs = v.get("attributes", {})
        storage = 0
        color = "Unknown"

        for k, val in attrs.items():
            k = k.lower()
            if "storage" in k or "rom" in k:
                storage = parse_storage(val)
            elif "color" in k and val:
                color = val.strip()

        spec_text = f"{product} {chip} {storage}GB {color}"

        rows.append([
            category,
            product,
            int(price),
            storage,
            color,
            chip,
            spec_text
        ])

driver.quit()

In [126]:
# ===============================
# CREATE CSV
# ===============================
df = pd.DataFrame(
    rows,
    columns=[
        "Category",
        "Product",
        "Price",
        "Storage_GB",
        "Color",
        "Chip",
        "Spec_Text"
    ]
)

df = df.drop_duplicates()
df.to_csv("apple_products_prices.csv", index=False)

print("✅ CSV CREATED")
print(df.groupby("Category").size())


✅ CSV CREATED
Category
Mac       31
Watch     15
iPad      36
iPhone    26
dtype: int64


In [2]:
import pandas as pd
import joblib
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

df = pd.read_csv("apple_products_prices.csv")

X = df[["Category", "Spec_Text", "Storage_GB"]]
y = df["Price"]

preprocessor = ColumnTransformer([
    ("text", TfidfVectorizer(), "Spec_Text"),
    ("category", TfidfVectorizer(), "Category"),
    ("num", StandardScaler(), ["Storage_GB"]),
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ===============================
# LOAD DATA
# ===============================
df = pd.read_csv("apple_products_prices.csv")

X = df[["Category", "Spec_Text", "Storage_GB"]]
y = df["Price"]

# ===============================
# PREPROCESSING (SAME FOR ALL)
# ===============================
preprocessor = ColumnTransformer([
    ("text", TfidfVectorizer(), "Spec_Text"),
    ("category", TfidfVectorizer(), "Category"),
    ("num", StandardScaler(), ["Storage_GB"]),
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ===============================
# MODELS TO COMPARE
# ===============================
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.001),
    "Random Forest": RandomForestRegressor(
        n_estimators=300,
        random_state=42
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=300,
        random_state=42
    )
}

# ===============================
# TRAIN & EVALUATE
# ===============================
results = []

for name, algo in models.items():
    model = Pipeline([
        ("prep", preprocessor),
        ("model", algo)
    ])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = model.score(X_test, y_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    results.append({
        "Model": name,
        "R2": round(r2, 4),
        "MAE": round(mae, 2),
        "RMSE": round(rmse, 2),
        "MAPE (%)": round(mape, 2)
    })

# ===============================
# RESULTS TABLE
# ===============================
results_df = pd.DataFrame(results)
results_df.sort_values(by="R2", ascending=False)


  model = cd_fast.sparse_enet_coordinate_descent(


Unnamed: 0,Model,R2,MAE,RMSE,MAPE (%)
1,Ridge Regression,0.7183,51301.57,58413.13,16.21
3,Random Forest,0.6979,49539.87,60492.36,14.6
4,Gradient Boosting,0.64,53239.85,66035.85,15.61
2,Lasso Regression,0.4504,61304.74,81594.55,17.4
0,Linear Regression,-0.056,80486.88,113104.84,25.66


In [2]:
# ===============================
# TRAIN FINAL MODEL AGAIN
# ===============================
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import joblib

final_model = Pipeline([
    ("prep", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=800,
        max_depth=25,
        min_samples_leaf=2,
        random_state=42
    ))
])

final_model.fit(X_train, y_train)

# SAVE THE FITTED PIPELINE
joblib.dump(final_model, "apple_price_model.pkl")

print("✅ Fitted model saved correctly")


✅ Fitted model saved correctly


In [4]:
# Predict on test set (log scale)
y_pred_log = final_model.predict(X_test)

# Convert back to actual prices
y_test_actual = y_test
y_pred_actual = y_pred_log

comparison_df = pd.DataFrame({
    "Actual_Price": y_test_actual.astype(int),
    "Predicted_Price": y_pred_actual.astype(int),
    "Error": (y_test_actual - y_pred_actual).astype(int)
})

# Display only (no CSV)
print("Sample Actual vs Predicted:")
print(comparison_df.head(10))

Sample Actual vs Predicted:
    Actual_Price  Predicted_Price   Error
77        439999           378478   61520
10        289999           245434   44564
4         170000           190526  -20526
83        489999           374991  115007
62        340000           422457  -82457
67        385999           374991   11007
30        260000           289455  -29455
45        300000           291921    8078
95        539999           594137  -54138
11        268499           245434   23064
