### Environment Setup and OCR Installation
Here we are installing Tesseract OCR (used for extracting text from product images)  
and importing all required Python libraries for:

- Web scraping (`requests`, `BeautifulSoup`)
- Image processing (`PIL`, `pytesseract`)
- Data manipulation (`pandas`, `numpy`)
- Random event generation (`random`, `Faker`)
- File handling in Google Colab

With this setup we can scrape webpages, perform OCR, generate merchants,  
and build a rich dataset for machine learning.


In [7]:
print("INSTALLING: Computer Vision Environment...")
!sudo apt-get install tesseract-ocr > /dev/null
!pip install pytesseract faker > /dev/null

import pytesseract
from PIL import Image
from io import BytesIO
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from faker import Faker
import random
import time
from datetime import datetime, date
from google.colab import files

fake = Faker()
print("SYSTEM IS READY.")

INSTALLING: Computer Vision Environment...
SYSTEM IS READY.


### Paginated Retail Web Crawler
This function scrapes all 50 pages of *books.toscrape.com* and extracts:

- Product titles
- Prices (converted from GBP to USD)
- Product image URLs
- Optional OCR text from ~2% of book covers

Each scraped item becomes one “online retail” transaction with metadata and category labels.  
This forms the first major chunk of real-world data in our dataset.


In [8]:

def scrape_entire_bookstore():
    """Crawl books.toscrape.com and convert every listed book into a retail record.

    Catalogue pages 1 through 50 are fetched in order. For each
    ``.product_pod`` element the GBP price is parsed and converted to USD with
    a fixed 1.35 multiplier. About 2% of books (chosen at random) also have
    their cover image downloaded and passed through Tesseract; the first 10
    characters of the recognised text are stored in ``metadata``.

    Error handling:
        * A non-200 page response stops the crawl; records gathered so far
          are still returned.
        * Any other exception raised while processing a page is printed and
          the crawl moves on to the next page.
        * OCR is best-effort: if the image download or OCR step fails, the
          record keeps the default ``"HTML_Scrape"`` metadata.

    Returns:
        list[dict]: One dict per book with the keys ``merchant_name``,
        ``merchant_category``, ``purchase_channel``, ``amount``, ``country``,
        ``metadata`` and ``source``.
    """
    print("CRAWLER Started: Scraping 1000 items from 'books.toscrape.com'...")
    data = []
    base_url = "http://books.toscrape.com/catalogue/page-{}.html"
    # Without an explicit timeout, requests can block forever on a stalled
    # connection; a timeout turns that into an exception we can report.
    timeout_s = 10

    for page in range(1, 51):
        try:
            url = base_url.format(page)
            r = requests.get(url, timeout=timeout_s)
            if r.status_code != 200:
                print(f"Stopping crawl: page {page} returned HTTP {r.status_code}")
                break

            soup = BeautifulSoup(r.text, 'html.parser')
            products = soup.select(".product_pod")

            for p in products:
                # 1. Scrape price & title from the listing.
                # "Â" shows up when the pound sign is decoded with the wrong charset.
                price = float(p.select_one(".price_color").text.replace("£", "").replace("Â", ""))
                # The title is read but not stored in the output record.
                title = p.h3.a["title"]

                # 2. Run OCR on ~2% of book covers to keep the crawl fast.
                metadata = "HTML_Scrape"
                if random.random() < 0.02:
                    try:
                        img_url = "http://books.toscrape.com/" + p.find("img")["src"].replace("../", "")
                        img_bytes = requests.get(img_url, timeout=timeout_s).content
                        img = Image.open(BytesIO(img_bytes))
                        ocr_text = pytesseract.image_to_string(img).strip().replace("\n", " ")
                        metadata = f"OCR_VERIFIED: {ocr_text[:10]}..."
                    except Exception:
                        # Best-effort: keep the default metadata on any OCR or
                        # download failure. `Exception` (not a bare except) so
                        # Ctrl-C / SystemExit still interrupt the crawl.
                        pass

                data.append({
                    "merchant_name": "BookStore Direct",
                    "merchant_category": "online_retail",
                    "purchase_channel": "online",
                    "amount": round(price * 1.35, 2),  # Converting GBP to USD
                    "country": "US",
                    "metadata": metadata,
                    "source": "real_crawler"
                })

            if page % 10 == 0:
                print(f" ...Scraped Page {page}/50")

        except Exception as e:
            print(f"Error on page {page}: {e}")

    print(f"RETAIL COMPLETE: {len(data)} real items collected.")
    return data

### Travel Data from Demographic Statistics
This scraper pulls country population data from scrapethissite.com  
and converts it into realistic travel transaction amounts.

Logic Used:
- Countries with low population → more remote → higher flight prices  
- Countries with high population → cheaper baseline cost  

Each entry becomes a “travel” purchase with a realistic price  
and airline merchant.  
This transforms non-financial web data into useful transaction records, since real financial data is hard to obtain.


### Entertainment Spending from Oscar-Winning Movies
This logic-based generator uses the release years of Oscar-winning films  
to simulate entertainment spending. The film list is based on scrapethissite.com as well; because that page loads its data via AJAX, the titles and years are hard-coded in the function.

- Newer movies → high-cost AMC dine-in experiences  
- Older movies → low-cost Apple TV rentals  

This provides transactions for categories like *dining* and *subscription*, helping to increase category diversity in the dataset.


In [9]:


def scrape_travel_demographics():
    """Derive synthetic flight purchases from scrapethissite.com's country list.

    Each ``.country`` element on the page yields one record. The price is the
    sum of the Unicode code points of the country name, multiplied by 0.8 when
    the population exceeds 50 million and by 1.5 otherwise, then clamped to
    the range [150, 2000]. The merchant is picked at random from a fixed list.

    Returns:
        list[dict]: One "travel" record per country with the keys
        ``merchant_name``, ``merchant_category``, ``purchase_channel``,
        ``amount``, ``country``, ``metadata`` and ``source``. Empty when the
        page does not return HTTP 200.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including a timeout); nothing in this function catches it.
    """
    print("TRAVEL SCRAPER: Deriving flight costs from demographics...")
    data = []
    # A timeout prevents a stalled connection from hanging the whole pipeline.
    r = requests.get("https://www.scrapethissite.com/pages/simple/", timeout=10)
    if r.status_code != 200:
        # An error page has no ".country" elements, so the result would be
        # empty anyway; say so explicitly instead of failing silently.
        print(f"TRAVEL SCRAPER: HTTP {r.status_code} received, no travel rows generated.")
        return data

    soup = BeautifulSoup(r.text, 'html.parser')

    for c in soup.select(".country"):
        name = c.select_one(".country-name").text.strip()
        pop = int(c.select_one(".country-population").text.strip().replace(",", ""))
        # Populous countries get a discount multiplier, the rest a premium.
        mult = 0.8 if pop > 50_000_000 else 1.5
        # Deterministic per-country base value derived from the name itself.
        base = sum(ord(x) for x in name)
        price = max(150, min(base * mult, 2000))

        data.append({
            "merchant_name": random.choice(["Delta", "United", "Expedia"]),
            "merchant_category": "travel",
            "purchase_channel": "online",
            "amount": round(price, 2),
            "country": "US",
            "metadata": f"LOGIC_DERIVED: {name}",
            "source": "real_logic"
        })
    return data

def scrape_entertainment_oscars():
    """
    Build simulated entertainment purchases from a fixed list of Oscar winners.

    The (title, year) pairs are hard-coded below rather than fetched; the list
    is walked five times, producing one record per pass per film. Films
    released after 2015 become in-store "dining" purchases at AMC Dine-In
    (amount drawn uniformly from 40-100); all others become online
    "subscription" purchases at Apple TV+ (amount drawn from 3.99-19.99).

    Returns a list of dicts with the keys merchant_name, merchant_category,
    purchase_channel, amount, country, metadata and source.
    """
    print("ENTERTAINMENT SCRAPER: Deriving ticket prices from movie years...")

    # The source page usually loads via AJAX, so the response is simulated here.
    winners = [
        ("Everything Everywhere All At Once", 2022), ("CODA", 2021), ("Nomadland", 2020),
        ("Parasite", 2019), ("Green Book", 2018), ("The Shape of Water", 2017),
        ("Moonlight", 2016), ("Spotlight", 2015), ("Birdman", 2014),
        ("12 Years a Slave", 2013), ("Argo", 2012), ("The Artist", 2011),
        ("The King's Speech", 2010), ("The Hurt Locker", 2009), ("Slumdog Millionaire", 2008),
        ("No Country for Old Men", 2007), ("The Departed", 2006), ("Crash", 2005),
        ("Million Dollar Baby", 2004), ("Return of the King", 2003), ("Chicago", 2002),
        ("A Beautiful Mind", 2001), ("Gladiator", 2000), ("American Beauty", 1999)
    ]

    # (category, merchant, channel, min amount, max amount)
    cinema_night = ("dining", "AMC Dine-In", "in_store", 40, 100)      # dinner + movie
    home_rental = ("subscription", "Apple TV+", "online", 3.99, 19.99)  # streaming rental

    records = []
    # Five passes over the list to simulate "Family Tickets" vs "Single Tickets".
    for _ in range(5):
        for _title, release_year in winners:
            # Newer films -> expensive cinema experience; older -> streaming rental.
            profile = cinema_night if release_year > 2015 else home_rental
            category, merchant, channel, low, high = profile
            records.append({
                "merchant_name": merchant,
                "merchant_category": category,
                "purchase_channel": channel,
                "amount": round(random.uniform(low, high), 2),
                "country": "US",
                "metadata": f"MOVIE_YEAR_LOGIC: {release_year}",
                "source": "real_logic"
            })
    return records

### Non-Linear Reward Engine (Determines Best Card)
This function computes rewards for three credit cards  
(Card_Travel, Card_Grocery, Card_Flat) based on:

- Transaction amount  
- Merchant category  
- Weekend effects  
- Foreign transaction bonus  
- Random promotional noise  

It returns the card that yields the highest reward.  


### Combine, Generate, Label, and Export
This is the master pipeline that builds the final dataset.

Steps:
1. Run the 3 scrapers:
   - Retail data
   - Travel data
   - Entertainment data  
2. Create additional grocery and gas transactions using statistically realistic distributions.
3. Combine all data into a single DataFrame.
4. Add dates (last 1 year) for temporal features.
5. Apply the reward engine to compute the “best card” label.
6. Export the final ~5,000-row dataset as **extraordinary_full_scrape.csv**.

After this cell, we have a fully curated, labeled dataset ready for machine learning.


In [10]:
def get_best_card(row):
    """Return the name of the card with the highest simulated reward for a transaction.

    Reads ``row['amount']`` and ``row['merchant_category']``. Weekend and
    foreign flags are drawn at random inside this function (50% and 5%
    respectively); they are not taken from the row. Gaussian noise from
    ``np.random.normal`` is added per category and per card, so the result is
    only reproducible when both ``random`` and ``np.random`` are seeded.

    Returns one of "Card_Travel", "Card_Grocery" or "Card_Flat"; on an exact
    tie the earlier name in that order wins.
    """
    amount = row['amount']
    category = row['merchant_category']

    # Simulated context flags (drawn before any numpy noise).
    weekend = random.choice([0, 1])
    foreign = 1 if random.random() < 0.05 else 0

    # Base reward per card: 1.5% travel, 1.0% grocery, 2.0% flat.
    rewards = {
        "Card_Travel": amount * 0.015,
        "Card_Grocery": amount * 0.010,
        "Card_Flat": amount * 0.020,
    }

    # Category-specific bonuses.
    if category == 'travel':
        rewards["Card_Travel"] += amount * 0.03 + np.random.normal(0, 0.3)
    elif category == 'dining':
        rewards["Card_Travel"] += amount * 0.02
        rewards["Card_Flat"] += np.random.normal(0, 0.2)
    elif category == 'grocery':
        rewards["Card_Grocery"] += amount * 0.04 + np.random.normal(0, 0.3)
    elif category == 'gas':
        rewards["Card_Grocery"] += amount * 0.02

    # Weekend boost grows with the log of the amount.
    if weekend:
        log_amount = np.log1p(amount)
        rewards["Card_Flat"] += 0.15 * log_amount
        rewards["Card_Travel"] += 0.10 * log_amount

    # Foreign purchases favour the travel card and penalise the flat card.
    if foreign:
        rewards["Card_Travel"] += 0.02 * amount
        rewards["Card_Flat"] -= 0.015 * amount

    # Promotional noise, drawn in travel -> grocery -> flat order.
    for card in ("Card_Travel", "Card_Grocery", "Card_Flat"):
        rewards[card] += np.random.normal(0, 0.25)

    return max(rewards, key=rewards.get)



def run_complex_pipeline():
    """Build, label and export the full transaction dataset.

    Runs the three scrapers (bookstore, travel, Oscars), tops the result up to
    5000 rows with generated grocery/gas purchases, assigns each row a random
    date from the past year, labels it with ``get_best_card``, writes
    ``extraordinary_full_scrape.csv`` and triggers a Colab download of it.
    Progress and a per-source row count are printed along the way.
    """
    # 1. RUNNING SCRAPERS
    t0 = time.time()
    records = [
        *scrape_entire_bookstore(),
        *scrape_travel_demographics(),
        *scrape_entertainment_oscars(),
    ]

    print(f"\n REAL DATA COLLECTED: {len(records)} rows")
    print(f"   (Time taken: {round(time.time()-t0, 2)}s)")

    # 2. Fill the remainder with generated grocery and gas purchases.
    shortfall = 5000 - len(records)
    print(f"GENERATING: {shortfall} rows for missing categories (Grocery, Gas)...")

    synthetic_categories = ['grocery', 'gas']
    for _ in range(shortfall):
        category = random.choice(synthetic_categories)
        if category == 'grocery':
            raw_amount = random.lognormvariate(3.5, 0.6)
            merchant = fake.company() + " Market"
        else:
            raw_amount = random.uniform(30, 80)
            merchant = "Shell"

        records.append({
            "merchant_name": merchant,
            "merchant_category": category,
            "purchase_channel": "in_store",
            "amount": round(max(2, raw_amount), 2),  # floor amounts at 2
            "country": "US",
            "metadata": "MODEL_DERIVED",
            "source": "statistical_model_expansion"
        })

    df = pd.DataFrame(records)

    # 3. One random date within the last year per row.
    random_dates = []
    for _ in range(len(df)):
        random_dates.append(fake.date_between(start_date='-1y').strftime("%Y-%m-%d"))
    df['scrape_date'] = random_dates

    # 4. Label every row with the best card.
    print("APPLYING REAL-WORLD LABELS...")
    df['best_card'] = df.apply(get_best_card, axis=1)

    # 5. EXPORTING THE FILE
    filename = "extraordinary_full_scrape.csv"
    df.to_csv(filename, index=False)

    print(f"\n DONE. Dataset size: {len(df)}")
    print(df['source'].value_counts())

    files.download(filename)


# Run the whole pipeline: scrape, generate filler rows, label, export the CSV,
# and trigger the Colab file download.
run_complex_pipeline()


CRAWLER Started: Scraping 1000 items from 'books.toscrape.com'...
 ...Scraped Page 10/50
 ...Scraped Page 20/50
 ...Scraped Page 30/50
 ...Scraped Page 40/50
 ...Scraped Page 50/50
RETAIL COMPLETE: 1000 real items collected.
TRAVEL SCRAPER: Deriving flight costs from demographics...
ENTERTAINMENT SCRAPER: Deriving ticket prices from movie years...

 REAL DATA COLLECTED: 1370 rows
   (Time taken: 23.47s)
GENERATING: 3630 rows for missing categories (Grocery, Gas)...
APPLYING REAL-WORLD LABELS...

 DONE. Dataset size: 5000
source
statistical_model_expansion    3630
real_crawler                   1000
real_logic                      370
Name: count, dtype: int64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# Reload the exported dataset from disk and take a quick look at it.
import pandas as pd
# NOTE(review): run_complex_pipeline() writes "extraordinary_full_scrape.csv"
# with a relative path, while this reads from "/dev/" - confirm the file
# really lives there.
df = pd.read_csv("/dev/extraordinary_full_scrape.csv")
print(df.shape)  # (rows, columns)
df.head()  # last expression in the cell, so the notebook renders the first rows

(5000, 9)


Unnamed: 0,merchant_name,merchant_category,purchase_channel,amount,country,metadata,source,scrape_date,best_card
0,BookStore Direct,online_retail,online,69.89,US,HTML_Scrape,real_crawler,2025-05-28,Card_Flat
1,BookStore Direct,online_retail,online,72.55,US,HTML_Scrape,real_crawler,2025-04-05,Card_Flat
2,BookStore Direct,online_retail,online,67.64,US,HTML_Scrape,real_crawler,2025-07-05,Card_Flat
3,BookStore Direct,online_retail,online,64.56,US,HTML_Scrape,real_crawler,2025-04-26,Card_Flat
4,BookStore Direct,online_retail,online,73.21,US,HTML_Scrape,real_crawler,2025-02-06,Card_Grocery


In [12]:
import numpy as np
import pandas as pd
import random


def add_complexity(df_in, seed=42):
    """Return an enriched copy of ``df_in`` with time, segment and reward features.

    The input frame is not modified. Both ``np.random`` and ``random`` are
    re-seeded with ``seed`` (this resets their global state), so the output is
    reproducible for a given input and seed.

    Columns read: ``scrape_date``, ``merchant_category``, ``amount``, ``country``.

    Columns written:
        * ``scrape_date`` (parsed to datetime, unparseable values become NaT),
          ``month``, ``dayofweek``, ``is_weekend`` (1 for Saturday/Sunday).
        * ``user_segment``: random label drawn from a category-specific list.
        * ``is_foreign``: 0 everywhere except travel rows, where it is 1 with
          probability 0.3; ``country`` becomes "Non-US" for those rows.
        * ``reward_travel``, ``reward_grocery``, ``reward_flat``: tiered,
          category-dependent rewards plus small Gaussian noise.
        * ``best_card``: the card with the largest reward (overwrites any
          existing ``best_card`` column).
    """
    enriched = df_in.copy()
    np.random.seed(seed)
    random.seed(seed)

    # 1. TIME FEATURES
    parsed_dates = pd.to_datetime(enriched['scrape_date'], errors='coerce')
    enriched['scrape_date'] = parsed_dates
    enriched['month'] = parsed_dates.dt.month
    enriched['dayofweek'] = parsed_dates.dt.dayofweek
    enriched['is_weekend'] = enriched['dayofweek'].isin([5, 6]).astype(int)

    # 2. USER SEGMENT
    # Each category maps to (candidate labels, sampling probabilities).
    everyday = (['Family Shopper', 'Budget Shopper', 'Commuter'], [0.55, 0.25, 0.20])
    leisure = (['Foodie', 'Entertainment Lover', 'Casual Spender'], [0.40, 0.40, 0.20])
    segment_table = {
        'grocery': everyday,
        'gas': everyday,
        'travel': (['Frequent Traveler', 'Business Traveler', 'Occasional Traveler'], [0.50, 0.30, 0.20]),
        'dining': leisure,
        'subscription': leisure,
    }
    default_segments = (['Online Bargain Hunter', 'General Shopper'], [0.55, 0.45])

    def draw_segment(row):
        labels, weights = segment_table.get(row['merchant_category'], default_segments)
        return np.random.choice(labels, p=weights)

    enriched['user_segment'] = enriched.apply(draw_segment, axis=1)

    # 3. FOREIGN TRANSACTIONS
    enriched['is_foreign'] = 0
    travel_rows = enriched['merchant_category'] == 'travel'
    enriched.loc[travel_rows, 'is_foreign'] = np.random.choice(
        [0, 1], size=len(enriched[travel_rows]), p=[0.7, 0.3]
    )
    enriched.loc[enriched['is_foreign'] == 1, "country"] = "Non-US"

    # 4. NON-LINEAR REWARDS
    def tiered_rewards(row):
        amount = row['amount']
        category = row['merchant_category']

        # Defaults: flat card pays 2% (the "safety net"), the others 1%.
        flat = amount * 0.02
        travel = amount * 0.01
        grocery = amount * 0.01

        # A bonus tier applies only inside the category's amount window;
        # outside it the 1% default above stands.
        if category == 'grocery' and 50 <= amount <= 150:
            grocery = amount * 0.05
        elif category == 'dining' and 20 <= amount <= 100:
            travel = amount * 0.04
        elif category == 'gas' and amount < 60:
            grocery = amount * 0.04
        elif category == 'travel' and amount > 300:
            travel = amount * 0.03 + 10  # bonus for big trips

        # Noise is drawn in flat -> travel -> grocery order.
        flat += np.random.normal(0, 0.05)
        travel += np.random.normal(0, 0.05)
        grocery += np.random.normal(0, 0.05)

        return travel, grocery, flat

    reward_columns = ['reward_travel', 'reward_grocery', 'reward_flat']
    enriched[reward_columns] = enriched.apply(
        tiered_rewards, axis=1, result_type='expand'
    )

    # 5. LABEL: card with the highest reward (ties go to the earliest entry).
    def pick_card(row):
        by_card = {
            "Card_Travel": row['reward_travel'],
            "Card_Grocery": row['reward_grocery'],
            "Card_Flat": row['reward_flat'],
        }
        return max(by_card, key=by_card.get)

    enriched['best_card'] = enriched.apply(pick_card, axis=1)

    return enriched

In [13]:
import pandas as pd

# Load the exported dataset into `df_raw` and preview it.
# NOTE(review): the cells that follow call add_complexity(df), not df_raw, so
# this variable appears unused in the visible notebook - confirm.
df_raw = pd.read_csv("/dev/extraordinary_full_scrape.csv")
print(df_raw.shape)  # (rows, columns)
df_raw.head()

(5000, 9)


Unnamed: 0,merchant_name,merchant_category,purchase_channel,amount,country,metadata,source,scrape_date,best_card
0,BookStore Direct,online_retail,online,69.89,US,HTML_Scrape,real_crawler,2025-05-28,Card_Flat
1,BookStore Direct,online_retail,online,72.55,US,HTML_Scrape,real_crawler,2025-04-05,Card_Flat
2,BookStore Direct,online_retail,online,67.64,US,HTML_Scrape,real_crawler,2025-07-05,Card_Flat
3,BookStore Direct,online_retail,online,64.56,US,HTML_Scrape,real_crawler,2025-04-26,Card_Flat
4,BookStore Direct,online_retail,online,73.21,US,HTML_Scrape,real_crawler,2025-02-06,Card_Grocery


In [14]:
# Enrich the dataset (time features, segments, rewards, re-derived best_card)
# and save the result without the index column.
# NOTE(review): this uses whatever `df` was bound to by the last executed cell.
df_complex = add_complexity(df)
df_complex.to_csv("cardwise_complex_dataset_v7.csv", index=False)


In [15]:
# Reload the enriched dataset and separate the label from the other columns.
df = pd.read_csv("cardwise_complex_dataset_v7.csv")
y = df['best_card']  # target label

# NOTE(review): only `best_card` is dropped here. add_complexity() derives
# best_card as the arg-max of reward_travel / reward_grocery / reward_flat, and
# those three columns remain in df_model - if df_model is used as model
# features this leaks the target. Confirm before training.
df_model = df.drop(columns=['best_card'])


In [16]:
# Reload the enriched dataset and list its columns.
df = pd.read_csv("cardwise_complex_dataset_v7.csv")
df.head()  # not the last expression in the cell, so nothing is rendered for it
df.columns


Index(['merchant_name', 'merchant_category', 'purchase_channel', 'amount',
       'country', 'metadata', 'source', 'scrape_date', 'best_card', 'month',
       'dayofweek', 'is_weekend', 'user_segment', 'is_foreign',
       'reward_travel', 'reward_grocery', 'reward_flat'],
      dtype='object')

In [17]:
# Show the columns that remain in df_model after the label was dropped.
print(df_model.columns.tolist())


['merchant_name', 'merchant_category', 'purchase_channel', 'amount', 'country', 'metadata', 'source', 'scrape_date', 'month', 'dayofweek', 'is_weekend', 'user_segment', 'is_foreign', 'reward_travel', 'reward_grocery', 'reward_flat']


In [18]:
# Rebind `df` to the original (un-enriched) export.
# NOTE(review): absolute "/dev/" path - confirm this is where the CSV lives.
df = pd.read_csv("/dev/extraordinary_full_scrape.csv")


In [19]:
# Re-run the enrichment with the default seed (42) and overwrite the saved CSV.
df_complex = add_complexity(df)
df_complex.to_csv("cardwise_complex_dataset_v7.csv", index=False)


In [20]:
from google.colab import files

# Trigger a browser download of the enriched dataset (Colab only).
files.download("cardwise_complex_dataset_v7.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>