In [4]:
import re
import time
import csv
import pandas as pd
from collections import Counter
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
from google.colab import drive
from IPython.display import display # Good practice to import display

# Mount Google Drive at /content/drive (Colab-only helper from google.colab).
# NOTE(review): the dataset load further down reads '/content/IMDB Dataset.csv',
# which is outside this mount point — confirm whether the mount is still needed.
drive.mount('/content/drive')
# -----------------------------------------------------
# 2️⃣  Helper functions
# -----------------------------------------------------

# A token is a run of ASCII letters and apostrophes delimited by word boundaries.
WORD_RE = re.compile(r"\b[a-zA-Z']+\b")


def clean_and_tokenize(text: str):
    """Return the lowercase word tokens found in *text*.

    The text is lowercased first and then scanned with ``WORD_RE``.
    A falsy input (empty string or ``None``) yields an empty list.
    """
    return WORD_RE.findall(text.lower()) if text else []

def process_chunk(lines):
    """Count word occurrences over a batch of review texts.

    Each element of *lines* is tokenized with ``clean_and_tokenize`` and all
    resulting tokens are tallied into a single ``Counter``, which is returned.
    """
    return Counter(
        token
        for review in lines
        for token in clean_and_tokenize(review)
    )

def chunkify(lst, n):
    """Split a list into ``n`` contiguous, nearly equal chunks.

    The first ``len(lst) % n`` chunks receive one extra element, so chunk
    sizes differ by at most one. When ``n`` exceeds ``len(lst)`` the trailing
    chunks are empty lists; exactly ``n`` chunks are always returned.

    Args:
        lst: Sliceable sequence to split.
        n: Number of chunks to produce; must be at least 1.

    Returns:
        A list of ``n`` slices of ``lst`` that together cover it in order.

    Raises:
        ValueError: If ``n`` is less than 1. Without this check a negative
            ``n`` would silently return ``[]`` and drop every item.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    base, extra = divmod(len(lst), n)
    chunks = []
    start = 0
    for i in range(n):
        # The first `extra` chunks absorb the remainder, one item each.
        stop = start + base + (1 if i < extra else 0)
        chunks.append(lst[start:stop])
        start = stop
    return chunks

def run_sequential(reviews):
    """Run the single-process baseline word count.

    Args:
        reviews: List of review texts, passed as one chunk to
            ``process_chunk``.

    Returns:
        A ``(counter, elapsed_seconds)`` tuple: the word ``Counter`` for all
        reviews and the time spent inside ``process_chunk``.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way
    # time.time() can.
    started = time.perf_counter()
    counter = process_chunk(reviews)
    elapsed = time.perf_counter() - started
    return counter, elapsed

def run_parallel(reviews, n_workers):
    """Run the word count across ``n_workers`` worker processes.

    The reviews are split into ``n_workers`` chunks with ``chunkify``; each
    chunk is counted by ``process_chunk`` in a ``multiprocessing.Pool`` and
    the per-chunk counters are merged into one total.

    Args:
        reviews: List of review texts.
        n_workers: Number of pool processes, and also the number of chunks.

    Returns:
        A ``(counter, elapsed_seconds)`` tuple. The elapsed time covers pool
        start-up, the mapped work and the merge; the chunk split happens
        before the clock starts.
    """
    chunks = chunkify(reviews, n_workers)

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way
    # time.time() can.
    started = time.perf_counter()
    with Pool(processes=n_workers) as p:
        # imap yields results in chunk order, letting tqdm advance per chunk.
        results = list(tqdm(p.imap(process_chunk, chunks), total=len(chunks),
                            desc=f"{n_workers} workers"))
    total = Counter()
    for partial in results:
        total.update(partial)
    elapsed = time.perf_counter() - started
    return total, elapsed

# -----------------------------------------------------
# 3️⃣  Load dataset
# -----------------------------------------------------
# The CSV is expected at '/content/IMDB Dataset.csv'. If the file lives on
# the mounted drive instead, change the path to something like
# '/content/drive/My Drive/path/to/IMDB Dataset.csv'.
try:
    df = pd.read_csv("/content/IMDB Dataset.csv")
except FileNotFoundError:
    print("❌ Error: 'IMDB Dataset.csv' not found at '/content/'.")
    print("Please make sure the path is correct, or upload the file.")
    # An empty list makes the `if reviews:` section below a no-op.
    reviews = []
else:
    # Force every review to str before converting the column to a list.
    reviews = df['review'].astype(str).tolist()
    print(f"📊 Loaded {len(reviews):,} reviews")

if reviews:
    # -----------------------------------------------------
    # 4️⃣  Sequential baseline
    # -----------------------------------------------------
    print("\n⚙️ Running sequential baseline...")
    seq_counter, seq_time = run_sequential(reviews)
    print(f"✅ Sequential processing time: {seq_time:.3f} s")

    # -----------------------------------------------------
    # 5️⃣  Parallel runs
    # -----------------------------------------------------
    # The sequential run is the 1-worker reference row: speedup 1.0 and
    # efficiency 100% by definition.
    results = [
        {"workers": 1, "time": seq_time, "speedup": 1.0, "efficiency": 100.0},
    ]

    # Never start more workers than the machine has CPUs.
    max_workers = cpu_count()
    print(f"🖥️ Detected {max_workers} CPUs")

    # Try 2, 4 and 8 workers, skipping any count above max_workers.
    for w in [2, 4, 8]:
        if w > max_workers:
            print(f"\nℹ️ Skipping {w} workers (more than available CPUs).")
            continue

        print(f"\n🚀 Running with {w} workers...")
        counter, t = run_parallel(reviews, w)

        # A speedup figure is only meaningful if the parallel run produced
        # the same counts as the baseline; warn loudly if it did not.
        if counter != seq_counter:
            print(f"⚠️ Word counts from the {w}-worker run differ from the sequential baseline.")

        speedup = seq_time / t
        # Efficiency = percentage of the ideal w-fold speedup achieved.
        efficiency = 100.0 * speedup / w
        results.append({
            "workers": w,
            "time": t,
            "speedup": speedup,
            "efficiency": efficiency
        })

    # -----------------------------------------------------
    # 6️⃣  Display results table
    # -----------------------------------------------------
    print("\n📈 Performance Results:")
    res_df = pd.DataFrame(results)

    # Styler.hide_index() was removed in pandas 2.0; .hide(axis="index") is
    # the replacement for dropping the index column from the rendered table.
    display(res_df.style.format({
        "time": "{:.3f}",
        "speedup": "{:.2f}",
        "efficiency": "{:.1f}"
    }).hide(axis="index"))

    # -----------------------------------------------------
    # 7️⃣  Show top 15 most common words
    # -----------------------------------------------------
    print("\n📝 Top 15 words (sequential run):")
    for word, count in seq_counter.most_common(15):
        print(f"{word:<15} {count}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📊 Loaded 50,000 reviews

⚙️ Running sequential baseline...
✅ Sequential processing time: 7.053 s
🖥️ Detected 2 CPUs

🚀 Running with 2 workers...


2 workers: 100%|██████████| 2/2 [00:05<00:00,  2.71s/it]



ℹ️ Skipping 4 workers (more than available CPUs).

ℹ️ Skipping 8 workers (more than available CPUs).

📈 Performance Results:


workers,time,speedup,efficiency
1,7.053,1.0,100.0
2,5.614,1.26,62.8



📝 Top 15 words (sequential run):
the             667984
and             324440
a               322940
of              289407
to              268108
is              211075
br              201951
in              186770
it              157026
i               155108
this            150992
that            137061
was             95594
as              91748
for             87471
