In [1]:
import time
import re
from collections import Counter

import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Display DataFrames nicely in the notebook
from IPython.display import display

In [4]:
# --- Configuration ---
# Every tunable setting lives in this cell; the notebook takes no command-line args.

# Name of the CSV column that holds the review text
TEXT_COLUMN = "review"

# Upper bound on the number of data rows read from the input file
NUM_ROWS = 20_000

# TODO: Update this path to point to your "IMDB Dataset.csv" file
INPUT_FILE = "IMDB Dataset.csv"

# CSV file that receives the word-frequency table
OUTPUT_FILE = "seq_output.csv"

In [5]:
# --- Helper Functions & Constants ---

# One or more consecutive characters that are not lowercase ASCII letters (a-z)
NON_ALPHA_RE = re.compile(r"[^a-z]+")
# A single angle-bracket tag such as <br />
HTML_TAG_RE = re.compile(r"<[^>]+>")
# Plain-set copy of scikit-learn's English stop-word collection
STOPWORDS = set(ENGLISH_STOP_WORDS)

def tokenize_clean(text: str):
    """
    Turn one raw text string into a list of cleaned word tokens.

    The text is lower-cased, every HTML tag is replaced by a space, and each
    run of characters other than a-z is replaced by a single space. The result
    is split on whitespace; stop words and one-letter tokens are discarded.

    Returns an empty list when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return []

    lowered = text.lower()
    without_tags = HTML_TAG_RE.sub(" ", lowered)
    letters_only = NON_ALPHA_RE.sub(" ", without_tags)

    kept = []
    for word in letters_only.split():
        # Skip single-character leftovers and stop words
        if len(word) <= 1 or word in STOPWORDS:
            continue
        kept.append(word)
    return kept

In [6]:
# --- Main Processing ---
# Loads the reviews, counts cleaned tokens across all of them, and writes the
# most frequent words to OUTPUT_FILE. The timer spans loading, counting and saving.

print("Starting analysis...")
start = time.perf_counter()

# Load the dataset (at most NUM_ROWS data rows)
try:
    df = pd.read_csv(INPUT_FILE, nrows=NUM_ROWS)
except FileNotFoundError:
    print(f"Error: Input file not found at '{INPUT_FILE}'")
    # Refer to the cell by role rather than by number: cell numbers shift
    # whenever cells are added, removed, or re-executed.
    print("Please update the INPUT_FILE variable in the configuration cell.")
    # Stop execution if file isn't found
    raise

if TEXT_COLUMN not in df.columns:
    raise ValueError(f"Column '{TEXT_COLUMN}' not found in CSV. Found columns: {list(df.columns)}")

print(f"Loaded {len(df)} rows. Showing first 5:")
display(df.head())

# Get the text data. Missing values are dropped *before* the string cast:
# astype(str) can turn NaN into the literal text "nan", which would then be
# tokenized and counted as if it were a word in a review.
texts = df[TEXT_COLUMN].dropna().astype(str).tolist()
n_reviews = len(texts)
n_missing = len(df) - n_reviews
if n_missing:
    print(f"Skipped {n_missing} rows with no text in '{TEXT_COLUMN}'.")

# Process all texts
print(f"\nProcessing {n_reviews} reviews...")
counter = Counter()
for t in texts:
    counter.update(tokenize_clean(t))

# Get top 20 (fewer if the corpus has fewer than 20 distinct tokens)
top_20 = counter.most_common(20)

# Save results to CSV
out_df = pd.DataFrame(top_20, columns=["word", "count"])
out_df.to_csv(OUTPUT_FILE, index=False)

elapsed = time.perf_counter() - start

print("\n--- Analysis Complete ---")
print(f"Processed {n_reviews} reviews in {elapsed:.1f} seconds.")
# Report the number of words actually written rather than assuming 20
print(f"Saved top {len(top_20)} words to '{OUTPUT_FILE}'")

Starting analysis...
Loaded 20000 rows. Showing first 5:


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive



Processing 20000 reviews...

--- Analysis Complete ---
Processed 20000 reviews in 10.9 seconds.
Saved top 20 words to 'seq_output.csv'


In [7]:
# --- Display Results ---

# Render the frequency table with the notebook's rich DataFrame display
print("Top 20 most frequent words:")
display(out_df)

# Build a compact one-line "word(count)" summary of the same ranking
entries = ["{}({})".format(word, count) for word, count in top_20]
top_str = ", ".join(entries)
print(f"\nSummary string: {top_str}")

Top 20 most frequent words:


Unnamed: 0,word,count
0,movie,35482
1,film,31772
2,like,16107
3,just,14173
4,good,12009
5,time,10079
6,story,9308
7,really,9191
8,bad,7489
9,people,7289



Summary string: movie(35482), film(31772), like(16107), just(14173), good(12009), time(10079), story(9308), really(9191), bad(7489), people(7289), great(7225), don(7062), make(6449), way(6339), movies(6093), characters(5707), think(5630), films(5516), character(5506), watch(5499)
