In [1]:
import re, zipfile, urllib.request
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score


# Fetch & Extract Dataset
url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens_cleaned.zip"
zip_file = Path("mix20_rand700_tokens_cleaned.zip")
data_dir = Path(zip_file.stem)

print("Downloading dataset...")
urllib.request.urlretrieve(url, zip_file)

with zipfile.ZipFile(zip_file, "r") as zf:
  zf.extractall(data_dir)


pos_path = data_dir / "tokens" / "pos"
neg_path = data_dir / "tokens" / "neg"
dataset_version = "Polarity v0.9/v1.0 â€“ 700 pos / 700 neg"


#Load Data

def read_reviews(pos_dir, neg_dir):
    docs, y = [], []
    for lbl, folder in [(1, pos_dir), (0, neg_dir)]:
        for file in Path(folder).glob("*.txt"):
            text = file.read_text(encoding="utf-8", errors="ignore")
            # strip ratings like "10/10" or "****"
            text = re.sub(r"\d+/\d+|\*+", "", text)
            docs.append(text)
            y.append(lbl)
    return docs, np.array(y)

reviews, labels = read_reviews(pos_path, neg_path)
print(f"\n Loaded {len(reviews)} reviews "
      f"({labels.sum()} positive / {(labels==0).sum()} negative)")

#Evaluation Helper
def cv_score(model, X, y, folds=3):
    return cross_val_score(model, X, y, cv=folds).mean() * 100

#Feature Experiments
experiments = []
token_rule = r"(?u)\b\w+\b"

# (1) Unigrams (counts)
vec = CountVectorizer(binary=False, token_pattern=token_rule, min_df=4)
X = vec.fit_transform(reviews)
experiments.append(["(1) Unigrams freq", X.shape[1],
                    cv_score(MultinomialNB(), X, labels),
                    None,
                    cv_score(LinearSVC(max_iter=5000), X, labels)])

# (2) Unigrams (binary presence)
vec = CountVectorizer(binary=True, token_pattern=token_rule, min_df=4)
X = vec.fit_transform(reviews)
experiments.append(["(2) Unigrams presence", X.shape[1],
                    cv_score(MultinomialNB(), X, labels),
                    cv_score(LogisticRegression(max_iter=1000), X, labels),
                    cv_score(LinearSVC(max_iter=5000), X, labels)])

#  (3) Bigrams only
vec = CountVectorizer(binary=True, ngram_range=(2,2),
                      token_pattern=token_rule, min_df=7)
X = vec.fit_transform(reviews)
experiments.append(["(4) Bigrams only", X.shape[1],
                    cv_score(MultinomialNB(), X, labels),
                    cv_score(LogisticRegression(max_iter=1000), X, labels),
                    cv_score(LinearSVC(max_iter=5000), X, labels)])
#(4) Unigrams + Bigrams
vec = CountVectorizer(binary=True, ngram_range=(1,2),
                      token_pattern=token_rule, min_df=7)
X = vec.fit_transform(reviews)
experiments.append(["(3) Unigrams+Bigrams", X.shape[1],
                    cv_score(MultinomialNB(), X, labels),
                    cv_score(LogisticRegression(max_iter=1000), X, labels),
                    cv_score(LinearSVC(max_iter=5000), X, labels)])


# (5) Adjective-based tokens
def adjective_filter(text):
    words = re.findall(r"\b\w+\b", text)
    pattern = r"(ly$|ous$|ful$|able$|ive$|less$|ic$|al$|est$|er$)"
    keywords = {"good", "bad", "great", "awful", "excellent", "poor"}
    return [w for w in words if re.search(pattern, w.lower()) or w.lower() in keywords]

vec = CountVectorizer(tokenizer=adjective_filter, binary=True, min_df=4)
X = vec.fit_transform(reviews)
experiments.append(["(5) Adjectives only", X.shape[1],
                    cv_score(MultinomialNB(), X, labels),
                    cv_score(LogisticRegression(max_iter=1000), X, labels),
                    cv_score(LinearSVC(max_iter=5000), X, labels)])

# (6) Top 2633 unigrams
vec = CountVectorizer(binary=True, token_pattern=token_rule, max_features=2633)
X = vec.fit_transform(reviews)
experiments.append(["(6) Top 2633 unigrams", X.shape[1],
                    cv_score(MultinomialNB(), X, labels),
                    cv_score(LogisticRegression(max_iter=1000), X, labels),
                    cv_score(LinearSVC(max_iter=5000), X, labels)])

#Printing Results
results_df = pd.DataFrame(experiments, columns=["Features", "#Features", "NaiveBayes", "LogReg", "SVM"])
print("\nModel Metrics:\n")
print(results_df.to_string(index=False))


Downloading dataset...

 Loaded 1400 reviews (700 positive / 700 negative)





Model Metrics:

             Features  #Features  NaiveBayes    LogReg       SVM
    (1) Unigrams freq      12960   78.713856       NaN 77.929162
(2) Unigrams presence      12960   81.356511 83.142483 81.642634
     (4) Bigrams only      15825   79.285948 77.857324 75.572016
 (3) Unigrams+Bigrams      24462   81.713552 82.571155 80.999776
  (5) Adjectives only       1925   76.069975 73.284411 69.927520
(6) Top 2633 unigrams       2633   80.855183 80.712734 78.427273
