# Sentiment Analysis on IMDB Reviews

This notebook demonstrates sentiment analysis on IMDB movie reviews using both classic and TF-IDF/n-gram features, multiple classifiers, and detailed evaluation and visualization. The workflow includes data download, preprocessing, feature engineering, model training, evaluation, and result visualization.

## 1. Import Required Libraries
Import all necessary libraries for data processing, feature extraction, modeling, and visualization.

In [None]:
import os
import re
import glob
import tarfile
import urllib.request
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from itertools import combinations
from scipy.sparse import csr_matrix
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import html
import warnings
warnings.filterwarnings('ignore')

## 2. Download and Extract IMDB Data
Download the IMDB dataset and extract it for use in sentiment analysis.

In [None]:
def download_data():
    """Download the IMDB archive and unpack it into the working directory.

    The archive is fetched only when ``imdb.tgz`` is absent, and it is
    extracted only when no ``data`` path exists yet, so calling this
    repeatedly is cheap.
    """
    url = 'https://www.dropbox.com/s/8oehplrobcgi9cq/imdb.tgz?dl=1'
    archive = 'imdb.tgz'
    if not os.path.exists(archive):
        print('Downloading dataset...')
        # Download to a temporary name first so an interrupted transfer is
        # never mistaken for a complete archive on the next run.
        partial = archive + '.part'
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, archive)
    else:
        print('imdb.tgz already exists.')
    if not os.path.exists('data'):
        print('Extracting dataset...')
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this with an archive from a trusted source.
        with tarfile.open(archive) as tar:
            tar.extractall()
    else:
        print('Data directory already exists.')

# Download and extract the dataset if it is not already present.
# Uncomment the line below when running this cell in a notebook:
# download_data()

## 3. Read and Preprocess Data
Read the training and test data, clean the text, and prepare the labels for modeling.

In [None]:
def clean_text(text):
    """Return `text` with markup removed and whitespace normalised.

    Steps, in order: drop anything that looks like an HTML tag, decode HTML
    entities, remove characters outside the Basic Multilingual Plane,
    collapse every whitespace run to one space, and trim both ends.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    decoded = html.unescape(without_tags)
    bmp_only = re.sub(r'[\U00010000-\U0010ffff]', '', decoded)
    return re.sub(r'\s+', ' ', bmp_only).strip()

def read_data(path):
    """Load the labelled reviews stored under `path`.

    Files matching ``<path>/pos/*.txt`` are labelled 1 and files matching
    ``<path>/neg/*.txt`` are labelled 0. Only the first line of each file is
    used, after passing it through `clean_text`.

    Returns:
        A pair ``(docs, labels)`` of numpy arrays, ordered by document text.
    """
    data = []
    for label, subdir in ((1, 'pos'), (0, 'neg')):
        pattern = os.path.join(path, subdir, '*.txt')
        for fname in sorted(glob.glob(pattern)):
            # Explicit encoding keeps results independent of the platform
            # default; `with` closes each handle straight away.
            with open(fname, encoding='utf-8') as f:
                # readline() yields '' for an empty file instead of raising.
                first_line = f.readline()
            data.append((label, clean_text(first_line)))
    # Sorting by text interleaves the two classes deterministically.
    data.sort(key=lambda x: x[1])
    return np.array([d[1] for d in data]), np.array([d[0] for d in data])

# Uncomment to download and extract data
# download_data()

# Load training and test data
# read_data globs `pos/*.txt` and `neg/*.txt` under each directory, so the
# dataset must already be extracted into `data/`. If those directories are
# missing the globs match nothing and the message below reports 0 documents.
docs_train, labels_train = read_data(os.path.join('data', 'train'))
docs_test, labels_test = read_data(os.path.join('data', 'test'))
print(f"Loaded {len(docs_train)} training and {len(docs_test)} test documents.")

## 4. Tokenization and Feature Extraction Functions
Define and demonstrate the tokenization and feature extraction functions used for classic feature engineering.

In [None]:
def tokenize(doc, keep_internal_punct=False):
    """Split a document into lowercase tokens.

    Args:
        doc: The text to tokenize.
        keep_internal_punct: When False, every run of non-word characters is
            treated as a separator. When True, the text is split on
            whitespace and non-word characters are stripped only from the
            ends of each piece, so "isn't" stays a single token.

    Returns:
        A numpy array of tokens.
    """
    doc = doc.lower()
    if not keep_internal_punct:
        return np.array(re.sub(r'\W+', ' ', doc).split())
    stripped = (re.sub(r'^\W+|\W+$', '', piece) for piece in doc.split())
    # A piece made only of punctuation ("--", "!!!") strips down to '';
    # drop those so no empty-string token reaches the feature functions.
    return np.array([tok for tok in stripped if tok])

def token_features(tokens, feats):
    """Add one ``token=<t>`` count to `feats` per occurrence of each token.

    `feats` is updated in place; nothing is returned.
    """
    occurrences = Counter(f"token={t}" for t in tokens)
    for key, count in occurrences.items():
        feats[key] += count

def token_pair_features(tokens, feats, k=3):
    """Count ordered token pairs that co-occur inside a sliding window.

    A window of `k` consecutive tokens is moved over `tokens` one position
    at a time. For every window, each pair of tokens (in their original
    order) adds one to ``token_pair=<a>__<b>`` in `feats`. `feats` is
    updated in place.
    """
    pairs = (
        pair
        for start in range(len(tokens) - k + 1)
        for pair in combinations(tokens[start:start + k], 2)
    )
    for first, second in pairs:
        feats[f"token_pair={first}__{second}"] += 1

# Small hand-picked sentiment lexicons used by lexicon_features.
neg_words = {'bad', 'hate', 'horrible', 'worst', 'boring'}
pos_words = {'awesome', 'amazing', 'best', 'good', 'great', 'love', 'wonderful'}


def lexicon_features(tokens, feats):
    """Store how many tokens appear in each sentiment lexicon.

    Sets ``feats['neg_words']`` and ``feats['pos_words']`` to the number of
    tokens found (case-insensitively) in `neg_words` and `pos_words`.
    Existing values under those two keys are overwritten.
    """
    feats.update(neg_words=0, pos_words=0)
    for word in (t.lower() for t in tokens):
        # A bool adds as 0 or 1, so this increments only on a match.
        feats['neg_words'] += word in neg_words
        feats['pos_words'] += word in pos_words

# Demonstrate on a sample document
sample_doc = "I LOVE this great movie, but the ending was bad."
# Default tokenization: lowercased, split on every run of non-word characters.
tokens = tokenize(sample_doc)
# One shared defaultdict collects the output of all three feature functions.
feats = defaultdict(int)
token_features(tokens, feats)
# Uses the default window size k=3.
token_pair_features(tokens, feats)
lexicon_features(tokens, feats)
print("Tokens:", tokens)
# Convert to a plain dict so the printout is not wrapped in defaultdict(...).
print("Features:", dict(feats))

## 5. Vectorization of Documents
Convert tokenized documents into sparse feature matrices using the defined feature extraction functions.

In [None]:
def featurize(tokens, feature_fns):
    """Run every feature function over `tokens` and collect the results.

    Each function in `feature_fns` is called as ``fn(tokens, feats)`` on a
    shared ``defaultdict(int)``.

    Returns:
        The accumulated ``(feature_name, value)`` pairs as a sorted list.
    """
    collected = defaultdict(int)
    for extract in feature_fns:
        extract(tokens, collected)
    pairs = list(collected.items())
    pairs.sort()
    return pairs

def vectorize(tokens_list, feature_fns, min_freq, vocab=None):
    """Turn tokenized documents into a sparse document-by-feature matrix.

    Args:
        tokens_list: One token sequence per document.
        feature_fns: Feature functions handed to `featurize`.
        min_freq: When a vocabulary is built here, a feature is kept only if
            it occurs in at least this many documents.
        vocab: Optional existing ``{feature_name: column}`` mapping. When
            given it is used as-is and `min_freq` has no effect.

    Returns:
        ``(X, vocab)`` where `X` is an int64 CSR matrix with one row per
        document and one column per vocabulary entry.
    """
    per_doc = [featurize(tokens, feature_fns) for tokens in tokens_list]

    if vocab is None:
        # Document frequency: featurize yields each name once per document.
        doc_freq = Counter(name for doc_feats in per_doc for name, _ in doc_feats)
        frequent = sorted(name for name, seen in doc_freq.items() if seen >= min_freq)
        vocab = dict(zip(frequent, range(len(frequent))))

    # (row, column, value) triples for every feature that is in the vocabulary.
    entries = [
        (row, vocab[name], value)
        for row, doc_feats in enumerate(per_doc)
        for name, value in doc_feats
        if name in vocab
    ]
    rows = [entry[0] for entry in entries]
    cols = [entry[1] for entry in entries]
    data = [entry[2] for entry in entries]
    X = csr_matrix((data, (rows, cols)), shape=(len(tokens_list), len(vocab)), dtype=np.int64)
    return X, vocab

# Example: Vectorize first 3 training docs
# This list of feature functions is also what the later example cells pass
# to eval_all_combinations.
feature_fns = [token_features, token_pair_features, lexicon_features]
tokens_list = [tokenize(d) for d in docs_train[:3]]
# min_freq=1 keeps every feature that occurs in at least one document.
X_sample, vocab_sample = vectorize(tokens_list, feature_fns, min_freq=1)
print("Feature matrix shape:", X_sample.shape)
print("Vocabulary:", vocab_sample)
# toarray() densifies the matrix; fine for three documents only.
print("Feature matrix (dense):\n", X_sample.toarray())

## 6. Evaluate Feature Combinations with Cross-Validation
Test different feature settings and compute cross-validation accuracy for each combination using Logistic Regression.

In [None]:
def accuracy_score(truth, predicted):
    """Return the fraction of positions where `predicted` equals `truth`.

    Both arguments may be numpy arrays or plain sequences.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Convert first: comparing two Python lists with == gives one bool for
    # the whole list, which would make the mean either 0.0 or 1.0.
    truth = np.asarray(truth)
    predicted = np.asarray(predicted)
    if truth.shape != predicted.shape:
        raise ValueError(
            f"Shape mismatch: truth {truth.shape} vs predicted {predicted.shape}"
        )
    return np.mean(truth == predicted)

def cross_validation_accuracy(clf, X, labels, k):
    """Return the mean accuracy of `clf` over `k` cross-validation folds.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``; it is refit on
            every fold.
        X: Feature matrix that supports row indexing with an index array.
        labels: Array of labels aligned with the rows of `X`.
        k: Number of folds.

    Returns:
        The average of the per-fold accuracies.
    """
    # Folds are contiguous and unshuffled. random_state must stay unset:
    # scikit-learn raises ValueError when it is combined with shuffle=False.
    kf = KFold(n_splits=k, shuffle=False)
    accs = []
    for train_idx, test_idx in kf.split(X):
        clf.fit(X[train_idx], labels[train_idx])
        preds = clf.predict(X[test_idx])
        accs.append(accuracy_score(labels[test_idx], preds))
    return np.mean(accs)

def eval_all_combinations(docs, labels, punct_vals, feature_fns, min_freqs):
    """Score every combination of tokenizer flag, feature subset and min_freq.

    Each non-empty subset of `feature_fns` is tried with each value in
    `punct_vals` and `min_freqs`. Every setting is scored with 5-fold
    cross-validation of a default LogisticRegression.

    Returns:
        A list of dicts with keys 'punct', 'features', 'min_freq' and
        'accuracy', sorted from highest to lowest accuracy.
    """
    # All non-empty subsets, smallest first.
    feature_subsets = [
        list(subset)
        for size in range(1, len(feature_fns) + 1)
        for subset in combinations(feature_fns, size)
    ]

    scored = []
    for punct in punct_vals:
        # Tokenize once per punctuation setting and reuse for all settings.
        tokenized = [tokenize(doc, punct) for doc in docs]
        for min_freq in min_freqs:
            for subset in feature_subsets:
                X, _ = vectorize(tokenized, subset, min_freq)
                accuracy = cross_validation_accuracy(LogisticRegression(), X, labels, 5)
                scored.append({
                    'punct': punct,
                    'features': subset,
                    'min_freq': min_freq,
                    'accuracy': accuracy,
                })
    scored.sort(key=lambda entry: entry['accuracy'], reverse=True)
    return scored

# Evaluate all combinations (may take a while on full data)
# results = eval_all_combinations(docs_train, labels_train, [True, False], feature_fns, [2, 5, 10])
# print('Best result:', results[0])
# print('Worst result:', results[-1])

## 7. Plot Sorted Cross-Validation Accuracies
Visualize the cross-validation accuracies for all feature settings using matplotlib.

In [None]:
def plot_sorted_accuracies(results):
    """Plot the accuracies in `results` in ascending order.

    `results` is a list of dicts that each carry an 'accuracy' entry. The
    figure is written to ``accuracies.png`` and then shown.
    """
    ordered = [entry['accuracy'] for entry in results]
    ordered.sort()

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(ordered)
    ax.set_xlabel('Setting')
    ax.set_ylabel('Accuracy')
    ax.set_title('Sorted Cross-Validation Accuracies')
    fig.tight_layout()
    fig.savefig('accuracies.png')
    plt.show()
    print('Plot saved as accuracies.png')

# Example usage (uncomment after running eval_all_combinations):
# plot_sorted_accuracies(results)

## 8. Analyze Mean Accuracy per Setting
Compute and display the mean accuracy for each model setting to understand their impact on performance.

In [None]:
def mean_accuracy_per_setting(results):
    """Average the accuracy of all results that share a setting value.

    Every result contributes its accuracy to three groups: its feature-set
    name (``features=<fn names>``), its ``punct=<value>`` and its
    ``min_freq=<value>``.

    Returns:
        ``(mean_accuracy, setting)`` tuples sorted in descending order.
    """
    grouped = defaultdict(list)
    for entry in results:
        fn_names = ' '.join(fn.__name__ for fn in entry['features'])
        setting_keys = (
            'features=' + fn_names,
            f'punct={entry["punct"]}',
            f'min_freq={entry["min_freq"]}',
        )
        for key in setting_keys:
            grouped[key].append(entry['accuracy'])

    ranked = [(np.mean(scores), key) for key, scores in grouped.items()]
    ranked.sort(reverse=True)
    return ranked

# Example usage (uncomment after running eval_all_combinations):
# for acc, setting in mean_accuracy_per_setting(results):
#     print(f'{setting}: {acc:.5f}')

## 9. Train Best Classifier and Show Top Coefficients
Train a Logistic Regression classifier on the best feature settings and display the top positive and negative coefficients.

In [None]:
def fit_best_classifier(docs, labels, best_result):
    """Fit a LogisticRegression on `docs` using one result's settings.

    `best_result` supplies the 'punct', 'features' and 'min_freq' values
    used to tokenize and vectorize the documents.

    Returns:
        ``(clf, vocab)``: the fitted classifier and the vocabulary built
        while vectorizing.
    """
    punct, chosen_fns, min_freq = (
        best_result[key] for key in ('punct', 'features', 'min_freq')
    )
    tokenized = [tokenize(doc, punct) for doc in docs]
    X, vocab = vectorize(tokenized, chosen_fns, min_freq)
    model = LogisticRegression()
    model.fit(X, labels)
    return model, vocab

def top_coefs(clf, label, n, vocab):
    """Return the `n` features with the most extreme coefficients for a class.

    Args:
        clf: Fitted model; only ``clf.coef_[0]`` is read.
        label: 1 selects the largest coefficients (returned in descending
            order); any other value selects the smallest (ascending order).
        n: Number of features to return.
        vocab: ``{feature_name: column_index}`` mapping.

    Returns:
        A list of ``(feature_name, coefficient)`` tuples.
    """
    coef = clf.coef_[0]
    idx_to_feat = {v: k for k, v in vocab.items()}
    order = np.argsort(coef)
    if label == 1:
        # Reverse first, then slice: order[-n:] would return *every*
        # feature when n == 0, because -0 is the same index as 0.
        topn = order[::-1][:n]
    else:
        topn = order[:n]
    return [(idx_to_feat[i], coef[i]) for i in topn]

# Example usage (uncomment after running eval_all_combinations):
# clf, vocab = fit_best_classifier(docs_train, labels_train, results[0])
# print('Top negative coefficients:')
# print(top_coefs(clf, 0, 5, vocab))
# print('Top positive coefficients:')
# print(top_coefs(clf, 1, 5, vocab))

## 10. Test Set Evaluation and Misclassification Analysis
Evaluate the trained classifier on the test set, show accuracy, classification report, confusion matrix, and print the top misclassified documents.

In [None]:
def parse_test_data(best_result, vocab):
    """Read the test split and vectorize it against an existing vocabulary.

    The documents are read from ``data/test`` and tokenized and featurized
    with the settings in `best_result`. `vocab` is passed through to
    `vectorize`.

    Returns:
        ``(test_docs, test_labels, X_test)``.
    """
    punct, chosen_fns, min_freq = (
        best_result[key] for key in ('punct', 'features', 'min_freq')
    )
    test_docs, test_labels = read_data(os.path.join('data', 'test'))
    tokenized = [tokenize(doc, punct) for doc in test_docs]
    X_test = vectorize(tokenized, chosen_fns, min_freq, vocab)[0]
    return test_docs, test_labels, X_test

def print_top_misclassified(test_docs, test_labels, X_test, clf, n):
    """Print the `n` wrong predictions the classifier was most sure about.

    For every document whose prediction differs from its true label, the
    probability assigned to the (incorrect) predicted class is looked up;
    the predicted label is used as the column index into the
    ``predict_proba`` output. Errors are printed from highest to lowest
    probability.
    """
    probs = clf.predict_proba(X_test)
    preds = clf.predict(X_test)
    errors = [
        (truth, guess, probs[row][guess], test_docs[row])
        for row, (truth, guess) in enumerate(zip(test_labels, preds))
        if guess != truth
    ]
    errors.sort(key=lambda err: -err[2])
    for truth, guess, prob, doc in errors[:n]:
        print(f"\nTruth={truth} Predicted={guess} Proba={prob:.6f}\n{doc}")

# Example usage (uncomment after fitting best classifier):
# test_docs, test_labels, X_test = parse_test_data(results[0], vocab)
# preds = clf.predict(X_test)
# print('Test accuracy:', accuracy_score(test_labels, preds))
# print(classification_report(test_labels, preds, digits=4))
# print('Confusion matrix:')
# print(confusion_matrix(test_labels, preds))
# print_top_misclassified(test_docs, test_labels, X_test, clf, 5)

## 11. TF-IDF and N-gram Feature Extraction with Multiple Models
Demonstrate feature extraction using TfidfVectorizer with different n-gram ranges and train multiple classifiers (Logistic Regression, SVM, Random Forest, Naive Bayes) with cross-validation.

In [None]:
def get_classifier(name):
    """Build a new, unfitted classifier for a short model name.

    Recognised names: 'logreg' (LogisticRegression), 'svm' (LinearSVC),
    'rf' (RandomForestClassifier) and 'nb' (MultinomialNB).

    Raises:
        ValueError: If `name` is not one of the recognised names.
    """
    # Factories are lambdas so only the requested estimator is constructed.
    factories = {
        'logreg': lambda: LogisticRegression(max_iter=200),
        'svm': lambda: LinearSVC(max_iter=2000),
        'rf': lambda: RandomForestClassifier(n_estimators=100),
        'nb': lambda: MultinomialNB(),
    }
    if name not in factories:
        raise ValueError(f"Unknown classifier: {name}")
    return factories[name]()

# Example: TF-IDF with unigrams and bigrams, multiple models
# ngram_range = (1,2)
# vectorizer = TfidfVectorizer(lowercase=True, ngram_range=ngram_range, stop_words='english')
# X = vectorizer.fit_transform(docs_train)
# models = ['logreg', 'svm', 'rf', 'nb']
# for model in models:
#     clf = get_classifier(model)
#     kf = KFold(n_splits=5, shuffle=False)
#     accs = []
#     for train_idx, test_idx in kf.split(X):
#         clf.fit(X[train_idx], labels_train[train_idx])
#         preds = clf.predict(X[test_idx])
#         accs.append(accuracy_score(labels_train[test_idx], preds))
#     print(f"{model} mean CV accuracy: {np.mean(accs):.4f}")

## 12. Feature Importance and ROC Curve Visualization
Plot feature importances for Logistic Regression and Random Forest models, and plot/save the ROC curve for classifiers that support probability estimates.

In [None]:
def plot_feature_importance(feature_names, importances, top_n=10, title='Feature Importance', filename='feature_importance.png'):
    """Draw a bar chart of the features with the largest absolute importance.

    Args:
        feature_names: Sequence indexable by feature position.
        importances: One numeric importance (or coefficient) per feature.
        top_n: Maximum number of bars; fewer are drawn when there are fewer
            features than this.
        title: Plot title.
        filename: Path the figure is saved to.
    """
    importances = np.asarray(importances)
    # Reverse first, then slice: slicing with [-top_n:] selects everything
    # when top_n == 0. The slice also caps the count at the number of
    # available features, so the bar positions always match the heights.
    ranked = np.argsort(np.abs(importances))[::-1]
    indices = ranked[:max(top_n, 0)]
    positions = range(len(indices))
    plt.figure(figsize=(10, 5))
    plt.bar(positions, importances[indices], align='center')
    plt.xticks(positions, [feature_names[i] for i in indices], rotation=45, ha='right')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()
    print(f'Feature importance plot saved as {filename}')

def plot_roc_curve(y_true, y_score, filename='roc_curve.png'):
    """Plot the ROC curve for a set of scores, save it and show it.

    The curve is computed from `y_true` and `y_score`, and the area under
    it appears in the legend. A dashed diagonal is drawn as a reference
    line. The figure is written to `filename`.
    """
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic')
    ax.legend(loc='lower right')
    fig.savefig(filename)
    plt.show()
    print(f'ROC curve saved as {filename}')

# Example usage (after fitting model):
# if hasattr(clf, 'coef_'):
#     plot_feature_importance(feature_names, clf.coef_[0])
# if hasattr(clf, 'feature_importances_'):
#     plot_feature_importance(feature_names, clf.feature_importances_)
# if hasattr(clf, 'predict_proba'):
#     probs = clf.predict_proba(X_test)[:, 1]
#     plot_roc_curve(labels_test, probs)