# Experiment Template

**Team Member:** [Your Name]  
**Model:** [e.g., RNN, LSTM, GRU, Logistic Regression]  
**Embedding:** [e.g., TF-IDF, Word2Vec (Skip-gram), GloVe]

---

## 1. Setup & Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import os
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download NLTK data
# Each resource is looked up locally first and downloaded only when missing.
# 'punkt_tab' is what word_tokenize loads on NLTK >= 3.9; 'punkt' is kept so
# the notebook also works with older NLTK releases.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
)
for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# Set plot style
sns.set(style="whitegrid")

## 2. Load Data

In [None]:
# Location of the raw IMDB reviews CSV, relative to the notebook.
DATA_PATH = '../data/IMDB Dataset.csv'
if os.path.exists(DATA_PATH):
    # Load the dataset and show its shape plus the first few rows.
    df = pd.read_csv(DATA_PATH)
    print(f"Dataset loaded successfully: {df.shape}")
    display(df.head())
else:
    # Only warn: `df` stays undefined, so the cells below need the file in place.
    print(f"Warning: {DATA_PATH} not found. Please ensure the dataset is in the data folder.")

## 3. Exploratory Data Analysis (EDA)
Understanding the dataset characteristics.

In [None]:
# Class distribution: bar chart of label frequencies, followed by the raw counts.
# Skipped entirely when the frame has no 'sentiment' column.
if 'sentiment' in df.columns:
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x='sentiment')
    plt.title('Class Distribution')
    plt.show()

    label_counts = df['sentiment'].value_counts()
    print(label_counts)

In [None]:
# Review length analysis.
# A review's length is the number of whitespace-separated tokens in the raw text.
df['word_count'] = df['review'].apply(lambda review: len(review.split()))

# Histogram (with KDE overlay) of the per-review word counts.
plt.figure(figsize=(10, 6))
sns.histplot(df['word_count'], kde=True, bins=50)
plt.xlabel('Number of Words')
plt.title('Review Length Distribution (Word Count)')
plt.show()

mean_word_count = df['word_count'].mean()
print(f"Average word count: {mean_word_count:.2f}")

## 4. Preprocessing
Applying the shared preprocessing strategy:
1. Lowercase
2. Remove HTML tags
3. Remove special characters
4. Tokenize
5. Remove stopwords (optional, depending on the embedding)

In [None]:
# Stopword sets keyed by language, filled lazily by _get_stopwords().
_STOPWORDS_CACHE = {}


def _get_stopwords(language='english'):
    """Return the NLTK stopword list for *language* as a frozenset.

    The list is read from the NLTK corpus on first use and memoised, so
    calling ``clean_text`` once per row does not reload it every time.
    """
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_CACHE[language]


def clean_text(text, remove_stopwords=True):
    """Normalise one raw review into a space-joined string of tokens.

    Steps: lowercase, strip HTML tags, drop every character that is not a
    letter, digit or whitespace, tokenize with ``word_tokenize``, and
    optionally filter out English stopwords.

    Args:
        text: Raw review text (a ``str``).
        remove_stopwords: When True (default), tokens found in the NLTK
            English stopword list are dropped.

    Returns:
        The remaining tokens joined by single spaces.

    NOTE(review): apostrophes are removed in step 3, so a contraction such as
    "don't" reaches the stopword filter as "dont" -- confirm that this is the
    intended behaviour for the shared strategy.
    """
    # 1. Lowercase
    text = text.lower()
    # 2. HTML tag removal. Tags are replaced by a space (not an empty string)
    #    so words separated only by markup, e.g. "great.<br /><br />the",
    #    are not glued into a single token.
    text = re.sub(r'<.*?>', ' ', text)
    # 3. Remove non-alphanumeric characters (preserving whitespace).
    #    The text is already lowercase, so a-z covers all ASCII letters.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # 4. Tokenize
    tokens = word_tokenize(text)

    # 5. Remove stopwords (optional, depending on the embedding)
    if remove_stopwords:
        stop_words = _get_stopwords()
        tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Apply cleaning (default strategy: stopwords removed) to every raw review
# and show the raw and cleaned text side by side for the first rows.
print("Preprocessing data (this may take a moment)...")
df['cleaned_review'] = df['review'].apply(clean_text, remove_stopwords=True)
preview = df[['review', 'cleaned_review']].head()
display(preview)

### Post-Preprocessing Visualization

In [None]:
# Word cloud built from all cleaned reviews joined into one string.
all_text = " ".join(df['cleaned_review'])
wordcloud = WordCloud(background_color='white', width=800, height=400)
wordcloud.generate(all_text)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud (Cleaned Data)')
plt.show()

In [None]:
# Train/Test Split
# Features are the cleaned review strings; labels are binary-encoded:
# 'positive' -> 1, any other value -> 0.
X = df['cleaned_review'].values
y = (df['sentiment'] == 'positive').astype('int64').values

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

## 5. Embedding Layer
Implement your specific embedding here (TF-IDF, Word2Vec, GloVe, etc.).

In [None]:
# TODO: Implement Embedding
# X_train / X_test come from df['cleaned_review'], i.e. they still hold text;
# turn them into numeric features in this cell.
# Example for Word2Vec/GloVe: Prepare Tokenizer and Embedding Matrix
# Example for TF-IDF: Use TfidfVectorizer

pass

## 6. Model Architecture
Define your model (RNN, LSTM, GRU, or Traditional ML).

In [None]:
# TODO: Define Model
# `model` is only a placeholder; assign the actual model object here before
# using it in the training and evaluation cells.
model = None

## 7. Training

In [None]:
# TODO: Train model
# NOTE(review): the example below passes the test split as validation data.
# If the validation score is used for tuning or early stopping, consider
# holding out part of the training data instead so the test set stays unseen.
# history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5)

## 8. Evaluation

In [None]:
# TODO: Evaluate
# y_pred = model.predict(X_test)
# y_pred_class = (y_pred > 0.5).astype(int)

# print("Accuracy:", accuracy_score(y_test, y_pred_class))
# print(classification_report(y_test, y_pred_class))

# Confusion Matrix
# cm = confusion_matrix(y_test, y_pred_class)
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.show()

## 9. Save Results

In [None]:
# results = {
#     'model': 'MyModel',
#     'embedding': 'MyEmbedding',
#     'accuracy': 0.0
# }
# result_df = pd.DataFrame([results])
# result_df.to_csv('../results/experiment_results.csv', mode='a', header=not os.path.exists('../results/experiment_results.csv'), index=False)