# MovieSent – EDA & Prototyping

This notebook explores the dataset, demonstrates text preprocessing, and prototypes features for both models: the TF-IDF + logistic-regression baseline and the LSTM.


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.utils.text_preprocess import load_reviews_csv, prepare_dataset, preprocess_text


In [None]:
# Load dataset
# The CSV is looked up in the directory the notebook kernel is running from.
working_dir = os.getcwd()
DATA_PATH = os.path.join(working_dir, 'IMDB Dataset.csv')

# Read the raw reviews, then pass them through the project's preparation step.
df_raw = load_reviews_csv(DATA_PATH)
df = prepare_dataset(df_raw)

# Preview the first five prepared rows.
df.head(n=5)


In [None]:
# Basic info: dataset dimensions, followed by the count of missing values
# in each column.
shape = df.shape
print(shape)

missing_per_column = df.isna().sum()
missing_per_column


In [None]:
# Class balance: one bar per distinct value in the 'label' column.
ax = sns.countplot(x=df['label'])
ax.set_title('Label Distribution (after mapping)')
plt.show()


In [None]:
# Preprocessing demo: show the first review before and after preprocess_text.
preview_len = 400  # only the leading characters are printed to keep output short

sample = df['review'].iloc[0]
print('Original:', sample[:preview_len])

processed = preprocess_text(sample)
print('\nProcessed:', processed[:preview_len])


In [None]:
# TF-IDF baseline quick check
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Keep only the rows whose label is 0 or 1.
is_binary = df['label'].isin([0, 1])
bin_df = df[is_binary]
X = bin_df['clean_review'].values
y = bin_df['label'].astype(int).values

# Stratified 70/30 train/test split with a fixed seed for repeatability.
Xtr, Xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Unigram + bigram TF-IDF features. The vectorizer is fitted on the training
# split only and then applied unchanged to the test split.
vec = TfidfVectorizer(ngram_range=(1, 2), max_features=50000, min_df=2)
Xtrv = vec.fit_transform(Xtr)
Xtev = vec.transform(Xte)

# Fit the linear classifier and report per-class metrics on the held-out split.
clf = LogisticRegression(max_iter=500).fit(Xtrv, ytr)
yte_pred = clf.predict(Xtev)
print(classification_report(yte, yte_pred))


In [None]:
# Tokenizer setup demo (LSTM)
from tensorflow import keras

MAX_VOCAB = 20000  # passed to the tokenizer as num_words
MAX_LEN = 200      # passed to pad_sequences as maxlen

# Local aliases for the two Keras preprocessing helpers used below.
Tokenizer = keras.preprocessing.text.Tokenizer
pad_sequences = keras.preprocessing.sequence.pad_sequences

# Every cleaned review from the binary-label subset, coerced to str.
texts = bin_df['clean_review'].astype(str).tolist()

# Build the word index from all texts, using '<OOV>' as the out-of-vocabulary token.
tok = Tokenizer(num_words=MAX_VOCAB, oov_token='<OOV>')
tok.fit_on_texts(texts)

# Encode only the first five texts as a quick shape check.
first_five = texts[:5]
seq = tok.texts_to_sequences(first_five)
pad = pad_sequences(seq, maxlen=MAX_LEN)
pad.shape
