# 05 - Feature Engineering

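This notebook prepares the cleaned Netflix review data for model training: it encodes the sentiment labels as integers, makes a stratified train/test split, builds TF-IDF features, and saves the fitted label encoder and vectorizer to `../models/processed`.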

## Imports

In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
import joblib
import matplotlib.pyplot as plt


## Ensure directories exist and load dataset

In [5]:
# Make sure the output folders exist before anything is written
# (exist_ok=True makes this a no-op when they are already there).
for out_dir in ("../models/processed", "../assets"):
    os.makedirs(out_dir, exist_ok=True)

# Location of the cleaned reviews CSV, relative to the notebook's working directory
CLEANED_OUTPUT_LABELED_PATH = Path("../data/netflix_reviews_Cleaned.csv")

# Read the CSV into a DataFrame and report how many rows were loaded
df = pd.read_csv(CLEANED_OUTPUT_LABELED_PATH)
print(f"✅ Loaded Cleaned_labeled dataset with {len(df)} rows")



✅ Loaded Cleaned_labeled dataset with 41238 rows


## Encode target sentiment

In [6]:
# Fit the encoder on the sentiment labels and store the resulting integer
# class ids in a new column; `le` keeps the label <-> id mapping.
le = LabelEncoder()
sentiment_labels = df['sentiment_combined']
df['sentiment_encoded'] = le.fit_transform(sentiment_labels)

## Save label encoder

In [8]:
# Persist the fitted label encoder; joblib.dump returns the list of files written,
# which is what the cell displays.
label_encoder_path = "../models/processed/label_encoder.joblib"
joblib.dump(le, label_encoder_path)

['../models/processed/label_encoder.joblib']

## Split dataset

In [9]:
# Model input is the `clean_review` column; the target is the integer-encoded sentiment.
X = df['clean_review']
y = df['sentiment_encoded']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both splits, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## TF-IDF Vectorization

In [12]:
# Vectorizer settings: unigrams and bigrams, with the vocabulary capped at 10,000 features
tfidf_params = {"max_features": 10000, "ngram_range": (1, 2)}
tfidf = TfidfVectorizer(**tfidf_params)

# Fit on the training split only, then apply the same fitted vectorizer to the
# test split so no test-set statistics influence the learned vocabulary/IDF weights.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

## Save TF-IDF vectorizer

In [13]:
# Persist the fitted TF-IDF vectorizer; joblib.dump returns the list of files written,
# which is what the cell displays.
tfidf_vectorizer_path = "../models/processed/tfidf_vectorizer.joblib"
joblib.dump(tfidf, tfidf_vectorizer_path)

['../models/processed/tfidf_vectorizer.joblib']

### Feature Engineering Completed!

➡️ Next Notebook: **06_model_training.ipynb**

