# Sentiment Classifier (Converted to Keras)
This notebook is a Keras-based version of the original PyTorch implementation.

In [None]:
# IO
import os
import csv
import pathlib
from pathlib import Path
import warnings
from tqdm.notebook import tqdm

# Utilities
import numpy as np 
import pandas as pd
import copy

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Modeling and Training
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import (
    roc_curve, auc, roc_auc_score,
    accuracy_score, precision_score, recall_score, 
    f1_score, confusion_matrix
)

# Ignore every warning whose category is FutureWarning from this point on.
warnings.filterwarnings("ignore", category=FutureWarning)

## Data Loading and Preprocessing

In [None]:
# Example (adapt to your dataset)
df = pd.read_csv('your_dataset.csv')

# Basic preprocessing (tokenization, stopword removal, lemmatization)
# Resources needed by the NLTK calls in this notebook:
#   - 'punkt' / 'punkt_tab': models behind word_tokenize. NLTK >= 3.9 loads
#     'punkt_tab' instead of the pickled 'punkt' data, so both are fetched to
#     keep the notebook runnable on old and new NLTK releases.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the lexicon used by WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Set membership makes the per-token stop-word test cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize one raw document into a space-joined string of lemmas.

    The text is lower-cased and split with ``word_tokenize``; tokens that are
    not purely alphabetic or that appear in ``stop_words`` are dropped, and the
    remaining tokens are passed through ``lemmatizer.lemmatize``.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the NaN floats pandas uses for empty CSV cells, or ``None``) is
            treated as an empty document.

    Returns:
        The surviving lemmas joined by single spaces; ``''`` when the input is
        not a string or no token survives filtering.
    """
    # Missing cells would otherwise crash on `.lower()` with AttributeError.
    if not isinstance(text, str):
        return ''
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words]
    return ' '.join(tokens)

# Clean every document with preprocess() and keep the result next to the raw text.
raw_text = df['text_column']  # replace 'text_column'
df['processed_text'] = raw_text.apply(preprocess)

# Label encoding: map the raw labels to integer class ids.
le = LabelEncoder()
raw_labels = df['label_column']  # replace 'label_column'
df['label_encoded'] = le.fit_transform(raw_labels)

# Train/validation split: 80/20, stratified on the encoded label, fixed seed.
encoded_labels = df['label_encoded']
X_train, X_val, y_train, y_val = train_test_split(
    df['processed_text'],
    encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)


## Tokenization and Vectorization

In [None]:
# Every sequence is padded (at the end) or truncated to this many tokens.
maxlen = 100

# The vocabulary is learned from the training split only.
tokenizer = keras.preprocessing.text.Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(X_train)

# Texts -> lists of integer word ids, for both splits.
X_train_seq, X_val_seq = (
    tokenizer.texts_to_sequences(split_texts) for split_texts in (X_train, X_val)
)

# Lists of ids -> fixed-width integer matrices.
X_train_pad, X_val_pad = (
    keras.preprocessing.sequence.pad_sequences(split_seqs, padding='post', maxlen=maxlen)
    for split_seqs in (X_train_seq, X_val_seq)
)


## Model Definition (Keras)

In [None]:
# Seed Python, NumPy and the Keras backend before any weights are created so
# that a fresh "Restart & Run All" builds and trains the same model.
keras.utils.set_random_seed(42)

model = keras.Sequential([
    # Explicit input spec; replaces the `input_length=` Embedding argument,
    # which newer Keras releases deprecate or reject.
    keras.Input(shape=(maxlen,)),
    # input_dim comes from the tokenizer so the two cannot drift apart.
    # mask_zero=True makes the LSTM skip the 0-valued padding that
    # pad_sequences(..., padding='post') appends to short sequences.
    layers.Embedding(input_dim=tokenizer.num_words, output_dim=64, mask_zero=True),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    # Single sigmoid unit: the model emits one value in (0, 1) per example.
    layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()


## Training

In [None]:
# Fit settings gathered in one place: 10 epochs, mini-batches of 32, with the
# held-out split evaluated after every epoch.
fit_options = {
    "validation_data": (X_val_pad, y_val),
    "epochs": 10,
    "batch_size": 32,
}
history = model.fit(X_train_pad, y_train, **fit_options)


## Evaluation

In [None]:
# Model outputs for the validation split, flattened to 1-D, then thresholded
# at 0.5 to obtain hard 0/1 predictions.
y_pred_probs = model.predict(X_val_pad).flatten()
y_pred = (y_pred_probs > 0.5).astype(int)

# Threshold-dependent metrics, printed one per line.
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_val, y_pred))

# ROC curve and its area, computed from the raw scores rather than the labels.
fpr, tpr, _ = roc_curve(y_val, y_pred_probs)
roc_auc = auc(fpr, tpr)

plt.figure()
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
plt.show()
