<a href="https://colab.research.google.com/github/mayssamhira/Deep-Learning-and-Neural-Networks-project/blob/main/Deep_Learning_and_Neural_Networks_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install packages needed for this project
!pip install -q pandas scikit-learn tensorflow nltk matplotlib seaborn wordcloud

**Imports + check GPU**

In [2]:
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_curve

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Check for GPU
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices
# is the supported replacement (an empty list means no GPU is visible).
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU available:", "Yes" if gpu_devices else "No")
print("TensorFlow version:", tf.__version__)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


GPU available: No
TensorFlow version: 2.19.0


**Upload the dataset**

In [None]:
from google.colab import files

DATA_PATH = "/content/IMDB Dataset.csv"

# Prompt for an upload only when the CSV is not on disk yet, so re-running the
# notebook does not block on the interactive file picker.
if os.path.exists(DATA_PATH):
    uploaded = {}  # nothing new was uploaded; keep the name defined
else:
    uploaded = files.upload()  # Upload 'IMDB Dataset.csv' from your computer

# Fail here with a clear message instead of a later read error if the
# expected file is still missing (e.g. a differently named file was uploaded).
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"Expected dataset at {DATA_PATH!r}; uploaded files: {sorted(uploaded)}"
    )


**Load dataset and explore**

In [None]:
# Read the CSV into a DataFrame and take a first look at it
df = pd.read_csv(DATA_PATH)
print("Dataset shape:", df.shape)
display(df.head())

# How many reviews fall into each sentiment class
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Whitespace-delimited word count per review, kept as a column for plotting
df['review_length_words'] = df['review'].map(
    lambda review: len(str(review).split())
)
length_summary = df['review_length_words'].describe()
display(length_summary)


**NLTK setup + preprocessing functions**

In [None]:
# Fetch the tokenizer models and the stop-word list.
# Recent NLTK releases load the 'punkt_tab' resource inside word_tokenize,
# while older ones load 'punkt'; requesting both keeps the notebook working
# on either version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Set membership makes the per-token stop-word lookup cheap
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a raw review into lowercase alphanumeric words.

    The input is converted with ``str()`` and lowercased. HTML tags, URLs and
    every character other than ``a-z``, ``0-9`` and whitespace are each
    replaced by a space, then whitespace runs are collapsed to a single space
    and the ends are trimmed.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    normalized = str(text).lower()
    # Order matters: tags and URLs must go before punctuation is stripped,
    # otherwise their delimiters would already be gone.
    for pattern in (r'<.*?>', r'http\S+|www\.\S+', r'[^a-z0-9\s]'):
        normalized = re.sub(pattern, ' ', normalized)
    collapsed = re.sub(r'\s+', ' ', normalized)
    return collapsed.strip()

def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` and drop stop words and single-character tokens.

    Args:
        text: String to tokenize with ``word_tokenize``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    return " ".join(
        token
        for token in word_tokenize(text)
        if len(token) > 1 and token not in STOPWORDS
    )


**Apply preprocessing**

In [None]:
# Clean each review, then tokenize it and strip stop words, in one chain
df['cleaned'] = (
    df['review']
    .apply(clean_text)
    .apply(tokenize_and_remove_stopwords)
)

# Spot-check a few random rows: raw text next to its processed form
preview = df[['review', 'cleaned']].sample(3)
display(preview)


**Visualize sentiment distribution and review lengths**

In [None]:
# Number of reviews per sentiment class
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='sentiment', data=df, ax=ax)
ax.set_title("Sentiment Distribution")
plt.show()

# Histogram of review length in words
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['review_length_words'], bins=60, ax=ax)
ax.set_title("Review Length Distribution")
plt.show()


**TF-IDF vectorization**

In [None]:
MAX_FEATURES = 20000
# NOTE: these two values must stay equal to the test_size / random_state used
# by the train_test_split call on (X, y), so that the rows selected here are
# exactly the rows that end up in the training set.
SPLIT_TEST_SIZE = 0.15
SPLIT_SEED = 42

# Encode labels first; they are needed to stratify the index split below
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])  # negative=0, positive=1

# Positions of the training rows. With a fixed random_state and the same
# stratification labels, train_test_split picks the same rows regardless of
# which array it is given.
train_idx, _ = train_test_split(
    np.arange(len(df)),
    test_size=SPLIT_TEST_SIZE,
    stratify=y,
    random_state=SPLIT_SEED,
)

# Learn vocabulary and IDF weights from the training rows only, so no
# statistics from the held-out test reviews leak into the features.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=(1,2))
vectorizer.fit(df['cleaned'].iloc[train_idx])

# Transform the whole corpus; X keeps one row per review, in df order
X = vectorizer.transform(df['cleaned'])

print("TF-IDF matrix shape:", X.shape)
print("Classes:", le.classes_)


**Train/Test split**

In [None]:
# Stratified 85/15 split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

# Convert sparse matrix to dense for Keras.
# Cast to float32 while the data is still sparse: TfidfVectorizer produces
# float64 by default, so densifying first would briefly allocate a full
# float64 dense matrix (twice the size) before the float32 copy is made.
X_train = X_train.astype('float32').toarray()
X_test = X_test.astype('float32').toarray()
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


**Build the neural network**

In [None]:
input_dim = X_train.shape[1]

# Seed Python, NumPy and TensorFlow so weight initialisation and dropout
# are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Feed-forward classifier over the TF-IDF features: three ReLU layers of
# decreasing width, each followed by dropout, and a sigmoid output unit.
# The input shape is declared with an explicit Input object; passing
# `input_shape=` to the first Dense layer is discouraged in current Keras.
model = Sequential([
    tf.keras.Input(shape=(input_dim,)),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


**Callbacks for training**

In [None]:
# Stop when validation loss has not improved for 3 epochs and roll the model
# back to its best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Halve the learning rate after 2 epochs without improvement, down to 1e-6
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)

callbacks = [early_stopping, lr_on_plateau]


**Train the model**

In [None]:
# Training settings; 10% of the training arrays are held out for validation
fit_options = dict(
    validation_split=0.1,
    epochs=10,
    batch_size=128,
    callbacks=callbacks,
    verbose=2,
)

history = model.fit(X_train, y_train, **fit_options)


**Evaluate the model**

In [None]:
# Loss and accuracy on the held-out test set
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {loss:.4f}, Test accuracy: {acc:.4f}")

# Predicted probabilities, thresholded at 0.5 to obtain hard labels
y_probs = model.predict(X_test).ravel()
is_positive = y_probs >= 0.5
y_pred = is_positive.astype(int)

report = classification_report(y_test, y_pred, target_names=le.classes_)
print("\nClassification Report:\n", report)

# Confusion matrix as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# ROC curve with the chance diagonal for reference
auc = roc_auc_score(y_test, y_probs)
fpr, tpr, _ = roc_curve(y_test, y_probs)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {auc:.4f}")
ax.plot([0, 1], [0, 1], '--', alpha=0.5)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()


**Predict on a new review**

In [None]:
def predict_review(text):
    """Classify one review with the fitted vectorizer and trained model.

    The text goes through the same cleaning and tokenization steps as the
    training data before being vectorized.

    Args:
        text: Raw review text.

    Returns:
        A ``(label, prob)`` tuple: ``label`` is ``'positive'`` when the model
        output is at least 0.5 and ``'negative'`` otherwise; ``prob`` is the
        model output itself.
    """
    prepared = clean_text(text)
    prepared = tokenize_and_remove_stopwords(prepared)
    features = vectorizer.transform([prepared]).toarray().astype('float32')
    prob = model.predict(features)[0,0]
    if prob >= 0.5:
        return 'positive', prob
    return 'negative', prob

# Try the helper on a hand-written example
sample = "The movie was thrilling and exciting, I loved it!"
prediction = predict_review(sample)
label, prob = prediction
print(f"Review: {sample}\nPredicted sentiment: {label} (probability={prob:.4f})")
