In [3]:
import random
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed, to_categorical

# Task 1: Build a synthetic, labelled movie-review dataset

In [4]:
# Generate synthetic IMDB comments for Good, Average, and Bad
SEED = 42          # fixed seed so the shuffled row order is the same on every run
N_PER_CLASS = 300  # number of synthetic comments generated for each label

good_comments = [f'This movie was fantastic! I loved it. {i}' for i in range(N_PER_CLASS)]
average_comments = [f'The movie was okay, not great but not bad either. {i}' for i in range(N_PER_CLASS)]
bad_comments = [f'I did not enjoy this movie at all. It was terrible. {i}' for i in range(N_PER_CLASS)]

comments = good_comments + average_comments + bad_comments
labels = ['Good'] * N_PER_CLASS + ['Average'] * N_PER_CLASS + ['Bad'] * N_PER_CLASS

# Shuffle the (comment, label) pairs together so each comment keeps its label.
# A private Random instance is used instead of the module-level generator so
# re-running this cell always yields the same order and does not disturb the
# global random state used elsewhere.
combined = list(zip(comments, labels))
random.Random(SEED).shuffle(combined)
comments, labels = zip(*combined)

# Create DataFrame with one row per comment
df = pd.DataFrame({'comment': comments, 'label': labels})
df.head()

Unnamed: 0,comment,label
0,I did not enjoy this movie at all. It was terr...,Bad
1,"The movie was okay, not great but not bad eith...",Average
2,"The movie was okay, not great but not bad eith...",Average
3,I did not enjoy this movie at all. It was terr...,Bad
4,I did not enjoy this movie at all. It was terr...,Bad


# Tasks 2 and 3: Preprocess the text and train a neural-network classifier

In [5]:

def preprocess_text(text):
    """Normalize a raw comment string for vectorization.

    The text is lowercased, every character that is neither an ASCII letter
    nor whitespace is dropped (punctuation, digits, symbols), and each run of
    whitespace is collapsed to a single space with the ends trimmed.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned comment as a ``str``.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

# Apply preprocessing
df['clean_comment'] = df['comment'].apply(preprocess_text)

# Encode labels as integers. LabelEncoder sorts the class names, so the
# mapping is always Average=0, Bad=1, Good=2.
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
n_classes = len(le.classes_)

# One-hot encode the labels for the softmax output layer
y = to_categorical(df['label_encoded'], num_classes=n_classes)

# Split the cleaned text BEFORE vectorizing, so the TF-IDF vocabulary and IDF
# weights are learned from the training rows only and nothing about the test
# rows leaks into the features.
SEED = 42
texts_train, texts_test, y_train, y_test = train_test_split(
    df['clean_comment'], y, test_size=0.1, random_state=SEED
)

# Vectorize the text using TF-IDF: fit on train, reuse the fitted vocabulary on test
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(texts_train).toarray()
X_test = vectorizer.transform(texts_test).toarray()
# Full feature matrix, kept under its original name for any later cell that
# still refers to it; it is not used for fitting.
X = vectorizer.transform(df['clean_comment']).toarray()

# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling start from the same state on every run.
set_random_seed(SEED)

# Build the model. An explicit Input layer replaces the deprecated
# `input_dim=` argument on the first Dense layer.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(n_classes, activation='softmax'),  # one output unit per class
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model for at least 100 epochs
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, verbose=1)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.8096 - loss: 0.9607 - val_accuracy: 1.0000 - val_loss: 0.6303
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.5073 - val_accuracy: 1.0000 - val_loss: 0.2226
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.1536 - val_accuracy: 1.0000 - val_loss: 0.0528
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0398 - val_accuracy: 1.0000 - val_loss: 0.0191
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0161 - val_accuracy: 1.0000 - val_loss: 0.0102
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0090 - val_accuracy: 1.0000 - val_loss: 0.0065
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━

# Task 4: Compare predicted and actual labels on sample test comments

In [8]:

# Draw a reproducible random sample of test rows to spot-check the model.
# The sample size is clamped to the test-set size because sampling without
# replacement raises if more rows are requested than exist.
num_samples = min(10, X_test.shape[0])
rng = np.random.default_rng(42)  # local seeded generator: same rows on every run
indices = rng.choice(X_test.shape[0], size=num_samples, replace=False)
X_sample = X_test[indices]
y_true = y_test[indices]

# Predict class probabilities, then take the most probable class per row
y_pred_probs = model.predict(X_sample, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)
# y_true is one-hot encoded, so argmax recovers the integer class id
y_true_labels = np.argmax(y_true, axis=1)

# Decode integer class ids back to the original string labels
predicted_labels = le.inverse_transform(y_pred)
true_labels = le.inverse_transform(y_true_labels)

for example_no, (predicted, actual) in enumerate(zip(predicted_labels, true_labels), start=1):
    print(f"Example {example_no}:")
    print(f"Predicted: {predicted}, Actual: {actual}")
    print("---")

Example 1:
Predicted: Bad, Actual: Bad
---
Example 2:
Predicted: Average, Actual: Average
---
Example 3:
Predicted: Good, Actual: Good
---
Example 4:
Predicted: Average, Actual: Average
---
Example 5:
Predicted: Good, Actual: Good
---
Example 6:
Predicted: Good, Actual: Good
---
Example 7:
Predicted: Good, Actual: Good
---
Example 8:
Predicted: Average, Actual: Average
---
Example 9:
Predicted: Bad, Actual: Bad
---
Example 10:
Predicted: Good, Actual: Good
---
