##**NLP Sentiment Analysis**

   ### Objective: Train → Save → Evaluate → Load → Predict

In [None]:
!pip install tensorflow nltk

Import libraries

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import string
import os
import pickle

    Downloads the tokenizer (punkt) and list of English stopwords.

In [None]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Download the missing resource

In [4]:
# 1. Sample raw text and labels
raw_sentences = [
    "I love machine learning!",
    "This is a great course.",
    "NLP is fascinating.",
    "I hate this subject.",
    "This is boring.",
    "I do not like winters."
]
labels = [1, 1, 1, 0, 0, 0]  # 1 = positive, 0 = negative

In [5]:
# 2. Preprocessing: lowercase, remove punctuation, remove stopwords
stop_words = set(stopwords.words('english'))

In [6]:
print(stop_words)

{"it'd", "isn't", 'him', 'its', 'under', 'but', 'do', "he'll", 'not', 'by', "she's", 'nor', "weren't", 'shan', "haven't", 'your', 'into', 't', 'up', 'has', "hasn't", 'm', 'being', 'over', 'should', "they've", 'no', 'very', 'myself', "should've", 'too', 'aren', 'before', 'me', 'few', 'hadn', 'shouldn', 'll', "shouldn't", 'this', 'a', 'when', 'through', 'above', 'it', "he'd", 'again', "don't", "aren't", 'herself', 'on', 'themselves', 'while', 'hasn', 'hers', 'isn', 'during', 'had', 'whom', "you've", "i'll", 'does', "it'll", "won't", 'own', 'he', "he's", 'so', "they'd", 'because', 'each', "shan't", 'at', 'these', "they're", 'where', 'if', 'what', 'is', "they'll", 'against', 'mightn', "you're", 'she', 'off', 'be', 'all', 'why', 'my', 'down', 'was', "i'd", 'am', 'both', "we're", 'them', 'don', 'they', 'doesn', 'about', 'haven', 'did', "that'll", 'his', "didn't", "you'd", "it's", "we'll", 're', 'to', "mightn't", "needn't", 'of', 'only', 'ma', 'most', 'some', 'just', 'once', 'in', 'have', 'o'

In [7]:
# 2. Preprocessing: lowercase, remove punctuation, remove stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(sentence):
    tokens = word_tokenize(sentence.lower())  # lowercase + tokenize
    tokens = [word for word in tokens if word.isalpha()]  # remove punctuation
    tokens = [word for word in tokens if word not in stop_words]  # remove stopwords
    return ' '.join(tokens)

cleaned_sentences = [preprocess_text(sent) for sent in raw_sentences]

In [8]:
print(cleaned_sentences)

['love machine learning', 'great course', 'nlp fascinating', 'hate subject', 'boring', 'like winters']


In [9]:
# Tokenize texts
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(cleaned_sentences)
sequences = tokenizer.texts_to_sequences(cleaned_sentences)
padded = pad_sequences(sequences, padding='post')



In [10]:
print("Word Index (Tokenized Vocabulary):")
print(tokenizer.word_index)

Word Index (Tokenized Vocabulary):
{'<OOV>': 1, 'love': 2, 'machine': 3, 'learning': 4, 'great': 5, 'course': 6, 'nlp': 7, 'fascinating': 8, 'hate': 9, 'subject': 10, 'boring': 11, 'like': 12, 'winters': 13}


In [11]:
print("\nTokenized Sentences (as sequences):")
for i, seq in enumerate(sequences):
    print(f"{raw_sentences[i]} → {seq}")


Tokenized Sentences (as sequences):
I love machine learning! → [2, 3, 4]
This is a great course. → [5, 6]
NLP is fascinating. → [7, 8]
I hate this subject. → [9, 10]
This is boring. → [11]
I do not like winters. → [12, 13]


In [12]:
# with open("tokenizer.pkl", "rb") as f:
#     tokenizer = pickle.load(f)
#     print(tokenizer)

In [13]:
# Build model with input_shape explicitly declared in build()
model = Sequential([
    Embedding(input_dim=100, output_dim=16, input_length=padded.shape[1]),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Explicitly build the model before summary
model.build(input_shape=(None, padded.shape[1]))

# Now summary will display shapes and parameters
model.summary()




In [14]:
# Compile and train
import numpy as np

# Convert labels list to numpy array
labels = np.array(labels)

# Compile and train
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded, labels, epochs=5, verbose=2)



Epoch 1/5
1/1 - 3s - 3s/step - accuracy: 0.6667 - loss: 0.6912
Epoch 2/5
1/1 - 0s - 141ms/step - accuracy: 0.8333 - loss: 0.6896
Epoch 3/5
1/1 - 0s - 90ms/step - accuracy: 0.8333 - loss: 0.6882
Epoch 4/5
1/1 - 0s - 128ms/step - accuracy: 0.8333 - loss: 0.6868
Epoch 5/5
1/1 - 0s - 74ms/step - accuracy: 0.8333 - loss: 0.6854


<keras.src.callbacks.history.History at 0x7f562248ee90>

In [15]:
# 9. Testing on new sentences
test_sentences = ["I love this subject!", "This is terrible."]
cleaned_test = [preprocess_text(sent) for sent in test_sentences]
test_seq = tokenizer.texts_to_sequences(cleaned_test)
test_pad = pad_sequences(test_seq, padding='post', maxlen=padded.shape[1])

predictions = model.predict(test_pad)

for i, sentence in enumerate(test_sentences):
    sentiment = "Positive" if predictions[i][0] > 0.5 else "Negative"
    print(f"'{sentence}' → {sentiment} ({predictions[i][0]:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
'I love this subject!' → Positive (0.50)
'This is terrible.' → Negative (0.50)
