<a href="https://colab.research.google.com/github/madhura2024/deep_learning/blob/main/Copy_of_sentiment_Text_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import re
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam


# ---- Binary sentiment classifier (Embedding + LSTM) on airline tweets ----

# Number of tokens every sequence is padded or truncated to. The training
# matrix, the Embedding layer and predict_sentiment() must all agree on this
# value, so it is defined exactly once.
max_len = 50

# Compiled once because they are applied to every tweet in the corpus.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')
_WHITESPACE = re.compile(r'\s+')

wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def _normalize_text(text):
    """Lowercase *text*, turn every non-letter into a space, collapse spaces."""
    text = _NON_LETTERS.sub(' ', text.lower())
    return _WHITESPACE.sub(' ', text).strip()


def _clean_text(text):
    """Return *text* as a space-joined string of lemmatized non-stopwords.

    Both the training corpus and the messages passed to predict_sentiment()
    go through this function, so they are always preprocessed identically.
    """
    return ' '.join(
        wordnet_lemmatizer.lemmatize(word)
        for word in _normalize_text(text).split()
        if word not in stop_words
    )


read = pd.read_csv('Airline-Sentiment-2-w-AA.csv')

# Keep only the two columns used below, rows that actually have tweet text,
# and rows labelled positive or negative. .copy() makes the filtered frame
# independent so that adding 'label' does not write into a slice of the
# original frame.
read = read[['text', 'airline_sentiment']].dropna(subset=['text'])
read = read[read['airline_sentiment'].isin(['positive', 'negative'])].copy()
read['label'] = read['airline_sentiment'].map({'positive': 1, 'negative': 0})

corpus = read['text'].values
labels = read['label'].values

cleaned_corpus = [_clean_text(sentence) for sentence in corpus]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_corpus)

sequences = tokenizer.texts_to_sequences(cleaned_corpus)
X = pad_sequences(sequences, maxlen=max_len, padding='pre')
y = np.array(labels).astype('float32')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Many-to-one network: a sequence of word ids in, one sigmoid probability out.
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_len))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(4, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32)

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


def predict_sentiment(msg):
    """Print whether the trained model rates *msg* as Positive or Negative.

    The message is cleaned exactly like the training corpus, converted to
    word ids with the fitted tokenizer, padded to max_len and scored by the
    model. A sigmoid output above 0.5 is reported as Positive, anything else
    as Negative. Nothing is returned.
    """
    # The message is echoed in this normalised (lowercase, letters-only) form.
    msg = _normalize_text(msg)
    cleaned_msg = _clean_text(msg)

    seq = tokenizer.texts_to_sequences([cleaned_msg])
    padded_seq = pad_sequences(seq, maxlen=max_len, padding='pre')

    pred = model.predict(padded_seq)[0][0]

    print("\nMessage:", msg)
    print("Sentiment:", "Positive" if pred > 0.5 else "Negative")


predict_sentiment("The flight was delayed and staff were rude")
predict_sentiment("Amazing service and friendly crew")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/10




[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 37ms/step - accuracy: 0.8050 - loss: 0.4750
Epoch 2/10
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 35ms/step - accuracy: 0.9460 - loss: 0.1553
Epoch 3/10
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 36ms/step - accuracy: 0.9745 - loss: 0.0810
Epoch 4/10
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - accuracy: 0.9850 - loss: 0.0536
Epoch 5/10
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 34ms/step - accuracy: 0.9892 - loss: 0.0407
Epoch 6/10
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - accuracy: 0.9895 - loss: 0.0371
Epoch 7/10
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 36ms/step - accuracy: 0.9939 - loss: 0.0227
Epoch 8/10
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 36ms/step - accuracy: 0.9946 - loss: 0.0182
Epoch 9/10
[1m289/289[0m [32m━━━

1. Why multiple stemmers were removed

Porter, Snowball, and Regexp stemmers all do the same job.

Using many stemmers makes code long and confusing.

Lemmatization alone is enough for sentiment analysis.

So extra stemmers were removed.

2. Why Named Entity Recognition (NER) was removed

NER finds names of people, places, or organizations.

Sentiment analysis only needs positive or negative feeling, and entity names do not change that.

So NER was removed.


3. Why Bag-of-Words (BoW) was removed

BoW counts words but does not keep word order.

LSTM needs word order to understand meaning.

Example:
“not good” ≠ “good not”
But BoW treats them as the same.

So BoW is not suitable for LSTM.

BoW is better for classical models, not sequence models.

4. Why TF-IDF and ANN were removed

TF-IDF produces fixed-length vectors.

ANN works on fixed-length input, not sequences.

LSTM works on sequences of word vectors, not on a single fixed-length vector.

Mixing ANN + LSTM in one project causes confusion.

So TF-IDF and ANN were removed.

5. Why Word2Vec was removed

Keras Embedding layer already learns word meanings.

Word2Vec is optional, not compulsory.

Using both increases complexity.

So Word2Vec was removed.

6. Why only tokenization + padding is used

LSTM needs text in sequence form.

Tokenizer converts words into numbers.

Padding makes all sequences the same length.

This is the correct input format for LSTM.

7. Why Embedding layer is used

Embedding layer converts word numbers into vectors.

It learns word meaning during training.

No need for external embeddings.

This works best with LSTM.

8. Why many-to-one LSTM is used

Input: many words (sequence)

Output: one label (positive or negative)

This matches sentiment analysis perfectly.

9. BoW + LSTM — why it is a bad idea

BoW removes word order.

LSTM needs word order.

Combining both defeats LSTM’s purpose.

Technically possible but not meaningful.

10. Correct approach followed in this project

Clean text

Remove stopwords

Lemmatize words

Convert text to sequences

Pad sequences

Use Embedding + LSTM + Sigmoid output

Predict Positive or Negative sentiment

11. Final conclusion (1-line)

BoW is for classical models, LSTM needs sequences, so tokenized sequences with embeddings are used instead of BoW.

In [None]:
import nltk
nltk.download('punkt')

import re
import numpy as np
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# ---- Next-word language model trained on the text in alice.txt ----------

# Number of preceding words the model sees when predicting the next word.
# Used for the training windows, the Embedding layer and generate_text().
seq_length = 5

with open("alice.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Raw strings for the patterns: '\s' in a normal literal is an invalid escape.
text = text.lower()
text = re.sub(r'[^a-zA-Z ]', ' ', text)
text = re.sub(r'\s+', ' ', text).strip()

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

total_words = len(tokenizer.word_index) + 1

words = tokenizer.texts_to_sequences([text])[0]
words = np.array(words)

# The same id array is passed as data and targets; see the TimeseriesGenerator
# documentation for how each seq_length-wide window is paired with its target.
generator = TimeseriesGenerator(data=words, targets=words, length=seq_length, batch_size=128)

model = Sequential()
model.add(Embedding(total_words, 100, input_length=seq_length))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.fit(generator, epochs=5)


def _sample_next_index(preds, temperature):
    """Draw one word id from the softmax output *preds*.

    The distribution is re-weighted by *temperature* (below 1 sharpens it,
    above 1 flattens it) and id 0 is excluded, because the tokenizer assigns
    no word to it.

    Raises:
        ValueError: if *temperature* is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # float64 keeps the renormalised probabilities summing to 1 accurately.
    scaled = np.log(np.asarray(preds, dtype='float64') + 1e-8) / temperature
    scaled[0] = -np.inf
    # Subtracting the maximum avoids underflow to all-zeros at low temperature.
    probs = np.exp(scaled - np.max(scaled))
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))


def generate_text(seed_text, next_words, temperature=1.0):
    """Extend *seed_text* by *next_words* words sampled from the model.

    Args:
        seed_text: Starting text; words unknown to the tokenizer are ignored
            when building the model input.
        next_words: How many words to append.
        temperature: Sampling temperature; must be positive.

    Returns:
        The seed text followed by the generated words, separated by spaces.

    Raises:
        ValueError: if *temperature* is not positive.
    """
    # Tokenize the seed once and grow the id list, rather than re-tokenizing
    # the whole growing string on every step.
    token_ids = tokenizer.texts_to_sequences([seed_text])[0]
    pieces = [seed_text]

    for _ in range(next_words):
        window = pad_sequences([token_ids], maxlen=seq_length, padding='pre')
        preds = model.predict(window, verbose=0)[0]

        next_index = _sample_next_index(preds, temperature)
        token_ids.append(next_index)
        pieces.append(tokenizer.index_word[next_index])

    return " ".join(pieces)


print(generate_text("alice was beginning", 20, temperature=0.7))
print(generate_text("alice was beginning", 20, temperature=1.0))
print(generate_text("alice was beginning", 20, temperature=1.2))


Epoch 1/5


  text = re.sub('\s+', ' ', text).strip()
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  self._warn_if_super_not_called()


[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - accuracy: 0.0538 - loss: 6.8646
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - accuracy: 0.0588 - loss: 5.9132
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - accuracy: 0.0616 - loss: 5.8189
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 37ms/step - accuracy: 0.0727 - loss: 5.6294
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 55ms/step - accuracy: 0.0928 - loss: 5.4490
alice was beginning and the minute in the had jury but the white hare it of the caterpillar as the queen only the
alice was beginning to be conduct fetch alice could said the from you long way said the house about surprised but i know
alice was beginning as they see to see her at one on run the good asking nor interrupt before all begins it s
