In [None]:
import os

# Only prompt for an upload when the dataset is not already on disk.
# This keeps "Restart & Run All" from blocking on the file picker and lets the
# notebook run outside Colab once the CSV has been placed next to it.
uploaded = {}
if not os.path.exists('bangladesh_coup_sentiment_analysis.csv'):
    from google.colab import files  # Colab-only dependency, imported lazily
    uploaded = files.upload()


Saving bangladesh_coup_sentiment_analysis.csv to bangladesh_coup_sentiment_analysis.csv


In [None]:
import pandas as pd

# Read the sentiment CSV from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer='bangladesh_coup_sentiment_analysis.csv')


In [None]:
# Preview the first five rows of the raw dataset
data.head(n=5)


Unnamed: 0,Text,Label
0,"The military crackdown has intensified, with r...",1
1,International observers commend the public for...,2
2,Reports indicate widespread demonstrations acr...,0
3,Human rights organizations have condemned the ...,1
4,The interim government promises reforms to add...,2


In [None]:
import pandas as pd
import re
import nltk

# Load dataset
data = pd.read_csv('bangladesh_coup_sentiment_analysis.csv')

# 1. Drop rows with a missing 'Text' value BEFORE any string processing.
# The steps below call string functions on every entry; a NaN (a float) would
# make re.sub raise a TypeError, so nulls have to be removed first.
data = data.dropna(subset=['Text']).copy()
data['Text'] = data['Text'].astype(str)  # guard against non-string cells

# 2. Remove unwanted characters and symbols
# Using regular expressions to remove any character that is not a letter, number, or whitespace.
data['Text'] = data['Text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# 3. Convert text to lowercase
# Standardizing text to lowercase to maintain consistency in analysis.
data['Text'] = data['Text'].str.lower()

# 4. Remove stop words (optional)
# Stop words are common words that don't contribute much to the sentiment, so we filter them out.
nltk.download('stopwords')  # Download stop words once
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Remove stop words from each text entry
data['Text'] = data['Text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# 5. Remove rows that became empty after cleaning
# (entries that consisted only of symbols and/or stop words).
data = data[data['Text'].str.strip() != '']

# Print the first few rows to verify cleaning steps
print(data.head())


                                                Text  Label
0  military crackdown intensified reports severe ...      1
1  international observers commend public peacefu...      2
2  reports indicate widespread demonstrations acr...      0
3  human rights organizations condemned use exces...      1
4  interim government promises reforms address pu...      2


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.model_selection import train_test_split

# Define the split sizes
train_size = 0.7  # 70% of data for training
val_size = 0.15   # 15% of data for validation
test_size = 0.15  # 15% of data for testing
assert abs(train_size + val_size + test_size - 1.0) < 1e-9, "split fractions must sum to 1"

# First, split the data into training and temp (validation + test) sets
holdout_size = val_size + test_size
train_data, temp_data = train_test_split(data, test_size=holdout_size, random_state=42, stratify=data['Label'])

# Next, split the temp set into validation and test sets.
# The test share is derived from the configured fractions instead of a
# hard-coded 0.5, so changing val_size/test_size above actually takes effect.
test_share = test_size / holdout_size
val_data, test_data = train_test_split(temp_data, test_size=test_share, random_state=42, stratify=temp_data['Label'])

# Every row must land in exactly one of the three sets
assert len(train_data) + len(val_data) + len(test_data) == len(data)

# Display the sizes of each set to verify the split
print("Training set size:", train_data.shape)
print("Validation set size:", val_data.shape)
print("Test set size:", test_data.shape)

# Leakage check: identical texts shared between the training set and the
# held-out sets make validation/test accuracy look better than it really is.
train_texts = set(train_data['Text'])
print("Validation texts also in training:", int(val_data['Text'].isin(train_texts).sum()))
print("Test texts also in training:", int(test_data['Text'].isin(train_texts).sum()))


Training set size: (700, 2)
Validation set size: (150, 2)
Test set size: (150, 2)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM

# Parameters
vocab_size = 5000  # Set vocabulary size
embedding_dim = 16
max_length = 100  # Maximum length of each text sequence
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Tokenize the text data.
# The vocabulary is fitted on the training split only, so validation/test
# words unseen in training map to the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_data['Text'])

# Convert text to sequences and pad them
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['Text']), maxlen=max_length, padding=padding_type, truncating=trunc_type)
X_val = pad_sequences(tokenizer.texts_to_sequences(val_data['Text']), maxlen=max_length, padding=padding_type, truncating=trunc_type)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['Text']), maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert labels to categorical (one-hot) format
y_train = tf.keras.utils.to_categorical(train_data['Label'], num_classes=3)
y_val = tf.keras.utils.to_categorical(val_data['Label'], num_classes=3)
y_test = tf.keras.utils.to_categorical(test_data['Label'], num_classes=3)

# Define a more complex model with LSTM.
# The deprecated `input_length` argument is not passed to Embedding; the
# sequence length is supplied through model.build() below instead.
model = Sequential([
    Embedding(vocab_size, embedding_dim),                           # Embedding layer
    LSTM(64, return_sequences=True),                                # LSTM layer with 64 units
    GlobalAveragePooling1D(),                                       # Average LSTM outputs over time steps
    Dense(24, activation='relu'),
    Dense(3, activation='softmax')                                  # Output layer with 3 classes
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Force the model to build by specifying an input shape for building
model.build(input_shape=(None, max_length))  # (batch_size, sequence_length)

# Display the model summary
model.summary()


In [None]:
# Set training parameters
epochs = 100  # Upper bound only; early stopping below normally ends training sooner
batch_size = 32

# Stop once validation loss has not improved for `patience` epochs and roll the
# model back to the best epoch, instead of always running all 100 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train the model
history = model.fit(
    X_train, y_train,               # Training data
    epochs=epochs,                  # Maximum number of epochs
    batch_size=batch_size,          # Batch size
    validation_data=(X_val, y_val), # Validation data
    callbacks=[early_stopping],     # Halt when validation loss plateaus
    verbose=1                       # Print progress
)

# Evaluate the model on the held-out test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {test_accuracy:.2f}")


Epoch 1/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - accuracy: 0.6989 - loss: 0.4590 - val_accuracy: 0.7733 - val_loss: 0.4337
Epoch 2/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.7711 - loss: 0.4271 - val_accuracy: 1.0000 - val_loss: 0.3850
Epoch 3/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.9793 - loss: 0.2352 - val_accuracy: 1.0000 - val_loss: 0.0273
Epoch 4/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 1.0000 - loss: 0.0194 - val_accuracy: 1.0000 - val_loss: 0.0044
Epoch 5/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - accuracy: 1.0000 - loss: 0.0037 - val_accuracy: 1.0000 - val_loss: 0.0020
Epoch 6/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 1.0000 - loss: 0.0017 - val_accuracy: 1.0000 - val_loss: 0.0012
Epoch 7/100
[1m22/22[0m [

In [None]:
# Persist the trained network (architecture + weights) as an HDF5 file
model.save(filepath='sentiment_model.h5', overwrite=True)




In [None]:
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the saved model
model = tf.keras.models.load_model('sentiment_model.h5')

# Define tokenizer settings (must match the settings used during training)
vocab_size = 5000
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# Rebuild the training vocabulary.
# A tokenizer fitted on an empty list knows no words, so every input token maps
# to the OOV index and the model sees the same sequence whatever the user types.
# Fitting on the training texts reproduces the word -> index mapping the model
# was trained with. Requires the preprocessing and split cells to have been run
# (uses `train_data` and `stop_words` defined there).
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_data['Text'])


def _clean_input_text(text):
    """Apply the same cleaning used on the training data to one input string.

    Strips every character that is not a letter, digit or whitespace,
    lowercases the result and removes English stop words.
    """
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)


# Function to predict sentiment from user input
def predict_sentiment(text=None):
    """Classify one sentence and print its sentiment label.

    Args:
        text: Sentence to classify. When None (the default), the user is
            prompted for a sentence.

    Prints "Sentiment: <label>", where the label is Neutral, Negative or
    Positive. Returns None.
    """
    if text is None:
        # Prompt user for input
        text = input("Enter a sentence to analyze its sentiment: ")

    # Preprocess the input text exactly like the training data
    cleaned = _clean_input_text(text)
    if not cleaned:
        # Nothing left after cleaning: the model would only see padding.
        print("Sentiment: unavailable (no analyzable words in the input)")
        return

    sequences = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    # Get prediction: index of the highest softmax probability
    prediction = model.predict(padded)
    sentiment = int(np.argmax(prediction, axis=1)[0])

    # Map prediction to sentiment label
    sentiment_label = {0: "Neutral", 1: "Negative", 2: "Positive"}
    print(f"Sentiment: {sentiment_label[sentiment]}")

# Call the function to predict sentiment based on user input
predict_sentiment()

#sample text
#The situation is getting worse, and people are feeling hopeless p
#I'm disappointed with how the authorities are handling this n




Enter a sentence to analyze its sentiment: People are gathering in large numbers to observe the current events
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308ms/step
Sentiment: Negative
