<a href="https://colab.research.google.com/github/piyushjaangid/ImageEditWebApp/blob/main/Travel_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import libraries for NLP and text preprocessing
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Import machine learning models
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans

# Import deep learning models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Import BERT
from transformers import BertTokenizer, TFBertForSequenceClassification

# -----------------------------------
# Load and explore the dataset
# -----------------------------------

# Replace this with your dataset path or link
# Example: Social media posts, traveler feedback, and sentiments labeled as Positive, Neutral, or Negative
data = pd.read_csv('traveler_sentiment_data.csv')

# Fail early, with a readable message, when a column used later in this
# script is absent ('text' is preprocessed, 'sentiment' is label-encoded).
required_columns = ['text', 'sentiment']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

# Rows without a text or a label cannot be used for supervised training.
# Drop them and rebuild the index so row labels remain 0..n-1.
rows_before = len(data)
data = data.dropna(subset=required_columns).reset_index(drop=True)
rows_dropped = rows_before - len(data)
if rows_dropped:
    print(f"Dropped {rows_dropped} row(s) with missing text or sentiment")

# Print the first few rows of the dataset to understand its structure
print(data.head())

# -----------------------------------
# Preprocessing the text data
# -----------------------------------

# Download stopwords from NLTK and keep them in a set for O(1) membership tests
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Define a function to clean and preprocess text data
def preprocess_text(text, stopword_set=None):
    """Lowercase a text, drop stopwords, and keep only alphabetic characters.

    Each whitespace-separated token is checked against the stopword set twice:
    once as written (so contractions such as "don't" still match) and once
    after non-letters are stripped (so "but," or "the." are removed too).
    Tokens left empty after stripping, e.g. pure numbers, are discarded, and
    the surviving tokens are joined with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN are treated as missing and
            yield an empty string; any other non-string value is converted
            with ``str()``.
        stopword_set: Optional collection of lowercase stopwords. Defaults to
            the module-level ``stop_words``.

    Returns:
        The cleaned text as a single-space-separated string.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    kept_tokens = []
    for token in text.lower().split():
        if token in stopword_set:
            continue
        letters_only = ''.join(char for char in token if char.isalpha())
        if letters_only and letters_only not in stopword_set:
            kept_tokens.append(letters_only)

    return ' '.join(kept_tokens)

# Clean every raw post and store the result alongside the original text
raw_posts = data['text']
data['cleaned_text'] = raw_posts.apply(preprocess_text)

# Convert the sentiment labels to integer codes. LabelEncoder numbers the
# classes in sorted order, so labels spelled exactly 'Negative', 'Neutral'
# and 'Positive' become 0, 1 and 2 respectively.
label_encoder = LabelEncoder()
label_encoder.fit(data['sentiment'])
data['sentiment_encoded'] = label_encoder.transform(data['sentiment'])

# -----------------------------------
# Text Vectorization: Bag of Words or TF-IDF
# -----------------------------------

# Define target variable
y = data['sentiment_encoded']

# Split the cleaned text into training and testing sets (80% train, 20% test)
# BEFORE fitting the vectorizer, so the vocabulary is learned from training
# rows only and nothing from the test rows leaks into the features.
# The shuffle depends on the row count, test_size and random_state, not on
# the feature values, so this split selects the same rows as splitting the
# vectorized matrix would.
text_train, text_test, y_train, y_test = train_test_split(
    data['cleaned_text'], y, test_size=0.2, random_state=42
)

# Option 1: Bag of Words (BoW) using CountVectorizer, fitted on training text
vectorizer = CountVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

# Whole-corpus BoW matrix, kept for any code that still refers to X_bow
X_bow = vectorizer.transform(data['cleaned_text']).toarray()

# Option 2: TF-IDF Vectorizer (for text importance weighting), fitted on the
# whole corpus; it is not used by the supervised split above.
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(data['cleaned_text']).toarray()

# -----------------------------------
# Model 1: Naive Bayes for Sentiment Classification
# -----------------------------------

# Create the multinomial Naive Bayes classifier and fit it on the training split
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Label the held-out rows
y_pred_nb = nb.predict(X_test)

# Report overall accuracy, then the per-class precision/recall/F1 table
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", nb_accuracy)
nb_report = classification_report(y_test, y_pred_nb)
print("Classification Report:\n", nb_report)

# Draw the confusion matrix as an annotated heatmap
conf_matrix_nb = confusion_matrix(y_test, y_pred_nb)
heatmap_axes = sns.heatmap(conf_matrix_nb, annot=True, fmt='d', cmap='Blues')
heatmap_axes.set_title('Naive Bayes Confusion Matrix')
heatmap_axes.set_xlabel('Predicted')
heatmap_axes.set_ylabel('True')
plt.show()

# -----------------------------------
# Model 2: LSTM for Sentiment Classification
# -----------------------------------

# Sizes shared by the tokenizer, the padding step and the embedding layer
VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 200  # Adjust as needed

# Map each cleaned text to a sequence of integer word indices
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(data['cleaned_text'])
token_sequences = tokenizer.texts_to_sequences(data['cleaned_text'])

# Pad (or truncate) every sequence to the same length
X_lstm = pad_sequences(token_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Hold out 20% of the padded sequences for testing
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_lstm, y, test_size=0.2, random_state=42
)

# Embedding -> bidirectional LSTM -> 3-way softmax
# (3 output classes: Positive, Neutral, Negative)
lstm_model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128, input_length=MAX_SEQUENCE_LENGTH),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dense(3, activation='softmax'),
])

# Integer labels, hence the sparse variant of categorical cross-entropy
lstm_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training sequences
lstm_model.fit(X_train_lstm, y_train_lstm, epochs=5, batch_size=64, verbose=1)

# Pick the most probable class for each test sequence
lstm_probabilities = lstm_model.predict(X_test_lstm)
y_pred_lstm = np.argmax(lstm_probabilities, axis=-1)

# Print LSTM accuracy and classification report
lstm_accuracy = accuracy_score(y_test_lstm, y_pred_lstm)
print("LSTM Accuracy:", lstm_accuracy)
lstm_report = classification_report(y_test_lstm, y_pred_lstm)
print("Classification Report:\n", lstm_report)

# -----------------------------------
# Model 3: BERT for Sentiment Classification
# -----------------------------------

# One checkpoint name shared by the tokenizer and the classifier so the two
# cannot drift apart
BERT_CHECKPOINT = 'bert-base-uncased'

# Load the pre-trained tokenizer and a 3-label sequence-classification model
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=3)

# Encode the text data for BERT input
def encode_for_bert(texts):
    """Tokenize a collection of texts with the module-level ``bert_tokenizer``.

    Args:
        texts: An object exposing ``.tolist()`` (for example a pandas Series)
            that yields the strings to encode.

    Returns:
        The tokenizer's output for those strings, padded or truncated to 200
        tokens, with attention masks, as TensorFlow tensors.
    """
    tokenizer_options = {
        'add_special_tokens': True,
        'max_length': 200,
        'padding': 'max_length',
        'return_attention_mask': True,
        'truncation': True,
        'return_tensors': 'tf',
    }
    sentences = texts.tolist()
    return bert_tokenizer(sentences, **tokenizer_options)

# Prepare the training and test sets for BERT.
# X_train / X_test are NumPy arrays and have no `.index`; the label Series
# produced by the same split carry the row labels of each partition, so the
# matching texts are looked up through them (label-based, hence `.loc`).
train_encoded = encode_for_bert(data['cleaned_text'].loc[y_train.index])
test_encoded = encode_for_bert(data['cleaned_text'].loc[y_test.index])

train_inputs = {
    'input_ids': train_encoded['input_ids'],
    'attention_mask': train_encoded['attention_mask'],
}
test_inputs = {
    'input_ids': test_encoded['input_ids'],
    'attention_mask': test_encoded['attention_mask'],
}

# Plain arrays avoid any dependence on the pandas index during batching
train_labels = y_train.to_numpy()
test_labels = y_test.to_numpy()

# Train the BERT model.
# - The classification head returns raw logits, so the loss must be told
#   `from_logits=True`; the string shortcut would treat logits as probabilities.
# - Fine-tuning a pre-trained transformer needs a much smaller step size than
#   Adam's default of 1e-3.
bert_model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
# NOTE(review): the test split doubles as validation data here, so the
# reported test accuracy is not from data unseen during training-time monitoring.
bert_model.fit(
    x=train_inputs,
    y=train_labels,
    epochs=3,
    batch_size=32,
    validation_data=(test_inputs, test_labels),
)

# Evaluate the BERT model; evaluate() returns [loss, accuracy] for the
# single metric compiled above
bert_eval = bert_model.evaluate(test_inputs, test_labels)
print("BERT Accuracy:", bert_eval[1])

# -----------------------------------
# K-Means Clustering for Unsupervised Learning
# -----------------------------------

# Perform K-Means clustering on the TF-IDF representation of text.
# Three clusters are requested to mirror the three sentiment classes, but the
# clusters are unsupervised and are not guaranteed to line up with them.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps results comparable across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit and obtain the cluster label of every document in one step
clusters = kmeans.fit_predict(X_tfidf)

# Visualize the distribution of clusters.
# The labels must be passed as `x=`: the first positional parameter of
# countplot is `data` in current seaborn, which would not count per cluster.
sns.countplot(x=clusters)
plt.title("Cluster Distribution (K-Means)")
plt.xlabel("Cluster")
plt.ylabel("Count")
plt.show()

