In [None]:
import pandas as pd

# Hard-coded locations of the two tweet exports, one CSV per movie.
file_path_animal = '/content/Animal_movie.csv'
file_path_sambahadur = '/content/Sambahadur_movie.csv'

# Parse each CSV into its own DataFrame (Animal first, then Sambahadur).
animal_data, sambahadur_data = (
    pd.read_csv(csv_path) for csv_path in (file_path_animal, file_path_sambahadur)
)

# Keep a five-row preview of each frame to inspect the column layout.
animal_data_head, sambahadur_data_head = animal_data.head(), sambahadur_data.head()

# Cell output: the tweet stored under index label 3 of the Animal preview.
animal_data_head.loc[3, "Tweets"]

In [None]:
from textblob import TextBlob
import numpy as np

def analyze_sentiment(df):
    """Score every tweet with TextBlob and summarise the sentiment split.

    Side effects:
        Adds two columns to ``df`` in place: ``Polarity`` (the TextBlob
        polarity of each entry of ``df['Tweets']``) and ``Sentiment``
        (``'Positive'`` for polarity > 0, ``'Negative'`` for polarity < 0,
        ``'Neutral'`` otherwise).

    Args:
        df: DataFrame with a ``Tweets`` column holding the tweet texts.

    Returns:
        pandas Series indexed by sentiment label whose values are the share
        of rows carrying that label, expressed as a percentage.
    """
    # Applying TextBlob sentiment analysis
    df['Polarity'] = df['Tweets'].apply(lambda x: TextBlob(x).sentiment.polarity)

    # np.select's implicit fallback is the integer 0. Mixed with string choices
    # that either raises a dtype error (newer NumPy) or yields the label '0'
    # (older NumPy), so the fallback is given explicitly as a string. Rows that
    # are neither positive nor negative are labelled 'Neutral'.
    df['Sentiment'] = np.select(
        [
            df['Polarity'] > 0,
            df['Polarity'] < 0
        ],
        [
            'Positive',
            'Negative'
        ],
        default='Neutral'
    )

    # Summary of sentiments as percentages of all rows
    sentiment_summary = df['Sentiment'].value_counts(normalize=True) * 100
    return sentiment_summary

# Run the sentiment analysis on each movie's tweets (Animal first). Besides
# returning the percentage summary, each call adds the Polarity and Sentiment
# columns to the frame it is given.
sentiment_summary_animal, sentiment_summary_sambahadur = (
    analyze_sentiment(frame) for frame in (animal_data, sambahadur_data)
)

# Cell output: both summaries side by side.
sentiment_summary_animal, sentiment_summary_sambahadur


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import itertools

def extract_keywords(df):
    """Return the ten most frequent uni-/bi-grams of ``df['Tweets']``.

    English stop words are ignored and the vocabulary is capped at ten terms
    by the vectorizer. The result is a list of ``(term, total_count)`` tuples
    ordered from most to least frequent.
    """
    bag_of_words = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10)
    term_matrix = bag_of_words.fit_transform(df['Tweets'])

    # Column-wise totals: one count per vocabulary term across all tweets.
    totals = term_matrix.sum(axis=0)
    ranked = [
        (term, totals[0, column])
        for column, term in enumerate(bag_of_words.get_feature_names_out())
    ]

    # Most frequent term first; ties keep vocabulary order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# Top keywords per movie, Animal first.
keywords_animal, keywords_sambahadur = (
    extract_keywords(frame) for frame in (animal_data, sambahadur_data)
)

# Cell output: both ranked keyword lists.
keywords_animal, keywords_sambahadur


In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

def perform_topic_modeling(df, n_topics=5, n_top_words=10, random_state=0):
    """Fit an LDA topic model on the tweets and list each topic's top words.

    Args:
        df: DataFrame with a ``Tweets`` column holding the tweet texts.
        n_topics: Number of LDA topics to fit (default 5).
        n_top_words: Number of highest-weighted words reported per topic
            (default 10).
        random_state: Seed passed to LatentDirichletAllocation (default 0).

    Returns:
        List of ``("Topic <k>", [word, ...])`` tuples, one per topic, with the
        words ordered from highest to lowest topic weight.
    """
    # Terms appearing in more than 95% of tweets or in fewer than 2 are dropped.
    # NOTE(review): LDA is fitted on TF-IDF weights here; scikit-learn's own LDA
    # example uses raw term counts -- confirm this choice is intentional.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(df['Tweets'])

    # Using LDA for topic modeling
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda.fit(tfidf)

    # Extracting the topics
    feature_names = tfidf_vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        topic_keywords = [feature_names[i] for i in top_indices]
        topics.append((f"Topic {topic_idx+1}", topic_keywords))

    return topics

# Fit one topic model per movie, Animal first.
topics_animal, topics_sambahadur = (
    perform_topic_modeling(frame) for frame in (animal_data, sambahadur_data)
)

# Cell output: the topic/keyword lists of both movies.
topics_animal, topics_sambahadur


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def summarize_text(df, n_components=1, random_state=0):
    """Summarise the tweets as the top terms of each LSA component.

    Args:
        df: DataFrame with a ``Tweets`` column holding the tweet texts.
        n_components: Number of SVD components to extract (default 1).
        random_state: Seed for TruncatedSVD so repeated runs give the same
            components (default 0).

    Returns:
        A single space-separated string containing, for every component in
        order, its ten highest-weighted terms.
    """
    # TF-IDF over at most 100 terms, English stop words removed.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
    X = vectorizer.fit_transform(df['Tweets'])

    # Using TruncatedSVD for LSA. Only the fitted components are needed, so the
    # reduced document matrix is not kept.
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(X)

    # Extracting the components
    terms = vectorizer.get_feature_names_out()
    summary = []
    for comp in svd.components_:
        top_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:10]
        summary.append(" ".join(term for term, _ in top_terms))

    return ' '.join(summary)

# One LSA keyword summary per movie, Animal first.
summary_animal, summary_sambahadur = (
    summarize_text(frame) for frame in (animal_data, sambahadur_data)
)

# Cell output: both summary strings.
summary_animal, summary_sambahadur


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Stack the tweets of both movies (Animal first) into one Series and keep a
# plain-list copy of the same texts.
combined_tweets = pd.concat([animal_data['Tweets'], sambahadur_data['Tweets']])
texts = combined_tweets.tolist()

# Placeholder targets for example purposes only: 1 for the first half of the
# combined list, 0 for the rest. In a real application, these should be actual
# sentiment labels.
labels = [int(i < len(texts) / 2) for i in range(len(texts))]

# Tokenise the tweets, keeping only the 5000 most frequent words.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(combined_tweets)

# Encode each tweet as word indices, then pad to one uniform length.
max_sequence_length = 200
sequences = tokenizer.texts_to_sequences(combined_tweets)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Hold out 25% of the rows for testing (fixed seed for a repeatable split).
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    padded_sequences,
    labels,
    test_size=0.25,
    random_state=42,
)

# Cell output: sizes of the resulting splits.
train_sequences.shape, test_sequences.shape, len(train_labels), len(test_labels)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Inputs: train_sequences / test_sequences / train_labels / test_labels are the
# splits produced by the preceding data-preparation cell. The tweets are not
# tokenised and padded again here: the model never consumed that second copy,
# it only trains on the existing splits.

# The labels were built as a Python list; convert the splits to arrays for Keras.
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

# LSTM Model: 5000-word vocabulary, 200-token inputs, one sigmoid output unit
# for the binary target.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; the held-out split doubles as validation data.
history = model.fit(train_sequences, train_labels, batch_size=32, epochs=10, validation_data=(test_sequences, test_labels))

# Evaluate the model
# model.evaluate(test_sequences, test_labels)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Inputs: sentiment_summary_animal and sentiment_summary_sambahadur hold the
# percentage of tweets per sentiment label.

# Reshape each summary into a two-column frame that seaborn can plot.
df_animal_sentiments = sentiment_summary_animal.reset_index().set_axis(
    ['Sentiment', 'Percentage'], axis=1)
df_sambahadur_sentiments = sentiment_summary_sambahadur.reset_index().set_axis(
    ['Sentiment', 'Percentage'], axis=1)

# One bar chart per movie, side by side.
fig, (ax_animal, ax_sambahadur) = plt.subplots(1, 2, figsize=(12, 6))

sns.barplot(data=df_animal_sentiments, x='Sentiment', y='Percentage', ax=ax_animal)
ax_animal.set_title('Sentiment Distribution for Animal Movie')

sns.barplot(data=df_sambahadur_sentiments, x='Sentiment', y='Percentage', ax=ax_sambahadur)
ax_sambahadur.set_title('Sentiment Distribution for Sambahadur Movie')

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Input: `history`, the object returned by model.fit() in the training cell.

fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training vs. validation accuracy per epoch.
ax_accuracy.plot(history.history['accuracy'])
ax_accuracy.plot(history.history['val_accuracy'])
ax_accuracy.set(title='Model Accuracy', ylabel='Accuracy', xlabel='Epoch')
ax_accuracy.legend(['Train', 'Test'], loc='upper left')

# Right panel: training vs. validation loss per epoch.
ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set(title='Model Loss', ylabel='Loss', xlabel='Epoch')
ax_loss.legend(['Train', 'Test'], loc='upper left')

plt.show()


In [None]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.utils import to_categorical
# import numpy as np

# # Sample Data Preparation
# texts = pd.concat([animal_data['Tweets'], sambahadur_data['Tweets']]).tolist()
# labels = ['Positive', 'Neutral', 'Negative']

# # Assuming you have your text data in 'texts' and labels in 'labels'
# # Tokenize the texts
# tokenizer = Tokenizer(num_words=5000)
# tokenizer.fit_on_texts(texts)
# sequences = tokenizer.texts_to_sequences(texts)

# # Padding sequences to ensure uniform length
# max_sequence_length = 200
# padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# # Convert labels to categorical (one-hot encoding)
# label_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
# labels_numerical = [label_mapping[label] for label in labels]
# labels_categorical = to_categorical(labels_numerical, num_classes=3)

# # Split the data into training and testing sets
# train_sequences, test_sequences, train_labels, test_labels = train_test_split(
#     padded_sequences, labels_categorical, test_size=0.25, random_state=42)

# # LSTM Model
# model = Sequential()
# model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length))
# model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(3, activation='softmax')) # 3 neurons for 3 classes

# # Compile the model
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# # Train the model
# model.fit(train_sequences, train_labels, batch_size=32, epochs=10, validation_data=(test_sequences, test_labels))

# # Evaluate the model
# # model.evaluate(test_sequences, test_labels)


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# All tweets of both movies, Animal first.
texts = pd.concat([animal_data['Tweets'], sambahadur_data['Tweets']]).tolist()

# NOTE(review): these labels are a repeating Positive/Neutral/Negative pattern
# padded with 'Positive'; they are not derived from the tweets themselves.
labels = ['Positive', 'Neutral', 'Negative'] * (len(texts) // 3)
labels.extend(['Positive'] * (len(texts) - len(labels)))

# Ensure the number of texts matches the number of labels
assert len(texts) == len(labels), "The number of texts and labels must be the same."

# Build a 5000-word vocabulary and encode every tweet as word indices.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad the encoded tweets to one uniform length.
max_sequence_length = 200
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Map label names to class ids, then one-hot encode them.
label_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
labels_numerical = [label_mapping[name] for name in labels]
labels_categorical = to_categorical(labels_numerical, num_classes=3)

# 75/25 split, stratified on the one-hot labels, with a fixed seed.
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    padded_sequences,
    labels_categorical,
    test_size=0.25,
    random_state=42,
    stratify=labels_categorical,
)

# Embedding -> LSTM -> softmax over the 3 classes.
model = Sequential([
    Embedding(input_dim=5000, output_dim=512, input_length=max_sequence_length),
    LSTM(512, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; the held-out split doubles as validation data.
model.fit(
    train_sequences,
    train_labels,
    batch_size=32,
    epochs=10,
    validation_data=(test_sequences, test_labels),
)

# Evaluate the model
# model.evaluate(test_sequences, test_labels)


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# All tweets of both movies, Animal first.
texts = pd.concat([animal_data['Tweets'], sambahadur_data['Tweets']]).tolist()

# Use the per-tweet sentiment classes that analyze_sentiment() stored in each
# frame's 'Sentiment' column, concatenated in the same order as `texts`. A
# repeating Positive/Neutral/Negative placeholder has no relation to the tweet
# content, which would make the accuracy reported below meaningless.
# Requires the sentiment-analysis cell to have been run first.
labels = pd.concat([animal_data['Sentiment'], sambahadur_data['Sentiment']]).tolist()

# Ensure the number of texts matches the number of labels
assert len(texts) == len(labels), "The number of texts and labels must be the same."

# Tokenization and Padding: 5000-word vocabulary, sequences of length 200.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=200)

# Convert labels to one-hot encoding
label_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
labels_numerical = [label_mapping[label] for label in labels]
labels_categorical = to_categorical(labels_numerical, num_classes=3)

# Split the data: 25% held out for testing, fixed seed.
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    padded_sequences, labels_categorical, test_size=0.25, random_state=42)

# Building the LSTM Model: Embedding -> LSTM -> Dense/ReLU -> Dropout -> softmax.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; the held-out split doubles as validation data.
history = model.fit(train_sequences, train_labels, batch_size=32, epochs=10, validation_data=(test_sequences, test_labels))

# Evaluate the model: evaluate() returns [loss, accuracy] for the compiled metrics.
accuracy = model.evaluate(test_sequences, test_labels)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# NOTE(review): this cell uses the same tokenizer settings, model architecture
# and training hyper-parameters as the preceding training cell; consider
# keeping only one of them or factoring the pipeline into a function.

# All tweets of both movies, Animal first.
texts = pd.concat([animal_data['Tweets'], sambahadur_data['Tweets']]).tolist()
# NOTE(review): the labels below are a repeating Positive/Neutral/Negative
# pattern padded with 'Positive'; they are not derived from the tweets, so the
# accuracy printed at the end does not measure sentiment prediction.
labels = ['Positive', 'Neutral', 'Negative'] * (len(texts) // 3)

# Top up with 'Positive' when len(texts) is not a multiple of 3.
if len(labels) < len(texts):
    labels += ['Positive'] * (len(texts) - len(labels))
# Ensure the number of texts matches the number of labels
assert len(texts) == len(labels), "The number of texts and labels must be the same."

# Tokenization and Padding: 5000-word vocabulary, sequences of length 200.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=200)

# Convert labels to one-hot encoding
label_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
labels_numerical = [label_mapping[label] for label in labels]
labels_categorical = to_categorical(labels_numerical, num_classes=3)

# Split the data: 25% held out for testing, fixed seed.
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    padded_sequences, labels_categorical, test_size=0.25, random_state=42)

# Building the LSTM Model: Embedding -> LSTM -> Dense/ReLU -> Dropout -> softmax.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; the held-out split doubles as validation data.
history = model.fit(train_sequences, train_labels, batch_size=32, epochs=10, validation_data=(test_sequences, test_labels))

# Evaluate the model: index 1 of evaluate()'s result is the accuracy metric.
accuracy = model.evaluate(test_sequences, test_labels)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Tweets of the Animal movie only.
texts = animal_data['Tweets'].tolist()

# Use the per-tweet sentiment classes that analyze_sentiment() stored in the
# frame's 'Sentiment' column, in the same row order as `texts`. A repeating
# Positive/Neutral/Negative placeholder has no relation to the tweet content,
# which would make the accuracy reported below meaningless.
# Requires the sentiment-analysis cell to have been run first.
labels = animal_data['Sentiment'].tolist()

# Ensure the number of texts matches the number of labels
assert len(texts) == len(labels), "The number of texts and labels must be the same."

# Tokenization and Padding: 5000-word vocabulary, sequences of length 200.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=200)

# Convert labels to one-hot encoding
label_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
labels_numerical = [label_mapping[label] for label in labels]
labels_categorical = to_categorical(labels_numerical, num_classes=3)

# Split the data: 25% held out for testing, fixed seed.
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    padded_sequences, labels_categorical, test_size=0.25, random_state=42)

# Building the LSTM Model: Embedding -> LSTM -> Dense/ReLU -> softmax.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
# Dropout after the dense layer is switched off in this variant.
#model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; the held-out split doubles as validation data.
history = model.fit(train_sequences, train_labels, batch_size=32, epochs=10, validation_data=(test_sequences, test_labels))

# Evaluate the model: evaluate() returns [loss, accuracy] for the compiled metrics.
accuracy = model.evaluate(test_sequences, test_labels)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
from tensorflow.keras.layers import Bidirectional

# Relies on train_sequences / test_sequences / train_labels / test_labels (and
# the Keras imports) left in the kernel by an earlier training cell.

# Same stack as the plain LSTM model, with the recurrent layer wrapped so it
# reads each sequence in both directions.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 15 epochs; the held-out split doubles as validation data.
history = model.fit(
    train_sequences,
    train_labels,
    batch_size=32,
    epochs=15,
    validation_data=(test_sequences, test_labels),
)


In [None]:
import matplotlib.pyplot as plt

# Input: `history`, the return value of the most recent model.fit() call.

fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: accuracy per epoch for the training and validation data.
ax_accuracy.plot(history.history['accuracy'], label='Train Accuracy')
ax_accuracy.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax_accuracy.set(title='Model Accuracy', ylabel='Accuracy', xlabel='Epoch')
ax_accuracy.legend()

# Right panel: loss per epoch for the training and validation data.
ax_loss.plot(history.history['loss'], label='Train Loss')
ax_loss.plot(history.history['val_loss'], label='Validation Loss')
ax_loss.set(title='Model Loss', ylabel='Loss', xlabel='Epoch')
ax_loss.legend()

plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Collect the tweets of both movies (Animal first) and merge them into one
# space-separated string for the word cloud.
tweets = [*animal_data['Tweets'].tolist(), *sambahadur_data['Tweets'].tolist()]
combined_text = ' '.join(tweets)

# Build the word cloud (at most 100 words, white background) from that text.
wordcloud = WordCloud(
    background_color='white',
    max_words=100,
    contour_width=3,
    contour_color='steelblue',
).generate(combined_text)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
