##  1) Vader sentiment analysis:

In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load the dataset
file_path = r"finaldataset.csv"  
df = pd.read_csv(file_path)

# Print column names to verify
print("Columns in dataset:", df.columns)

# Initialize Vader Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment scores
def get_sentiment_vader(text):
    score = analyzer.polarity_scores(str(text))  
    # Convert to string to avoid errors
    if score['compound'] >= 0.05:
        return "Positive"
    elif score['compound'] <= -0.05:
        return "Negative"
    else:
        return "Neutral"

# Check if 'cleaned_tweet' exists before applying sentiment analysis
if "cleaned_tweet" in df.columns:
    df["Sentiment"] = df["cleaned_tweet"].apply(get_sentiment_vader)
else:
    print("Error: 'cleaned_tweet' column not found in dataset.")

# Display first few rows
print(df.head())

# Save the modified dataset
df.to_csv("Vadersentiment_finaldataset.csv", index=False)
##


Columns in dataset: Index(['year', 'cleaned_tweet'], dtype='object')
   year                                      cleaned_tweet Sentiment
0  2015  procreate coloring book 16 blooming chibi natu...  Positive
1  2015  bear bones vehicles would out sell the planed ...  Negative
2  2015  it sounds like youre picking up on the intense...  Positive
3  2015  blends adventure crypto and wildlife conservat...  Positive
4  2015  that alone raises questions about the nature o...  Negative


In [8]:
import pandas as pd
from tabulate import tabulate

# Load the dataset
file_path = r"Vadersentiment_finaldataset.csv"
df = pd.read_csv(file_path)

# Group by 'year' and count total tweets
summary = df.groupby("year")["Sentiment"].value_counts().unstack().fillna(0)

# Rename columns for clarity
summary.columns = ["Negative", "Neutral", "Positive"]

# Calculate total tweets per year
summary["Popular"] = summary.sum(axis=1)

# Calculate percentages correctly
summary["Positive, %"] = (summary["Positive"] / summary["Popular"]) * 100
summary["Negative, %"] = (summary["Negative"] / summary["Popular"]) * 100
summary["Neutral, %"] = (summary["Neutral"] / summary["Popular"]) * 100

# Convert to integer values where needed
summary = summary.astype({"Positive": int, "Negative": int, "Neutral": int, "Popular": int})
summary = summary.round(2)  # Round percentages to 2 decimal places

# Reset index to move 'year' into columns
summary.reset_index(inplace=True)

# Add total row
total_row = pd.DataFrame({
    "year": ["Total"],
    "Popular": [summary["Popular"].sum()],
    "Positive": [summary["Positive"].sum()],
    "Negative": [summary["Negative"].sum()],
    "Neutral": [summary["Neutral"].sum()],
})

# Correct percentage calculations for the total row
total_row["Positive, %"] = (total_row["Positive"] / total_row["Popular"]) * 100
total_row["Negative, %"] = (total_row["Negative"] / total_row["Popular"]) * 100
total_row["Neutral, %"] = (total_row["Neutral"] / total_row["Popular"]) * 100

# Round percentages
total_row = total_row.round(2)

# Ensure columns match before concatenation
summary = pd.concat([summary, total_row], ignore_index=True)

# Ensure correct column ordering (Year should be the second column)
column_order = ["year", "Popular", "Positive", "Positive, %", "Negative", "Negative, %", "Neutral", "Neutral, %"]
summary = summary[column_order]

# Rename "year" to "Year"
summary.rename(columns={"year": "Year"}, inplace=True)

# Format table using tabulate for proper alignment
formatted_table = tabulate(summary, headers="keys", tablefmt="grid")

# Print the formatted table
print(formatted_table)

# Save to CSV
summary.to_csv("sentiment_summary.csv", index=False)
##


+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+
|    | Year   |   Popular |   Positive |   Positive, % |   Negative |   Negative, % |   Neutral |   Neutral, % |
|  0 | 2015   |        93 |         56 |         60.22 |         21 |         22.58 |        16 |        17.2  |
+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+
|  1 | 2016   |       294 |        175 |         59.52 |         81 |         27.55 |        38 |        12.93 |
+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+
|  2 | 2017   |       240 |         92 |         38.33 |         87 |         36.25 |        61 |        25.42 |
+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+
|  3 | 2018   |       390 |        215 |         55.13 |        114 |         29.23 |        61 

## 2)PMI Sentiment_Analysis:

In [18]:
##
import pandas as pd
import math
from collections import defaultdict, Counter
from nrclex import NRCLex

# Load dataset
file_path = r"C:\Users\nivis\OneDrive\Desktop\finaldataset.csv"
df = pd.read_csv(file_path, encoding='utf-8')

# Check required column
if 'cleaned_tweet' not in df.columns:
    raise ValueError("Missing 'cleaned_tweet' column")

# Tokenize
df['tokens'] = df['cleaned_tweet'].astype(str).apply(lambda x: x.lower().split())

# Build vocabulary and frequency
all_tokens = [word for tokens in df['tokens'] for word in tokens]
word_freq = Counter(all_tokens)
total_words = sum(word_freq.values())

# Build co-occurrence dictionary (within each tweet)
co_occurrence = defaultdict(int)
for tokens in df['tokens']:
    unique_tokens = set(tokens)
    for w1 in unique_tokens:
        for w2 in unique_tokens:
            if w1 != w2:
                co_occurrence[(w1, w2)] += 1

# Define domain-specific sentiment words
positive_words = ["good", "great", "excellent", "positive", "happy", "eco-friendly",
                  "renewable", "clean", "green", "sustainable", "alternative", "electric"]

negative_words = ["bad", "terrible", "pollution", "emission", "dirty", "waste",
                  "hazardous", "climate crisis", "global warming", "deforestation", 
                  "fire", "fossil", "smog"]

# PMI Calculation
def calculate_pmi(w1, w2):
    co_count = co_occurrence.get((w1, w2), 0) + 1  # Add-1 smoothing
    p_joint = co_count / total_words
    p_w1 = (word_freq[w1] + 1) / total_words
    p_w2 = (word_freq[w2] + 1) / total_words
    return math.log(p_joint / (p_w1 * p_w2), 2)

# Semantic Orientation (SO) for a word
def semantic_orientation(word):
    if word not in word_freq:
        return 0
    pos_score = sum(calculate_pmi(word, p) for p in positive_words if p in word_freq)
    neg_score = sum(calculate_pmi(word, n) for n in negative_words if n in word_freq)
    return (pos_score - neg_score) / word_freq[word]

# Tweet-level sentiment using SO
def get_sentiment_SO(tokens):
    so_score = sum(semantic_orientation(w) for w in tokens)
    if so_score > 0.1:
        return "Positive"
    elif so_score < -0.1:
        return "Negative"
    else:
        return "Neutral"

df['PMI_Sentiment'] = df['tokens'].apply(get_sentiment_SO)

df.to_csv("finaldataset_with_PMI_sentiment.csv", index=False)

# Preview
print(df[['cleaned_tweet', 'PMI_Sentiment']].head())
df.sample(10)

                                       cleaned_tweet PMI_Sentiment
0  procreate coloring book 16 blooming chibi natu...      Positive
1  bear bones vehicles would out sell the planed ...      Positive
2  it sounds like youre picking up on the intense...      Positive
3  blends adventure crypto and wildlife conservat...      Positive
4  that alone raises questions about the nature o...      Positive


Unnamed: 0,year,cleaned_tweet,tokens,PMI_Sentiment
3311,2024,2 in a couple of years the ummayads literally ...,"[2, in, a, couple, of, years, the, ummayads, l...",Positive
335,2016,enter to win this competition win a personalis...,"[enter, to, win, this, competition, win, a, pe...",Positive
1130,2019,you are done for at least the next 4yrs klaus ...,"[you, are, done, for, at, least, the, next, 4y...",Positive
468,2017,here you go plastic,"[here, you, go, plastic]",Neutral
562,2017,plastic,[plastic],Neutral
1479,2020,climate change is real climate crisis is a hoax,"[climate, change, is, real, climate, crisis, i...",Neutral
1508,2020,new york state amends landmark climate change ...,"[new, york, state, amends, landmark, climate, ...",Positive
3557,2025,they had no intentions to follow through with ...,"[they, had, no, intentions, to, follow, throug...",Positive
1057,2019,emission schedule looks solid excited,"[emission, schedule, looks, solid, excited]",Neutral
2947,2024,hopefully everyone has woke up to the fact tha...,"[hopefully, everyone, has, woke, up, to, the, ...",Positive


In [19]:
import pandas as pd
from tabulate import tabulate

# Load the dataset
file_path = r"finaldataset_with_PMI_sentiment.csv"
df = pd.read_csv(file_path)

# Group by 'year' and count total tweets
summary = df.groupby("year")["PMI_Sentiment"].value_counts().unstack().fillna(0)

# Rename columns for clarity
summary.columns = ["Negative", "Neutral", "Positive"]

# Calculate total tweets per year
summary["Popular"] = summary.sum(axis=1)

# Calculate percentages correctly
summary["Positive, %"] = (summary["Positive"] / summary["Popular"]) * 100
summary["Negative, %"] = (summary["Negative"] / summary["Popular"]) * 100
summary["Neutral, %"] = (summary["Neutral"] / summary["Popular"]) * 100

# Convert to integer values where needed
summary = summary.astype({"Positive": int, "Negative": int, "Neutral": int, "Popular": int})
summary = summary.round(2)  # Round percentages to 2 decimal places

# Reset index to move 'year' into columns
summary.reset_index(inplace=True)

# Add total row
total_row = pd.DataFrame({
    "year": ["Total"],
    "Popular": [summary["Popular"].sum()],
    "Positive": [summary["Positive"].sum()],
    "Negative": [summary["Negative"].sum()],
    "Neutral": [summary["Neutral"].sum()],
})

# Correct percentage calculations for the total row
total_row["Positive, %"] = (total_row["Positive"] / total_row["Popular"]) * 100
total_row["Negative, %"] = (total_row["Negative"] / total_row["Popular"]) * 100
total_row["Neutral, %"] = (total_row["Neutral"] / total_row["Popular"]) * 100

# Round percentages
total_row = total_row.round(2)

# Ensure columns match before concatenation
summary = pd.concat([summary, total_row], ignore_index=True)

# Ensure correct column ordering (Year should be the second column)
column_order = ["year", "Popular", "Positive", "Positive, %", "Negative", "Negative, %", "Neutral", "Neutral, %"]
summary = summary[column_order]

# Rename "year" to "Year"
summary.rename(columns={"year": "Year"}, inplace=True)

# Format table using tabulate for proper alignment
formatted_table = tabulate(summary, headers="keys", tablefmt="grid")

# Print the formatted table
print(formatted_table)

# Save to CSV
summary.to_csv("sentiment_summary.csv", index=False)
##pmi


+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+
|    | Year   |   Popular |   Positive |   Positive, % |   Negative |   Negative, % |   Neutral |   Neutral, % |
|  0 | 2015   |        93 |         91 |         97.85 |          0 |          0    |         2 |         2.15 |
+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+
|  1 | 2016   |       294 |        285 |         96.94 |          1 |          0.34 |         8 |         2.72 |
+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+
|  2 | 2017   |       240 |        222 |         92.5  |          3 |          1.25 |        15 |         6.25 |
+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+
|  3 | 2018   |       390 |        374 |         95.9  |          0 |          0    |        16 

## 3) Emotional Analysis

In [22]:
import pandas as pd
from tabulate import tabulate
from nrclex import NRCLex

# Loading the  dataset
file_path = r"Vadersentiment_finaldataset.csv"  
df = pd.read_csv(file_path)

# Function to get dominant emotion for each tweet
def get_dominant_emotion(text):
    try:
        emotion = NRCLex(str(text))
        scores = emotion.raw_emotion_scores
        
        # Keep only the 8 basic emotions
        core_emotions = ['fear', 'anger', 'anticipation', 'trust', 
                         'surprise', 'sadness', 'disgust', 'joy']
        filtered = {k: v for k, v in scores.items() if k in core_emotions}
        
        if filtered:
            return max(filtered, key=filtered.get)
        else:
            return "None"
    except Exception as e:
        print(f"Error: {e}")
        return "None"

# Apply emotion extraction
df['Emotion'] = df['cleaned_tweet'].apply(get_dominant_emotion)

# Group by 'year' and summarize sentiment
summary = df.groupby("year")["Sentiment"].value_counts().unstack().fillna(0)
summary.columns = ["Negative", "Neutral", "Positive"]
summary["Popular"] = summary.sum(axis=1)

# Calculate sentiment percentages
summary["Positive, %"] = (summary["Positive"] / summary["Popular"]) * 100
summary["Negative, %"] = (summary["Negative"] / summary["Popular"]) * 100
summary["Neutral, %"] = (summary["Neutral"] / summary["Popular"]) * 100

# Convert to int and round %
summary = summary.astype({"Positive": int, "Negative": int, "Neutral": int, "Popular": int})
summary = summary.round(2)
summary.reset_index(inplace=True)

# Add dominant emotion per year
emotion_mode = df.groupby("year")["Emotion"].agg(lambda x: x.mode()[0] if not x.mode().empty else "None")
summary["Emotion"] = summary["year"].map(emotion_mode)

# Add total row
total_row = pd.DataFrame({
    "year": ["Total"],
    "Popular": [summary["Popular"].sum()],
    "Positive": [summary["Positive"].sum()],
    "Negative": [summary["Negative"].sum()],
    "Neutral": [summary["Neutral"].sum()],
})
total_row["Positive, %"] = (total_row["Positive"] / total_row["Popular"]) * 100
total_row["Negative, %"] = (total_row["Negative"] / total_row["Popular"]) * 100
total_row["Neutral, %"] = (total_row["Neutral"] / total_row["Popular"]) * 100
total_row["Emotion"] = [df["Emotion"].mode()[0]]  # Most common emotion overall
total_row = total_row.round(2)

# Combine summary and total row
summary = pd.concat([summary, total_row], ignore_index=True)

# Reorder columns
column_order = ["year", "Popular", "Positive", "Positive, %", "Negative", "Negative, %", "Neutral", "Neutral, %", "Emotion"]
summary = summary[column_order]

# Rename for presentation
summary.rename(columns={"year": "Year"}, inplace=True)

# Print table
formatted_table = tabulate(summary, headers="keys", tablefmt="grid")
print(formatted_table)

# Save as CSV
summary.to_csv("sentiment_emotion_summary.csv", index=False)


+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+--------------+
|    | Year   |   Popular |   Positive |   Positive, % |   Negative |   Negative, % |   Neutral |   Neutral, % | Emotion      |
|  0 | 2015   |        93 |         56 |         60.22 |         21 |         22.58 |        16 |        17.2  | None         |
+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+--------------+
|  1 | 2016   |       294 |        175 |         59.52 |         81 |         27.55 |        38 |        12.93 | trust        |
+----+--------+-----------+------------+---------------+------------+---------------+-----------+--------------+--------------+
|  2 | 2017   |       240 |         92 |         38.33 |         87 |         36.25 |        61 |        25.42 | None         |
+----+--------+-----------+------------+---------------+------------+---------------+-----------+-------

## 4)BERTopic Modeling

In [17]:
import pandas as pd
from bertopic import BERTopic
from nrclex import NRCLex
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')

path = r"Vadersentiment_finaldataset.csv"
df = pd.read_csv(path)
df['cleaned_tweet'] = df['cleaned_tweet'].astype(str)

tweets = df['cleaned_tweet'].tolist()
topic_model = BERTopic(language="english", top_n_words=5)
topics, probs = topic_model.fit_transform(tweets)
df['topic'] = topics

environment_keywords = [
    "biodiversity", "ecology", "air pollution", "emission", "climate change",
    "plastic", "recycling", "global warming", "sustainability", "greenhouse gases", "carbon footprint"
]

def topic_contains_environmental_keyword(topic_words, keywords):
    topic_text = " ".join([word for word, _ in topic_words]).lower()
    return any(kw in topic_text for kw in keywords)

allowed_topic_ids = []
for topic_id in df['topic'].unique():
    topic_words = topic_model.get_topic(topic_id)
    if topic_words and topic_contains_environmental_keyword(topic_words, environment_keywords):
        allowed_topic_ids.append(topic_id)

# Keep only rows with allowed environmental topics
df = df[df['topic'].isin(allowed_topic_ids)]

target_emotions = ["fear", "trust", "anticipation"]

def has_emotion(text, emotion):
    if pd.isna(text) or not isinstance(text, str):
        return False
    try:
        emo = NRCLex(text)
        return emotion in emo.raw_emotion_scores
    except:
        return False

for emotion in target_emotions:
    df[emotion] = df['cleaned_tweet'].apply(lambda x: has_emotion(x, emotion))

def get_topic_labels(topic_ids):
    labels = []
    for topic_id in topic_ids:
        words_weights = topic_model.get_topic(topic_id)
        if words_weights:
            # Pick the single most representative word
            top_word = words_weights[0][0]
            labels.append(top_word)
        else:
            labels.append("No data")
    return labels

emotion_topics = {}

for emotion in target_emotions:
    filtered = df[df[emotion] == True]
    top_topic_ids = filtered['topic'].value_counts().head(5).index.tolist()
    labels = get_topic_labels(top_topic_ids)
    while len(labels) < 5:
        labels.append("")
    emotion_topics[emotion.capitalize()] = labels

top_overall_ids = df['topic'].value_counts().head(5).index.tolist()
overall_labels = get_topic_labels(top_overall_ids)
while len(overall_labels) < 5:
    overall_labels.append("")
emotion_topics['Overall'] = overall_labels

final_df = pd.DataFrame(emotion_topics)
final_df = final_df[["Overall", "Fear", "Trust", "Anticipation"]]

print("\nTopic clusters per social media platform and prevailing emotion (Twitter):")
print(final_df.to_string(index=False))

final_df.to_csv("bertopic_emotion_table.csv", index=False)
print("\nSaved to 'bertopic_emotion_table.csv'")



Topic clusters per social media platform and prevailing emotion (Twitter):
     Overall         Fear        Trust Anticipation
         air          air          air          air
   recycling    recycling    recycling    recycling
     plastic      plastic      science      science
biodiversity      plastic biodiversity biodiversity
     plastic biodiversity      plastic      plastic

Saved to 'bertopic_emotion_table.csv'
