In [18]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from wordcloud import WordCloud
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
import nltk

# Fetch the NLTK data packages this script relies on: the VADER lexicon
# (sentiment scoring) and the stop-word corpus (word cloud / top words).
for resource in ("vader_lexicon", "stopwords"):
    nltk.download(resource)

# --- CONFIG ---
# DATA_URL is the CSV to analyse; OUTPUT_DIR receives every CSV and PNG
# written by this script and is created up front if it does not exist.
DATA_URL = "https://raw.githubusercontent.com/luminati-io/Social-media-dataset-samples/main/Facebook-datasets.csv"
OUTPUT_DIR = "sentiment_output"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

# --- LOAD DATA ---
print("Loading dataset...")
# low_memory=False asks pandas to infer each column's dtype from the whole
# file rather than chunk by chunk.
df = pd.read_csv(filepath_or_buffer=DATA_URL, low_memory=False)

# --- COLUMN DETECTION ---
# Keywords are matched as substrings of the column name, so identifier,
# counter and timestamp columns (names like "post_id" or "num_comments")
# match as well.  Taking the first hit in file order could feed such a column
# to the sentiment scorer, so the hits are ranked by how wordy their values
# are instead.
_TEXT_KEYWORDS = ["comment", "text", "post", "message", "description", "content", "caption", "body"]
_DATE_KEYWORDS = ["date", "time", "created"]


def _mean_word_count(column):
    """Return the mean number of whitespace-separated words in df[column].

    Missing values are ignored; a column with no values at all scores 0.0.
    IDs, counts, URLs and timestamps score about one word per value, while
    free text scores higher.
    """
    values = df[column].dropna()
    if values.empty:
        return 0.0
    return float(values.astype(str).str.split().str.len().mean())


text_candidates = [c for c in df.columns if any(k in c.lower() for k in _TEXT_KEYWORDS)]
if not text_candidates:
    raise ValueError("No text-like column found.")
# max() returns the earliest candidate on ties, i.e. the first keyword hit in
# file order when no candidate is wordier than the others.
text_col = max(text_candidates, key=_mean_word_count)

# The date column is optional: None means no time-based output downstream.
date_candidates = [c for c in df.columns if any(k in c.lower() for k in _DATE_KEYWORDS)]
date_col = date_candidates[0] if date_candidates else None

# --- TEXT PREPROCESSING ---
def preprocess_text(text):
    """Normalize one raw post into lower-case ASCII words.

    Args:
        text: Raw value from the text column. Non-string values are
            stringified; missing values (None/NaN) are treated as empty.

    Returns:
        The cleaned string: lower-cased, URLs and @mentions removed, "#"
        dropped (the tag word itself is kept), every character other than
        a-z, 0-9, whitespace and the apostrophe replaced by a space, and
        runs of whitespace collapsed. May be empty.
    """
    if not isinstance(text, str):
        text = str(text) if pd.notna(text) else ""
    text = text.lower()
    # Fold typographic apostrophes to ASCII before the character filter;
    # otherwise "don’t" is split into "don t" and the contraction is lost.
    text = re.sub(r"[\u2018\u2019\u02bc]", "'", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"[^a-z0-9\s']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

print("Preprocessing text...")

# df[text_col] is a DataFrame when several columns share that label; in that
# case the values of each row are joined into one string.
# Missing cells must become "" rather than the literal "nan" that astype(str)
# would produce: "nan" would pass the empty-text filter applied after
# cleaning and then be scored and counted as a real word.
text_series = df[text_col]
if isinstance(text_series, pd.DataFrame):
    text_series = text_series.agg(
        lambda row: " ".join(str(v) for v in row if pd.notna(v)), axis=1
    )
else:
    text_series = text_series.map(lambda v: str(v) if pd.notna(v) else "")

texts_raw = text_series.tolist()
df["text_clean"] = [preprocess_text(x) for x in tqdm(texts_raw, desc="Cleaning")]

# --- DATE PARSING ---
# Values that cannot be parsed become NaT (errors="coerce"); when no date
# column was detected the whole "created_at" column is NaT.
df["created_at"] = pd.to_datetime(df[date_col], errors="coerce") if date_col else pd.NaT

# Drop rows whose cleaned text is empty and renumber the remainder from 0 so
# that frames built later with a default index line up with df.
has_text = df["text_clean"].str.strip() != ""
df = df.loc[has_text].reset_index(drop=True)

# --- SENTIMENT SCORING ---
print("Scoring sentiment...")
sid = SentimentIntensityAnalyzer()
# NOTE(review): VADER also weighs capitalization and punctuation, both of
# which text_clean has already removed — confirm that scoring the cleaned
# text (rather than the raw text) is intended.
# One score dict per row, expanded into columns (including "compound") and
# appended to df side by side.
scores = df["text_clean"].map(sid.polarity_scores)
scores_df = pd.DataFrame(scores.tolist())
df = pd.concat([df, scores_df], axis=1)

# Label each row from its compound score: >= 0.05 positive, <= -0.05
# negative, anything in between neutral.
is_positive = df["compound"] >= 0.05
is_negative = df["compound"] <= -0.05
df["sentiment"] = np.select([is_positive, is_negative], ["positive", "negative"], default="neutral")

# --- AGGREGATIONS ---
# Name the index explicitly before resetting it: pandas < 2.0 leaves the
# value_counts() index unnamed, which would produce an "index" column instead
# of the "sentiment" column that the CSV and the bar plot rely on.
sentiment_counts = (
    df["sentiment"].value_counts().rename_axis("sentiment").reset_index(name="count")
)
sentiment_counts.to_csv(f"{OUTPUT_DIR}/sentiment_counts.csv", index=False)

# Time-based aggregates are only produced when at least one timestamp parsed;
# otherwise both frames are None and the time-series plots are skipped.
if df["created_at"].notna().any():
    df["date"] = df["created_at"].dt.date

    # Per-day mean compound score and number of scored rows.
    time_agg = df.groupby("date")["compound"].agg(["mean", "count"]).reset_index()
    time_agg.to_csv(f"{OUTPUT_DIR}/time_agg.csv", index=False)

    # Per-day row counts, one column per sentiment label (0 where absent).
    daily_sent = df.groupby(["date", "sentiment"]).size().unstack(fill_value=0).reset_index()
    daily_sent.to_csv(f"{OUTPUT_DIR}/daily_sentiment_counts.csv", index=False)
else:
    time_agg = None
    daily_sent = None

# --- VISUALIZATION ---
sns.set(style="whitegrid")

# Sentiment distribution: one bar per sentiment label.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x="sentiment", y="count", data=sentiment_counts, ax=ax)
ax.set_title("Sentiment Distribution")
fig.savefig(f"{OUTPUT_DIR}/sentiment_distribution.png")
plt.close(fig)

# Compound score distribution: one violin per sentiment label.
fig, ax = plt.subplots(figsize=(8, 5))
sns.violinplot(x="sentiment", y="compound", data=df, ax=ax)
ax.set_title("Compound Score by Sentiment")
fig.savefig(f"{OUTPUT_DIR}/compound_distribution.png")
plt.close(fig)

# Time series plots (skipped when no timestamps were available)
if time_agg is not None:
    # Daily mean of the compound score.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(x="date", y="mean", data=time_agg, ax=ax)
    ax.set_title("Daily Average Sentiment")
    fig.savefig(f"{OUTPUT_DIR}/daily_compound_mean.png")
    plt.close(fig)

    # Stacked area of per-day counts; pandas creates the figure itself.
    area_ax = daily_sent.set_index("date").plot.area(figsize=(10, 5), alpha=0.6)
    area_ax.set_title("Daily Sentiment Counts")
    area_ax.figure.savefig(f"{OUTPUT_DIR}/daily_sentiment_stacked.png")
    plt.close(area_ax.figure)

# Word cloud built from all cleaned texts joined into one string, with the
# English stop words excluded.
stop_words = set(stopwords.words("english"))
corpus = " ".join(df["text_clean"])
wordcloud = WordCloud(
    width=1200,
    height=600,
    background_color="white",
    stopwords=stop_words,
).generate(corpus)
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
fig.savefig(f"{OUTPUT_DIR}/wordcloud.png")
plt.close(fig)

# Top words
tokenizer = TreebankWordTokenizer()


def get_top_words(texts, n=30):
    """Return the n most frequent words across texts.

    Each text is split with the module-level Treebank tokenizer; a token is
    counted only if it is purely alphabetic and not in stop_words.

    Args:
        texts: Iterable of strings to tokenize.
        n: Number of (word, count) pairs to return.

    Returns:
        A list of (word, count) tuples, most frequent first.
    """
    counts = Counter(
        token
        for text in texts
        for token in tokenizer.tokenize(text)
        if token.isalpha() and token not in stop_words
    )
    return counts.most_common(n)

# Count the most frequent words over all cleaned texts and export them.
top_words = get_top_words(df["text_clean"].tolist())
top_words_df = pd.DataFrame(top_words, columns=["word", "count"])
top_words_df.to_csv(f"{OUTPUT_DIR}/top_words.csv", index=False)

print("✅ Analysis complete. Results saved to:", OUTPUT_DIR)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\OWAIS\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OWAIS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading dataset...
Preprocessing text...


Cleaning: 100%|██████████| 1000/1000 [00:00<00:00, 87058.49it/s]
  df["created_at"] = pd.to_datetime(df[date_col], errors="coerce")


Scoring sentiment...
✅ Analysis complete. Results saved to: sentiment_output
