In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# --- 1. DOWNLOAD NLTK HELPERS (Only need to run once) ---
# Tokenizer models, the stop-word list and the WordNet data used by the
# lemmatizer are fetched one after another, in this order.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# --- 2. LOAD YOUR DATA ---
# Read the raw tweets written by the data-collection notebook.
try:
    df = pd.read_csv('../data/raw_tweets.csv')
except FileNotFoundError:
    print("ERROR: raw_tweets.csv not found.")
    print("Please run the '1_data_collection.ipynb' notebook first.")
    # Re-raise after the hint: without the file `df` is either undefined or
    # left over from an earlier run, so every later cell would fail or, worse,
    # silently process stale data. Stopping here keeps "Run All" honest.
    raise
else:
    print(f"Successfully loaded {len(df)} tweets.")

In [None]:
# --- 3. CREATE TEXT CLEANING FUNCTION ---
# Topic-specific terms that appear in nearly every tweet and would otherwise
# drown out the informative words.
custom_stopwords = [
    'flipkart', 'amazon', 'sale', 'diwali', 'great', 'indian',
    'festival', 'big', 'billion', 'days', 'rt',
]

# NLTK's English stop-word list extended with the custom terms above.
stop_words = set(stopwords.words('english')) | set(custom_stopwords)

# Shared lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a tweet into a space-separated string of lemmatized keywords.

    Steps, in order: strip URLs, strip @mentions and #hashtags, lowercase,
    drop punctuation, tokenize, discard stop words and tokens of two
    characters or fewer, then lemmatize what is left.

    Relies on the module-level ``lemmatizer`` and ``stop_words``.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (e.g. NaN from a
            missing CSV field) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives the filtering or the input is not a string.
    """
    if not isinstance(text, str):
        return ""

    # URLs are stripped before lowercasing, so the match must ignore case;
    # otherwise "HTTP://..." survives and is later fused into one junk token.
    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'@\w+|#\w+', '', text)          # Remove mentions and hashtags
    text = text.lower()                           # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)           # Remove punctuation
    tokens = nltk.word_tokenize(text)             # Tokenize

    cleaned_tokens = []
    for word in tokens:
        if word in stop_words or len(word) <= 2:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Check the stop list again after lemmatizing: an inflected form such
        # as "sales" is not a stop word itself but reduces to "sale", which is.
        if lemma in stop_words:
            continue
        cleaned_tokens.append(lemma)

    return " ".join(cleaned_tokens)

In [None]:
# --- 4. APPLY THE FUNCTION ---
# Clean every tweet and keep the result next to the original text.
df['CleanedText'] = df['Text'].map(clean_text)

print("\nText cleaning complete. Here's a sample:")
sample_preview = df[['Text', 'CleanedText']].head()
print(sample_preview)

# --- 5. SAVE THE PROCESSED FILE ---
# Written without the index so the CSV contains only the data columns.
output_path = '../data/processed_tweets.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned data saved to {output_path}")

In [None]:
# Install VADER for sentiment analysis
!pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np

# Load the cleaned data
df = pd.read_csv('../data/processed_tweets.csv')

analyzer = SentimentIntensityAnalyzer()

# Score the original tweet text when it is available. VADER's rules use
# negations ("not", "no", "don't"), punctuation and capitalization, all of
# which the cleaning step removes -- e.g. "not good" is cleaned to "good".
# Fall back to the cleaned text only if the raw column is missing.
score_column = 'Text' if 'Text' in df.columns else 'CleanedText'

# read_csv loads empty fields as NaN; fill them with '' so that astype(str)
# does not turn them into the literal word "nan".
texts_to_score = df[score_column].fillna('').astype(str)

# Get sentiment scores (VADER's compound score)
df['SentimentScore'] = texts_to_score.apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Classify sentiment based on the score: >= 0.05 is Positive, <= -0.05 is
# Negative, and everything in between falls through to the Neutral default.
conditions = [
    (df['SentimentScore'] >= 0.05),
    (df['SentimentScore'] <= -0.05),
]
values = ['Positive', 'Negative']

df['Sentiment'] = np.select(conditions, values, default='Neutral')

# Save the *final* processed file with sentiment scores
df.to_csv('../data/processed_tweets.csv', index=False)

print("Sentiment analysis complete and file updated.")
print(df[['CleanedText', 'SentimentScore', 'Sentiment']].head())

In [None]:
# Install nbformat for Plotly
!pip install nbformat

In [None]:
# 1. Sentiment Distribution Pie Chart
# Count tweets per sentiment label; the labels become the slice names.
sentiment_counts = df['Sentiment'].value_counts()

# Fixed colour per label so the chart reads the same on every run.
sentiment_palette = {'Positive':'#34A853', 'Negative':'#EA4335', 'Neutral':'#FBBC05'}

fig_pie = px.pie(
    names=sentiment_counts.index,
    values=sentiment_counts.values,
    color=sentiment_counts.index,
    color_discrete_map=sentiment_palette,
    title="Sentiment Distribution of Diwali Sale Tweets",
)

fig_pie.show()

In [None]:
# 2. Word Cloud for all tweets
# Tweets whose cleaned text is empty are NaN once the data has been through
# a CSV round-trip. Drop them first: astype(str) would otherwise turn each
# one into the literal word "nan" and it would show up in the cloud.
all_text = " ".join(df['CleanedText'].dropna().astype(str))

if all_text.strip(): # Check if the string is not empty
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title('Most Common Words in Tweets')
    plt.show()
else:
    print("No words found to generate a word cloud.")