# Exploratory Data Analysis on Reddit Sentiment Data

This notebook performs exploratory data analysis (EDA) on Reddit posts loaded from MongoDB. It scores the sentiment of each post, then visualizes the overall sentiment distribution and how sentiment varies across Canadian provinces.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.database.mongodb_client import MongoDBClient
from src.analysis.sentiment_analyzer import analyze_sentiment

# Set up MongoDB client
mongo_client = MongoDBClient()
db = mongo_client.get_database()

# Load data from MongoDB
# The cursor is materialized into a list so pandas builds one row per document.
reddit_data = pd.DataFrame(list(db.reddit_posts.find()))

# Fail fast with a readable message: an empty collection produces a frame with
# no columns at all, which would otherwise only surface in a later cell as an
# opaque KeyError. These are the fields the cells below read.
REQUIRED_COLUMNS = ('content', 'title', 'province')
missing_columns = [col for col in REQUIRED_COLUMNS if col not in reddit_data.columns]
if missing_columns:
    raise ValueError(
        f"reddit_posts returned {len(reddit_data)} document(s) but the loaded "
        f"data is missing required field(s): {missing_columns}"
    )

reddit_data.head()

In [2]:
# Data Cleaning and Preprocessing
# Keep only the posts that have text, then lowercase that text (normalization).
# Written as a chain that rebinds `reddit_data` instead of mutating it in place.
reddit_data = (
    reddit_data
    .dropna(subset=['content'])
    .assign(content=lambda posts: posts['content'].str.lower())
)

# Analyze sentiment
# Apply the project's analyze_sentiment helper to each post's normalized text.
reddit_data = reddit_data.assign(
    sentiment=reddit_data['content'].apply(analyze_sentiment)
)
reddit_data[['title', 'sentiment']].head()

In [3]:
# Visualizing Sentiment Distribution
# The type returned by analyze_sentiment is not visible from this notebook, so
# choose the plot from the column's dtype:
#   * float scores  -> histogram (a countplot would draw one bar per distinct
#                      float value, which is unreadable)
#   * anything else -> countplot, one bar per label/class
fig, ax = plt.subplots(figsize=(10, 6))
if pd.api.types.is_float_dtype(reddit_data['sentiment']):
    # 30 bins is an arbitrary default for a first look; tune as needed.
    ax.hist(reddit_data['sentiment'].dropna(), bins=30)
else:
    sns.countplot(data=reddit_data, x='sentiment', ax=ax)
ax.set_title('Sentiment Distribution of Reddit Posts')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [4]:
# Visualizing Sentiment by Province
# A boxplot needs a numeric y-axis. The type returned by analyze_sentiment is
# not visible from this notebook, so branch on the column's dtype:
#   * numeric scores -> boxplot of scores per province
#   * labels/classes -> grouped counts per province, coloured by sentiment
fig, ax = plt.subplots(figsize=(12, 8))
if pd.api.types.is_numeric_dtype(reddit_data['sentiment']):
    sns.boxplot(data=reddit_data, x='province', y='sentiment', ax=ax)
    ax.set_title('Sentiment Scores by Province')
    ax.set_ylabel('Sentiment Score')
else:
    sns.countplot(data=reddit_data, x='province', hue='sentiment', ax=ax)
    ax.set_title('Sentiment Distribution by Province')
    ax.set_ylabel('Count')
# Province names are long; rotate the tick labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Province')
plt.show()

In [5]:
# TODO: add further visualizations here,
# for example sentiment trends over time, word clouds, etc.