# 03 - Sentiment Analysis

## INSY 669 Text Analytics | GLP-1 Weight Loss Drugs

This notebook applies VADER sentiment analysis to both corpora and compares the sentiment distributions between public opinion and media coverage.

In [None]:
# Standard library
import warnings

# Numerical / data-handling stack
import numpy as np
import pandas as pd
from scipy import stats

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Sentiment model
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Notebook-wide configuration: suppress all warnings and apply a shared
# matplotlib style and seaborn colour palette to every figure below.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## 3.1 Load Processed Data

In [None]:
# Locations of the two processed corpora (relative to the notebook directory).
public_csv = '../data/public_processed.csv'
media_csv = '../data/media_processed.csv'

df_public = pd.read_csv(public_csv)
df_media = pd.read_csv(media_csv)

# Report the number of rows in each corpus as a quick sanity check.
n_public, n_media = len(df_public), len(df_media)
print(f"Public corpus: {n_public} | Media corpus: {n_media}")

## 3.2 VADER Sentiment Scoring

We use VADER (Valence Aware Dictionary and sEntiment Reasoner), which is specifically designed for social media text and handles things like capitalization, punctuation emphasis, and slang.

In [None]:
sia = SentimentIntensityAnalyzer()

# Compound-score cut-offs used to turn the continuous score into a label.
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05

# VADER output key -> name of the column written to the corpus frame.
VADER_COLUMNS = {
    'neg': 'vader_neg',
    'neu': 'vader_neu',
    'pos': 'vader_pos',
    'compound': 'compound',
}


def classify_sentiment(compound):
    """Map a compound score to 'positive', 'negative' or 'neutral'.

    Scores strictly above POSITIVE_THRESHOLD are 'positive', scores strictly
    below NEGATIVE_THRESHOLD are 'negative', and everything else is 'neutral'.
    """
    if compound > POSITIVE_THRESHOLD:
        return 'positive'
    if compound < NEGATIVE_THRESHOLD:
        return 'negative'
    return 'neutral'


def add_vader_scores(df, analyzer, text_col='text'):
    """Add VADER score columns and a 'sentiment' label to ``df`` in place.

    Each document is scored exactly once; the four values returned by
    ``analyzer.polarity_scores`` are then fanned out into the columns named
    in VADER_COLUMNS. Values in ``text_col`` are passed through ``str()``
    before scoring, so non-string cells are scored as their string form.

    Parameters
    ----------
    df : pandas.DataFrame
        Corpus frame containing ``text_col``. Modified in place.
    analyzer : SentimentIntensityAnalyzer
        Object exposing ``polarity_scores(text) -> dict``.
    text_col : str, default 'text'
        Name of the column holding the document text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    records = [analyzer.polarity_scores(str(text)) for text in df[text_col]]
    # Passing `columns` keeps the expected keys present even for an empty corpus.
    scores = pd.DataFrame(records, index=df.index, columns=list(VADER_COLUMNS))
    for vader_key, column in VADER_COLUMNS.items():
        # Assign positionally so a non-unique index on `df` cannot break alignment.
        df[column] = scores[vader_key].to_numpy()
    df['sentiment'] = df['compound'].apply(classify_sentiment)
    return df


# Score all documents and classify sentiment for both corpora.
add_vader_scores(df_public, sia)
add_vader_scores(df_media, sia)

print("Public sentiment distribution:")
print(df_public['sentiment'].value_counts())
print("\nMedia sentiment distribution:")
print(df_media['sentiment'].value_counts())

## 3.3 Sentiment Distribution Comparison

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel:
# (axis, compound scores, histogram colour, mean-line colour, panel title)
hist_panels = [
    (axes[0], df_public['compound'], '#2196F3', 'blue',
     'Public Opinion Sentiment Distribution'),
    (axes[1], df_media['compound'], '#FF9800', 'darkorange',
     'Media Sentiment Distribution'),
]

for panel, scores, bar_color, mean_color, panel_title in hist_panels:
    mean_score = scores.mean()
    panel.hist(scores, bins=40, color=bar_color, alpha=0.8, edgecolor='white')
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_xlabel('VADER Compound Score')
    panel.set_ylabel('Frequency')
    # Dashed reference line at a compound score of zero.
    panel.axvline(x=0, color='red', linestyle='--', alpha=0.5, label='Neutral')
    # Solid line at the corpus mean, with the value shown in the legend.
    panel.axvline(x=mean_score, color=mean_color, linestyle='-', alpha=0.7,
                  label=f"Mean={mean_score:.3f}")
    panel.legend()

plt.tight_layout()
plt.savefig('../figures/sentiment_histograms.png', dpi=150, bbox_inches='tight')
plt.show()

## 3.4 Box Plot Comparison

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Build a long-form frame with one row per document. Each corpus is labelled
# before concatenation, so the label cannot drift out of step with the scores.
# ignore_index=True gives the result a fresh unique index instead of repeating
# the index labels of the two source frames.
data_box = pd.concat(
    [
        df_public[['compound']].assign(Corpus='Public Opinion'),
        df_media[['compound']].assign(Corpus='Media Coverage'),
    ],
    ignore_index=True,
).rename(columns={'compound': 'Compound Score'})

sns.boxplot(data=data_box, x='Corpus', y='Compound Score',
            palette=['#2196F3', '#FF9800'], ax=ax, width=0.5)
ax.set_title('Sentiment Comparison: Public vs Media', fontsize=14, fontweight='bold')
# Faint reference line at a compound score of zero.
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.3)
plt.savefig('../figures/sentiment_boxplot.png', dpi=150, bbox_inches='tight')
plt.show()

## 3.5 Sentiment Proportions

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Fixed wedge order and label -> colour mapping (green / red / grey).
# value_counts() sorts by frequency, so colours assigned by position would
# follow rank rather than label and could differ between the two pies.
sentiment_order = ['positive', 'negative', 'neutral']
colors = ['#4CAF50', '#F44336', '#9E9E9E']
color_for = dict(zip(sentiment_order, colors))

for idx, (df, title) in enumerate([(df_public, 'Public Opinion'), (df_media, 'Media Coverage')]):
    counts = df['sentiment'].value_counts()
    # Same wedge order in both pies; labels absent from this corpus are skipped.
    counts = counts.reindex([label for label in sentiment_order if label in counts.index])
    axes[idx].pie(counts, labels=counts.index, autopct='%1.1f%%',
                  colors=[color_for[label] for label in counts.index],
                  startangle=90, textprops={'fontsize': 12})
    axes[idx].set_title(title, fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../figures/sentiment_pies.png', dpi=150, bbox_inches='tight')
plt.show()

## 3.6 Statistical Test

In [None]:
# Compare mean compound sentiment between the two corpora.
public_scores = df_public['compound']
media_scores = df_media['compound']

public_mean = public_scores.mean()
media_mean = media_scores.mean()
mean_diff = public_mean - media_mean

# Independent-samples t-test in its Welch form (equal_var=False): nothing in
# this notebook establishes that the two corpora have equal variances, and the
# effect size below likewise uses the unpooled average variance.
t_stat, p_value = stats.ttest_ind(public_scores, media_scores, equal_var=False)
significant = p_value < 0.05

print("=" * 50)
print("STATISTICAL COMPARISON")
print("=" * 50)
print(f"\nPublic mean sentiment:  {public_mean:.4f}")
print(f"Media mean sentiment:   {media_mean:.4f}")
print(f"Difference:             {mean_diff:.4f}")
print(f"\nT-statistic:            {t_stat:.4f}")
print(f"P-value:                {p_value:.6f}")
print(f"\nSignificant at α=0.05:  {'Yes' if significant else 'No'}")
print(f"\nConclusion: {'The difference in sentiment between public and media is statistically significant.' if significant else 'No significant difference found.'}")

# Effect size (Cohen's d): mean difference divided by the square root of the
# average of the two sample variances.
d = mean_diff / np.sqrt((public_scores.var() + media_scores.var()) / 2)
print(f"\nCohen's d (effect size): {d:.4f}")

## 3.7 Sentiment by Source

In [None]:
# Compare sentiment across the sources that make up the public corpus
# (reddit vs webmd). Work on a copy so df_public itself is left untouched.
df_public_source = df_public.copy()

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    x='source',
    y='compound',
    data=df_public_source,
    ax=ax,
    palette=['#2196F3', '#66BB6A'],
)
ax.set_title('Sentiment by Source within Public Corpus', fontsize=14, fontweight='bold')
ax.set(xlabel='Source', ylabel='Compound Score')
plt.show()

# Print the mean and standard deviation of the compound score per source,
# in order of first appearance.
for src in df_public_source['source'].unique():
    src_mask = df_public_source['source'] == src
    src_compound = df_public_source.loc[src_mask, 'compound']
    print(f"{src}: mean={src_compound.mean():.4f}, std={src_compound.std():.4f}")