# 05 - Comprehensive Comparison: Public vs Media

## INSY 669 Text Analytics | GLP-1 Weight Loss Drugs

This notebook brings together all analyses for a comprehensive comparison:

1. TF-IDF keyword differences
2. Side effects coverage gap
3. Cosine similarity between corpora
4. Temporal sentiment trends
5. Key findings summary

In [None]:
# Standard library
import json
import warnings

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud

# Notebook-wide configuration: silence every warning and pick the plot style
# used by all figures below.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Columns that the analysis cells below read from BOTH data frames.
REQUIRED_COLUMNS = {'text', 'clean', 'date', 'compound'}
# Free-text columns that are later joined / vectorised as strings.
TEXT_COLUMNS = ['text', 'clean']

df_public = pd.read_csv('../data/public_with_sentiment.csv')
df_media = pd.read_csv('../data/media_with_sentiment.csv')

for corpus_name, frame in (('public', df_public), ('media', df_media)):
    # Fail here, with a clear message, rather than deep inside a plotting cell.
    missing = REQUIRED_COLUMNS - set(frame.columns)
    assert not missing, f"{corpus_name} data is missing expected columns: {sorted(missing)}"
    # read_csv turns empty fields into NaN by default; a NaN document would
    # break ' '.join(...) and TfidfVectorizer later on, so store '' instead.
    frame[TEXT_COLUMNS] = frame[TEXT_COLUMNS].fillna('')

# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open('../data/analysis_stats.json', encoding='utf-8') as f:
    analysis_stats = json.load(f)

print("Analysis stats loaded:")
for k, v in analysis_stats.items():
    print(f"  {k}: {v}")

## 5.1 TF-IDF Keyword Comparison

What terms does each corpus emphasize?

In [None]:
# Number of highest-scoring terms shown for each corpus.
TOP_N_TERMS = 20


def top_terms(features, mean_scores, top_n=TOP_N_TERMS):
    """Return the `top_n` (term, mean TF-IDF score) pairs, highest score first.

    Parameters
    ----------
    features : sequence of str
        Vocabulary terms, aligned with `mean_scores`.
    mean_scores : numpy.ndarray
        1-D array holding the mean TF-IDF score of each term.
    top_n : int
        How many terms to keep.
    """
    best_first = mean_scores.argsort()[-top_n:][::-1]
    return [(features[i], mean_scores[i]) for i in best_first]


def plot_top_terms(ax, ranked_terms, title, color):
    """Draw (term, score) pairs as horizontal bars with the best term on top."""
    terms, scores = zip(*ranked_terms)
    positions = range(len(terms))
    ax.barh(positions, scores, color=color, alpha=0.8)
    ax.set_yticks(positions)
    ax.set_yticklabels(terms)
    ax.set_xlabel('Mean TF-IDF score')
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.invert_yaxis()  # barh draws the first item at the bottom by default


# Public TF-IDF
# fillna(''): a NaN document (an empty field read back from CSV) would make
# fit_transform raise; an empty string simply contributes an all-zero row.
tfidf_pub = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=5)
X_pub = tfidf_pub.fit_transform(df_public['clean'].fillna('').astype(str))
feat_pub = tfidf_pub.get_feature_names_out()
mean_pub = np.asarray(X_pub.mean(axis=0)).ravel()
top_pub = top_terms(feat_pub, mean_pub)

# Media TF-IDF (same settings except for the lower min_df)
tfidf_med = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=3)
X_med = tfidf_med.fit_transform(df_media['clean'].fillna('').astype(str))
feat_med = tfidf_med.get_feature_names_out()
mean_med = np.asarray(X_med.mean(axis=0)).ravel()
top_med = top_terms(feat_med, mean_med)

fig, axes = plt.subplots(1, 2, figsize=(16, 8))
plot_top_terms(axes[0], top_pub, 'Top TF-IDF Terms: Public', '#2196F3')
plot_top_terms(axes[1], top_med, 'Top TF-IDF Terms: Media', '#FF9800')

plt.tight_layout()
plt.savefig('../figures/tfidf_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 5.2 Word Clouds

In [None]:
# Fixed seed so the word layout (and the saved figure) is identical on every run.
WORDCLOUD_SEED = 42


def draw_wordcloud(ax, texts, title, colormap):
    """Render a word cloud of `texts` on `ax` and return the WordCloud object.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    texts : pandas.Series
        One document per row; NaN entries are treated as empty documents.
    title : str
        Title shown above the cloud.
    colormap : str
        Matplotlib colormap name used to colour the words.
    """
    # fillna/astype: ' '.join raises TypeError on NaN (float) entries.
    corpus = ' '.join(texts.fillna('').astype(str))
    cloud = WordCloud(width=800, height=400, background_color='white',
                      colormap=colormap, random_state=WORDCLOUD_SEED).generate(corpus)
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.axis('off')
    return cloud


fig, axes = plt.subplots(1, 2, figsize=(16, 6))

wc1 = draw_wordcloud(axes[0], df_public['clean'], 'Public Opinion', 'Blues')
wc2 = draw_wordcloud(axes[1], df_media['clean'], 'Media Coverage', 'Oranges')

plt.tight_layout()
plt.savefig('../figures/wordclouds.png', dpi=150, bbox_inches='tight')
plt.show()

## 5.3 Side Effects Analysis

Do users and media discuss the same side effects?

In [None]:
side_effects = ['nausea', 'vomiting', 'diarrhea', 'constipation', 'headache', 'fatigue',
                'gastroparesis', 'pancreatitis', 'gallbladder', 'hair loss', 'sulfur burps',
                'stomach pain', 'anxiety', 'injection site', 'dizziness']

# A side effect is flagged when the media's mention RATE is below this fraction
# of the public's mention rate.
UNDERREPORT_RATIO = 0.3
# Mention rates are expressed per this many words of text.
RATE_PER_WORDS = 10_000


def lowercase_docs(frame):
    """Return the 'text' column as a list of lower-cased strings (NaN -> '')."""
    return frame['text'].fillna('').astype(str).str.lower().tolist()


def count_mentions(docs, term):
    """Count substring occurrences of `term`, one document at a time.

    Counting per document (instead of on one space-joined string) prevents a
    multi-word term such as 'hair loss' from matching across the boundary
    between two unrelated documents.
    """
    return sum(doc.count(term) for doc in docs)


def mention_rate(count, total_words):
    """Mentions per RATE_PER_WORDS words; 0.0 for an empty corpus."""
    return RATE_PER_WORDS * count / total_words if total_words else 0.0


public_docs = lowercase_docs(df_public)
media_docs = lowercase_docs(df_media)
public_words = sum(len(doc.split()) for doc in public_docs)
media_words = sum(len(doc.split()) for doc in media_docs)

se_data = []
for se in side_effects:
    pub_count = count_mentions(public_docs, se)
    med_count = count_mentions(media_docs, se)
    se_data.append({
        'side_effect': se,
        'public': pub_count,
        'media': med_count,
        'public_rate': mention_rate(pub_count, public_words),
        'media_rate': mention_rate(med_count, media_words),
    })

df_se = pd.DataFrame(se_data).sort_values('public', ascending=False)

fig, ax = plt.subplots(figsize=(12, 7))
x = np.arange(len(df_se))
width = 0.35
ax.barh(x - width/2, df_se['public'], width, label='Public', color='#2196F3', alpha=0.8)
ax.barh(x + width/2, df_se['media'], width, label='Media', color='#FF9800', alpha=0.8)
ax.set_yticks(x)
ax.set_yticklabels(df_se['side_effect'])
ax.set_title('Side Effects: Public Mentions vs Media Coverage', fontsize=14, fontweight='bold')
ax.set_xlabel('Mention Count')
ax.legend()
ax.invert_yaxis()
plt.tight_layout()
plt.savefig('../figures/side_effects.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nSide Effect Coverage Gap (Public mentions >> Media):")
for row in df_se.itertuples(index=False):
    if row.public > 0:
        # Compare size-normalised rates, not raw counts: with raw counts the
        # ratio mostly reflects how much larger one corpus is than the other.
        ratio = row.media_rate / row.public_rate
        gap = '⚠️ UNDERREPORTED' if ratio < UNDERREPORT_RATIO else '✓ Proportional'
        print(f"  {row.side_effect:20s} Public: {row.public:>4}  Media: {row.media:>4}  "
              f"(per 10k words: {row.public_rate:.2f} vs {row.media_rate:.2f})  {gap}")

## 5.4 Cosine Similarity Between Corpora

In [None]:
# Heuristic cut-offs used to put the similarity score into words.
MODERATE_OVERLAP = 0.3
HIGH_OVERLAP = 0.7


def describe_overlap(similarity):
    """Map a cosine similarity to a (label, explanatory sentence) pair."""
    if similarity > HIGH_OVERLAP:
        return 'High overlap', "The corpora use largely the same vocabulary."
    if similarity > MODERATE_OVERLAP:
        return ('Moderate overlap',
                "The corpora share some common terms but use distinctly different language.")
    return 'Low overlap', "The corpora share few common terms and use distinctly different language."


# Overall corpus similarity: each corpus is collapsed into one long document,
# so the TF-IDF matrix has exactly two rows (public, media).
tfidf_compare = TfidfVectorizer(max_features=2000, min_df=1)
# fillna/astype: ' '.join raises TypeError on NaN (float) entries.
combined = [' '.join(df_public['clean'].fillna('').astype(str)),
            ' '.join(df_media['clean'].fillna('').astype(str))]
X_compare = tfidf_compare.fit_transform(combined)
cos_sim = cosine_similarity(X_compare)[0, 1]

overlap_label, overlap_sentence = describe_overlap(cos_sim)
print(f"Cosine Similarity between Public and Media corpora: {cos_sim:.4f}")
print(f"\nInterpretation: {overlap_label} in vocabulary and themes.")
print(overlap_sentence)

## 5.5 Temporal Sentiment Trends

In [None]:
def monthly_mean_sentiment(frame):
    """Add a 'month' column ('YYYY-MM' strings) to `frame` and return the mean
    'compound' score per month as a Series indexed by those strings.

    Rows whose 'date' is missing are left out of the monthly averages.
    """
    periods = pd.to_datetime(frame['date']).dt.to_period('M')
    frame['month'] = periods.astype(str)
    # Group on the Period values rather than their string form: groupby drops
    # NaT keys, so missing dates do not become a bogus "NaT" month.
    monthly = frame['compound'].groupby(periods).mean()
    monthly.index = monthly.index.astype(str)
    return monthly


pub_monthly = monthly_mean_sentiment(df_public)
med_monthly = monthly_mean_sentiment(df_media)

# Matplotlib places string categories on the axis in order of first appearance.
# Plotting two series with different month sets would therefore append months
# that only occur in the second series at the far right, out of chronological
# order. Align both series on one sorted month axis ('YYYY-MM' sorts
# chronologically); months missing from a corpus become NaN, i.e. a gap.
all_months = sorted(set(pub_monthly.index) | set(med_monthly.index))
pub_monthly = pub_monthly.reindex(all_months)
med_monthly = med_monthly.reindex(all_months)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(all_months, pub_monthly.values, 'o-', color='#2196F3', label='Public', linewidth=2)
ax.plot(all_months, med_monthly.values, 's-', color='#FF9800', label='Media', linewidth=2)
ax.set_title('Monthly Sentiment Trends: Public vs Media', fontsize=14, fontweight='bold')
ax.set_xlabel('Month')
ax.set_ylabel('Average Compound Score')
ax.legend(fontsize=12)
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../figures/sentiment_timeline.png', dpi=150, bbox_inches='tight')
plt.show()

## 5.6 Summary of Key Findings

In [None]:
# Significance level for the t-test and heuristic cut-offs for describing the
# cosine similarity in words.
ALPHA = 0.05
MODERATE_OVERLAP = 0.3
HIGH_OVERLAP = 0.7

public_mean = analysis_stats['public_mean_sentiment']
media_mean = analysis_stats['media_mean_sentiment']
p_value = analysis_stats['p_value']
cosine = analysis_stats['cosine_similarity']

# Derive the verbal conclusions from the numbers instead of hard-coding them,
# so the summary cannot contradict the statistics it reports.
significance = 'significant' if p_value < ALPHA else 'not significant'

if public_mean < media_mean:
    direction = 'Public opinion is more negative than media coverage'
elif public_mean > media_mean:
    direction = 'Public opinion is more positive than media coverage'
else:
    direction = 'Public and media sentiment are equal on average'

if cosine > HIGH_OVERLAP:
    overlap = 'high overlap'
elif cosine > MODERATE_OVERLAP:
    overlap = 'moderate overlap'
else:
    overlap = 'low overlap'

print("=" * 60)
print("KEY FINDINGS SUMMARY")
print("=" * 60)
print(f"""
1. SENTIMENT GAP
   - Public mean sentiment: {public_mean:.4f}
   - Media mean sentiment:  {media_mean:.4f}
   - T-test p-value: {p_value:.6f} ({significance} at α={ALPHA})
   - {direction}

2. LANGUAGE DIFFERENCES
   - Public: personal experiences, side effects, costs, weight loss numbers
   - Media: clinical trials, market analysis, FDA regulation, public health
   - Cosine similarity: {cosine:.4f} ({overlap})

3. SIDE EFFECTS COVERAGE GAP
   - Users frequently discuss: nausea, constipation, sulfur burps, hair loss
   - Media underreports: everyday side effects, focuses on severe events
   - Practical concerns (cost, availability) dominate public discourse

4. TEMPORAL PATTERNS
   - Both corpora show sentiment fluctuations over time
   - Media sentiment tends to be more stable/neutral
   - Public sentiment more volatile, influenced by personal experiences

5. BUSINESS IMPLICATIONS
   - Pharmaceutical companies should address user-reported side effects
   - Insurance coverage and cost remain major barriers
   - Media framing may create unrealistic expectations
   - Patient support programs should focus on managing common side effects
""")
print("\n✅ Analysis complete!")