# 04 - Word Associations (PMI & Lift)

## INSY 669 Text Analytics | GLP-1 Weight Loss Drugs

This notebook computes word associations using:

1. **Pointwise Mutual Information (PMI)** - measures how much more likely two words are to co-occur than would be expected by chance
2. **Lift** - ratio of observed co-occurrence to expected co-occurrence
3. **MDS Visualization** - projects document similarity into 2D space

In [None]:
import pandas as pdimport numpy as npimport matplotlib.pyplot as pltimport seaborn as snsfrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.manifold import MDSfrom sklearn.metrics.pairwise import cosine_similarityfrom collections import Counterimport warningswarnings.filterwarnings('ignore')plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Load the two pre-processed corpora used throughout this notebook.
# Paths are relative to the notebook's working directory.
PUBLIC_CSV = '../data/public_processed.csv'
MEDIA_CSV = '../data/media_processed.csv'

df_public = pd.read_csv(PUBLIC_CSV)
df_media = pd.read_csv(MEDIA_CSV)

## 4.1 PMI and Lift Computation

In [None]:
def compute_pmi_lift(texts, target_word, top_n=15, min_count=5):
    """Rank the words that co-occur with ``target_word`` by PMI.

    Co-occurrence is measured at document level: each document is reduced to
    its set of whitespace-separated tokens, so a word counts at most once per
    document.

    With ``N`` documents, ``P(w)`` the share of documents containing ``w`` and
    ``P(w, t)`` the share containing both ``w`` and the target ``t``::

        lift(w, t) = P(w, t) / (P(w) * P(t))
        pmi(w, t)  = log2(lift(w, t))

    Parameters
    ----------
    texts : iterable
        Documents as whitespace-tokenised strings. Non-string entries (for
        example NaN produced by ``pd.read_csv`` for empty cells) are treated
        as empty documents; they still count towards ``N``.
    target_word : str
        Token whose associations are computed.
    top_n : int, default 15
        Maximum number of rows returned.
    min_count : int, default 5
        Minimum number of documents a word must appear in to be reported.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``pmi``, ``lift`` (both rounded to 3 decimals) and
        ``co_count``, sorted by descending PMI. Ties are broken by descending
        ``co_count`` and then alphabetically by ``word`` so the output does
        not depend on set iteration order. The frame is empty (but keeps
        these columns) when ``texts`` is empty, the target never occurs, or
        no word passes ``min_count``.
    """
    columns = ['word', 'pmi', 'lift', 'co_count']

    # One token set per document; non-strings become empty documents instead
    # of raising AttributeError on ``.split()``.
    doc_tokens = [set(doc.split()) if isinstance(doc, str) else set()
                  for doc in texts]
    n_docs = len(doc_tokens)
    if n_docs == 0:
        # Nothing to count, and dividing by n_docs below would fail.
        return pd.DataFrame(columns=columns)

    target_count = sum(1 for tokens in doc_tokens if target_word in tokens)
    if target_count == 0:
        return pd.DataFrame(columns=columns)
    p_target = target_count / n_docs

    word_counts = Counter()  # documents containing each word
    co_counts = Counter()    # documents containing the word AND the target
    for tokens in doc_tokens:
        word_counts.update(tokens)
        if target_word in tokens:  # membership checked once per document
            co_counts.update(tokens - {target_word})

    results = []
    for word, co_count in co_counts.items():
        if word_counts[word] < min_count:
            continue
        p_word = word_counts[word] / n_docs
        p_co = co_count / n_docs
        # p_target and p_word are strictly positive here (both counts >= 1),
        # so the ratio is always defined.
        lift = p_co / (p_target * p_word)
        results.append({
            'word': word,
            'pmi': round(np.log2(lift), 3),
            'lift': round(lift, 3),
            'co_count': co_count,
        })

    df = pd.DataFrame(results, columns=columns)
    if df.empty:
        return df

    # Deterministic ordering: PMI first, then explicit tie-breakers.
    ranked = df.sort_values(['pmi', 'co_count', 'word'],
                            ascending=[False, False, True])
    return ranked.head(top_n).reset_index(drop=True)

## 4.2 Associations with Key Drug Terms

In [None]:
# Terms whose strongest document-level associations are reported below.
target_words = ['ozempic', 'wegovy', 'weight', 'nausea']

# Convert each corpus to a plain list once instead of on every loop pass.
public_docs = df_public['clean'].tolist()
media_docs = df_media['clean'].tolist()

report_cols = ['word', 'pmi', 'lift', 'co_count']
banner = '=' * 60

for target in target_words:
    print(f"\n{banner}")
    print(f"ASSOCIATIONS WITH '{target.upper()}'")
    print(f"{banner}")

    pmi_pub = compute_pmi_lift(public_docs, target)
    pmi_med = compute_pmi_lift(media_docs, target)

    # A corpus is skipped when the target never occurs in it or no word
    # passes the minimum-count filter.
    if not pmi_pub.empty:
        print(f"\n--- Public Corpus (top 10) ---")
        print(pmi_pub[report_cols].head(10).to_string(index=False))

    if not pmi_med.empty:
        print(f"\n--- Media Corpus (top 10) ---")
        print(pmi_med[report_cols].head(10).to_string(index=False))

## 4.3 PMI Visualization

In [None]:
# 2x2 grid of horizontal bar charts: top-10 PMI words per target term,
# computed on the public corpus only.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

public_docs = df_public['clean'].tolist()  # built once, reused per panel

for idx, target in enumerate(['ozempic', 'wegovy', 'weight', 'nausea']):
    row, col = divmod(idx, 2)  # fill the grid row by row
    ax = axes[row, col]

    pmi_pub = compute_pmi_lift(public_docs, target, top_n=10)
    if not pmi_pub.empty:
        ax.barh(pmi_pub['word'], pmi_pub['pmi'], color='#2196F3', alpha=0.8)

    ax.set_title(f'PMI with "{target}" (Public)', fontweight='bold')
    ax.invert_yaxis()  # highest PMI at the top of each panel

plt.tight_layout()
plt.savefig('../figures/pmi_grid.png', dpi=150, bbox_inches='tight')
plt.show()

## 4.4 MDS Visualization

Multidimensional Scaling (MDS) projects the high-dimensional TF-IDF space into 2D while approximately preserving the pairwise (cosine) distances between documents. The resulting plot shows how public and media documents cluster.

In [None]:
# --- Combine corpora ---------------------------------------------------------
# Missing 'clean' values (read_csv turns empty cells into NaN) are replaced by
# empty strings, because TfidfVectorizer rejects NaN documents.
all_clean = pd.concat([df_public['clean'], df_media['clean']]).fillna('')
n_pub, n_med = len(df_public), len(df_media)
all_labels = ['Public'] * n_pub + ['Media'] * n_med

# --- TF-IDF on the combined corpus -------------------------------------------
tfidf = TfidfVectorizer(max_features=1000, min_df=5)
X = tfidf.fit_transform(all_clean)

# --- Sample documents for the MDS plot ---------------------------------------
# Up to n_sample // 2 documents are drawn from each corpus. The size is capped
# at the corpus length because sampling without replacement raises ValueError
# when more items are requested than exist.
np.random.seed(42)
n_sample = 200
per_corpus = n_sample // 2
idx_pub = np.random.choice(n_pub, min(per_corpus, n_pub), replace=False)
# Media rows follow the public rows in X, hence the offset range.
idx_med = np.random.choice(range(n_pub, n_pub + n_med),
                           min(per_corpus, n_med), replace=False)
idx_sample = np.concatenate([idx_pub, idx_med])
X_sample = X[idx_sample]
labels_sample = [all_labels[i] for i in idx_sample]

# --- Cosine distance matrix --------------------------------------------------
dist_matrix = 1 - cosine_similarity(X_sample)
np.fill_diagonal(dist_matrix, 0)          # exact zero self-distance
dist_matrix = np.maximum(dist_matrix, 0)  # clip tiny negative round-off

# --- Fit MDS on the precomputed distances ------------------------------------
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42, max_iter=300)
coords = mds.fit_transform(dist_matrix)

# --- Plot --------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 8))
colors = ['#2196F3' if l == 'Public' else '#FF9800' for l in labels_sample]
ax.scatter(coords[:, 0], coords[:, 1], c=colors, alpha=0.6, s=30)
# Empty scatters exist only to create one legend entry per corpus.
ax.scatter([], [], c='#2196F3', label='Public Opinion', s=60)
ax.scatter([], [], c='#FF9800', label='Media Coverage', s=60)
ax.legend(fontsize=12)
ax.set_title('MDS Plot: Public vs Media Document Similarity', fontsize=14, fontweight='bold')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.savefig('../figures/mds_plot.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"MDS stress: {mds.stress_:.4f}")