# 01 - Data Collection

## INSY 669 Text Analytics | GLP-1 Weight Loss Drugs

This notebook documents our data collection process from three sources:

1. **Reddit** (r/Ozempic, r/Semaglutide, r/WegovyWeightLoss)
2. **WebMD** (patient reviews)
3. **News articles** (major health news outlets)

In [None]:
import pandas as pdimport numpy as npfrom bs4 import BeautifulSoupimport requestsimport timeimport osimport warningswarnings.filterwarnings('ignore')

## 1.1 Reddit Data Collection

We used web scraping to collect posts from GLP-1-related subreddits. The scraper collects post titles, body text, scores, and comment counts.

**Note:** Because of API rate limits, and to keep the analysis reproducible, we saved the collected data to CSV. The scraping code below shows our methodology.

In [None]:
# Reddit scraping approach (PRAW-based).
# Note: requires Reddit API credentials.
#
# The collection script is kept in a bare string literal, so it is never
# executed by this cell; the cell only loads the CSV saved from that run.
# The snippet now imports `datetime`, which it uses to format each post's
# creation timestamp (the import was missing from the documented code).
"""
import praw
from datetime import datetime

reddit = praw.Reddit(
    client_id='YOUR_CLIENT_ID',
    client_secret='YOUR_CLIENT_SECRET',
    user_agent='INSY669_TextAnalytics'
)

subreddits = ['Ozempic', 'Semaglutide', 'WegovyWeightLoss']
posts = []

for sub_name in subreddits:
    subreddit = reddit.subreddit(sub_name)
    for post in subreddit.hot(limit=300):
        posts.append({
            'id': post.id,
            'subreddit': f'r/{sub_name}',
            'text': f'{post.title}. {post.selftext}',
            'date': datetime.fromtimestamp(post.created_utc).strftime('%Y-%m-%d'),
            'score': post.score,
            'num_comments': post.num_comments
        })
        time.sleep(0.5)  # Rate limiting
"""

print("Reddit scraping code documented (see above)")
print("Loading pre-collected data...")

# Load the saved snapshot instead of re-running the scraper.
df_reddit = pd.read_csv('../data/reddit_posts.csv')
print(f"Reddit posts: {len(df_reddit)}")

# Last expression of the cell: rendered as the cell output.
df_reddit.head()

## 1.2 WebMD Reviews Collection

We scraped patient reviews from WebMD's drug review pages for Ozempic and Wegovy using BeautifulSoup.

In [None]:
# WebMD scraping approach.
#
# The scraper is preserved inside a bare string literal purely as a record of
# the methodology: it is not executed when this cell runs. The cell itself
# reads the review CSV that was saved earlier.
"""
from bs4 import BeautifulSoup
import requests

urls = [
    'https://www.webmd.com/drugs/drugreview-174491-ozempic-subcutaneous',
    'https://www.webmd.com/drugs/drugreview-180780-wegovy-subcutaneous'
]

reviews = []
for url in urls:
    for page in range(1, 20):
        response = requests.get(f'{url}?page={page}',
                               headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(response.text, 'html.parser')
        for review in soup.find_all('div', class_='review-comment'):
            text = review.find('p').text
            rating = review.find('span', class_='rating-score')
            reviews.append({
                'text': text,
                'rating': rating.text if rating else None,
                'drug': 'Ozempic' if 'ozempic' in url else 'Wegovy'
            })
        time.sleep(1)
"""

# Status messages first, then the actual load from disk.
print("WebMD scraping code documented (see above)")
print("Loading pre-collected data...")

df_webmd = pd.read_csv('../data/webmd_reviews.csv')
print(f"WebMD reviews: {len(df_webmd)}")

# Preview of the first rows as the cell's displayed result.
df_webmd.head()

## 1.3 News Articles Collection

We collected news articles from major health and general news outlets covering GLP-1 drugs.

In [None]:
# News scraping approach.
#
# Two collection routes are recorded in the string literal below (NewsAPI and a
# Google News page fetch parsed with BeautifulSoup). The literal is a bare
# expression, so none of it runs here; the cell loads the saved article CSV.
"""
# Approach 1: NewsAPI
import requests

API_KEY = 'YOUR_API_KEY'
query = 'Ozempic OR Wegovy OR semaglutide weight loss'
url = f'https://newsapi.org/v2/everything?q={query}&apiKey={API_KEY}&pageSize=100'
response = requests.get(url)
articles = response.json()['articles']

# Approach 2: Google News scraping with BeautifulSoup
from bs4 import BeautifulSoup

url = 'https://news.google.com/search?q=Ozempic+Wegovy+weight+loss'
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, 'html.parser')
"""

# Status messages first, then the actual load from disk.
print("News scraping code documented (see above)")
print("Loading pre-collected data...")

df_news = pd.read_csv('../data/news_articles.csv')
print(f"News articles: {len(df_news)}")

# Preview of the first rows as the cell's displayed result.
df_news.head()

## 1.4 Data Summary

In [None]:
# Roll-up of the three collected corpora.
# Row counts are taken once up front so every line below reports the same numbers.
n_reddit = len(df_reddit)
n_webmd = len(df_webmd)
n_news = len(df_news)
rule = "=" * 50

print(rule)
print("DATA COLLECTION SUMMARY")
print(rule)

# Per-source counts, right-aligned in a 6-character column.
print(f"\nReddit posts:     {n_reddit:>6}")
print(f"WebMD reviews:    {n_webmd:>6}")
print(f"News articles:    {n_news:>6}")
print(f"{'─' * 30}")
print(f"Total documents:  {n_reddit + n_webmd + n_news:>6}")

# Reddit and WebMD are reported together as the public corpus; news is the media corpus.
print(f"\nPublic corpus:    {n_reddit + n_webmd:>6} (Reddit + WebMD)")
print(f"Media corpus:     {n_news:>6} (News articles)")

# The reported range is taken from the Reddit frame's 'date' column only.
reddit_dates = df_reddit['date']
print(f"\nDate range: {reddit_dates.min()} to {reddit_dates.max()}")

print(f"\nReddit subreddits: {df_reddit['subreddit'].unique().tolist()}")
print(f"News sources: {df_news['source'].nunique()} unique outlets")