# Data Exploration - COVIDSenti Dataset

Comprehensive analysis of the dataset to understand:
- Total tweet counts
- News vs conversational distribution
- Sentiment distribution
- Example tweets by category

## Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter

# Notebook-wide defaults: seaborn grid style, default figure size, and a
# wider pandas column so tweet text is not truncated too early when displayed.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})
pd.options.display.max_colwidth = 150

## Load Dataset

In [None]:
# Read the parsed COVIDSenti tweets (the CSV that also carries syntactic parses).
parsed_csv = "../data/input_data/COVIDSenti/COVIDSenti_full_parsed.csv"
df = pd.read_csv(parsed_csv)

# Quick sanity check: row count, column names, and a peek at the first rows.
n_rows = len(df)
column_names = df.columns.tolist()
print(f"Total tweets: {n_rows:,}")
print(f"Columns: {column_names}")
print("\nFirst few rows:")
df.head(3)

## Basic Statistics

In [None]:
# Sentiment labels: absolute counts first, then the same breakdown in percent.
labels = df['label']
print("Sentiment Distribution:")
print(labels.value_counts())
print("\nPercentages:")
print(labels.value_counts(normalize=True) * 100)

# Per-tweet size features: character count and whitespace-separated word count.
tweets = df['tweet']
df['tweet_length'] = tweets.str.len()
df['word_count'] = tweets.str.split().str.len()

char_lengths = df['tweet_length']
word_counts = df['word_count']
print("\nTweet Statistics:")
print(f"  Average length: {char_lengths.mean():.0f} characters")
print(f"  Average words: {word_counts.mean():.1f}")
print(f"  Min words: {word_counts.min()}")
print(f"  Max words: {word_counts.max()}")

In [None]:
# Visualize sentiment distribution
label_counts = df['label'].value_counts()

# value_counts() orders labels by frequency, so tick captions and colours are
# looked up per label instead of being hard-coded in an assumed order.
# Labels missing from the mappings fall back to their raw name / a blue bar.
label_names = {'neu': 'Neutral', 'neg': 'Negative', 'pos': 'Positive'}
label_colors = {'neu': '#95a5a6', 'neg': '#e74c3c', 'pos': '#2ecc71'}
display_labels = [label_names.get(lbl, lbl) for lbl in label_counts.index]
colors = [label_colors.get(lbl, '#3498db') for lbl in label_counts.index]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart
label_counts.plot(kind='bar', ax=ax1, color=colors)
ax1.set_title('Sentiment Distribution (All Tweets)', fontsize=14)
ax1.set_xlabel('Sentiment')
ax1.set_ylabel('Count')
ax1.set_xticklabels(display_labels, rotation=0)

# Pie chart
label_counts.plot(kind='pie', ax=ax2, autopct='%1.1f%%', colors=colors)
ax2.set_ylabel('')
# Report the real number of rows rather than a hard-coded figure.
ax2.set_title(f'All {len(df):,} Tweets', fontsize=14)

plt.tight_layout()
plt.show()

## News vs Conversational Filter

**Improved Multi-Signal Filter:**

**Strong news patterns (immediate filter):**
- Pipe-separated title format, e.g. "Coronavirus | CDC Updates"
- "via @username" (shared news)
- "RT @" (retweets)
- **NEW:** "Organization on Topic:" (e.g., "Wyoming Public Health on Coronavirus:")
- **NEW:** Colon + quote marks (quoted statements)

**Conversational signals (hierarchical):**
1. **First-person pronouns** (STRONG): "I", "my", "me", "we", "our"
   - Always conversational, even with URL
   - Example: "I have to admit #Covid19 sounds better... https://..."

2. **Question marks**: Usually conversational (people asking questions)

3. **Opinion + punctuation**: Opinion words + "!" (a "?" is already covered by rule 2)

4. **Second-person + opinion**: "you/your" + opinion words
   - Distinguishes "your hands" (directive) from "your opinion" (conversational)

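5. **URLs without signals**: Likely news if no strong personal language (in the current implementation, any tweet without the conversational signals above is classified as news-like, with or without a URL)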

**Key improvement:** More nuanced than simple pattern matching - considers multiple signals and their combinations!

In [None]:
# Patterns that mark a tweet as news on their own. They are matched
# case-sensitively against the raw tweet text.
_STRONG_NEWS_PATTERNS = [
    re.compile(pattern)
    for pattern in (
        r"^[A-Z][a-z]+ \| ",
        r"\bvia @",
        r"^RT @",
        r"^[A-Z][a-z\s]+ on [A-Z][a-z]+:",
        r':\s*["\u201c\u2018]',
    )
]

# Hashtags that mark a tweet as news. They are matched as substrings of the
# lower-cased tweet, so "#fox" also catches e.g. "#foxnews".
_NEWS_HASHTAGS = (
    "#smartnews",
    "#breakingnews",
    "#breaking",
    "#news",
    "#topstory",
    "#headline",
    "#update",
    "#alert",
    "#cnn",
    "#fox",
    "#bbc",
    "#msnbc",
    "#reuters",
)

# Headline-style phrasings, matched case-sensitively against the raw text.
_HEADLINE_PATTERNS = [
    re.compile(pattern)
    for pattern in (
        r"^[A-Z][a-z\s]+ (man|woman|person|official|doctor|patient|resident)",
        r"\b(reports?|says?|confirms?|announces?|warns?|urges?)\s+(that|about)",
        r"^\w+\s+(is|was|has been|have been)\s+the\s+(first|second|latest)",
    )
]

# Pronouns are matched as whole words on the lower-cased text. Plain substring
# tests such as `"me " in text` also fire on "some ", "home ", "your " or
# "fauci ", and miss a pronoun at the very end of the tweet. Contractions
# (i'm, we've, you'll, ...) are covered because the apostrophe is a word
# boundary.
_FIRST_PERSON_RE = re.compile(r"\b(?:i|my|me|we|our)\b")
_SECOND_PERSON_RE = re.compile(r"\b(?:you|your)\b")

# Opinion words must start at a word boundary and may carry a simple
# inflection suffix ("thinks", "wanted", "feeling"). This keeps "whatever"
# from matching "hate" and "likely" from matching "like".
_OPINION_RE = re.compile(
    r"\b(?:think|feel|believe|hope|wish|hate|love|like|dislike|want|need"
    r"|afraid|worried|glad|happy|sad|angry|confused|admit|crap|damn|wow"
    r"|omg|wtf|lol|lmao)(?:s|es|d|ed|ing)?\b"
)


def is_news_like(tweet):
    """Heuristically classify a tweet as news-like (True) or conversational (False).

    Rules are applied in order and the first one that fires decides:

    1. A strong news pattern or a news hashtag -> news-like.
    2. A first-person pronoun or a question mark -> conversational.
    3. Fewer than five whitespace-separated tokens -> news-like.
    4. A headline-style phrasing -> news-like.
    5. An opinion word together with "!" or a second-person pronoun
       -> conversational.
    6. Anything else -> news-like.

    Args:
        tweet: The raw tweet text. A non-string value (for example the NaN
            pandas uses for a missing cell) is classified as news-like.

    Returns:
        bool: True if the tweet is classified as news-like, False otherwise.
    """
    # A missing value has no conversational signal; report it as news-like
    # instead of failing on `.lower()`.
    if not isinstance(tweet, str):
        return True

    tweet_lower = tweet.lower()

    if any(pattern.search(tweet) for pattern in _STRONG_NEWS_PATTERNS):
        return True

    if any(hashtag in tweet_lower for hashtag in _NEWS_HASHTAGS):
        return True

    # First-person language or a question decides "conversational" even when
    # the tweet also carries a URL.
    if _FIRST_PERSON_RE.search(tweet_lower) or "?" in tweet:
        return False

    if len(tweet.split()) < 5:
        return True

    if any(pattern.search(tweet) for pattern in _HEADLINE_PATTERNS):
        return True

    has_opinion = _OPINION_RE.search(tweet_lower) is not None
    has_exclamation = "!" in tweet
    has_second_person = _SECOND_PERSON_RE.search(tweet_lower) is not None

    if has_opinion and (has_exclamation or has_second_person):
        return False

    # NOTE: earlier revisions tested for URLs and institutional names (CDC,
    # WHO, ...) at this point, but every remaining path returned True
    # regardless, so those checks could not change the result. Tweets without
    # a conversational signal are news-like.
    return True
    

# Tag every tweet with the heuristic, then report how the corpus splits.
df['is_news'] = df['tweet'].apply(is_news_like)

news_mask = df['is_news']
total = len(df)
news_count = news_mask.sum()
conversational_count = (~news_mask).sum()

news_pct = news_count / total * 100
conv_pct = conversational_count / total * 100
print(f"  News-like tweets: {news_count:,} ({news_pct:.1f}%)")
print(f"  Conversational tweets: {conversational_count:,} ({conv_pct:.1f}%)")

In [None]:
# Share (in percent) of each sentiment label among the news-like tweets.
news_only = df[df['is_news']]
news_only["label"].value_counts() / len(news_only) * 100

In [None]:
# Text of the tweets that are both news-like and labelled negative.
negative_news = df['is_news'] & (df['label'] == "neg")
df.loc[negative_news, "tweet"]

## Sentiment Distribution: News vs Conversational

In [None]:
df_news = df[df['is_news']]
df_conv = df[~df['is_news']]

# Caption and colour for each sentiment label. Both panels look these up per
# label because value_counts() orders labels by frequency, which can differ
# between the two subsets.
label_names = {'neu': 'Neutral', 'neg': 'Negative', 'pos': 'Positive'}
label_colors = {'neu': '#95a5a6', 'neg': '#e74c3c', 'pos': '#2ecc71'}

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (ax1, df_news, 'News-like Tweets'),
    (ax2, df_conv, 'Conversational Tweets'),
]
for ax, subset, heading in panels:
    label_counts = subset['label'].value_counts()
    # Unknown labels keep their raw name and get a fallback blue bar.
    display_labels = [label_names.get(lbl, lbl) for lbl in label_counts.index]
    colors = [label_colors.get(lbl, '#3498db') for lbl in label_counts.index]

    label_counts.plot(kind='bar', ax=ax, color=colors)
    ax.set_title(f'{heading} (n={len(subset):,})', fontsize=12)
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')
    ax.set_xticklabels(display_labels, rotation=0)

plt.tight_layout()
plt.show()

print("\nNews-like sentiment:")
print(df_news['label'].value_counts())
print("\nConversational sentiment:")
print(df_conv['label'].value_counts())

## Example Tweets - News-like

In [None]:
print("="*100)
print("NEWS-LIKE TWEETS (filtered out for ABSA)")
print("="*100)

# Show random news tweets. The sample size is capped at the number of
# available rows, because DataFrame.sample raises when asked for more rows
# than exist (without replacement).
news_sample = df_news.sample(n=min(10, len(df_news)), random_state=42)
for i, (_, row) in enumerate(news_sample.iterrows(), 1):
    print(f"\n{i}. [{row['label'].upper()}] {row['tweet']}")

## Example Tweets - Conversational

In [None]:
banner = "=" * 100
print(banner)
print("CONVERSATIONAL TWEETS (kept for ABSA)")
print(banner)

# Draw a reproducible random sample of at most 20 conversational tweets.
sample_size = min(20, len(df_conv))
conv_sample = df_conv.sample(n=sample_size, random_state=42)
for position, (_, tweet_row) in enumerate(conv_sample.iterrows(), start=1):
    sentiment_tag = tweet_row['label'].upper()
    print(f"\n{position}. [{sentiment_tag}] {tweet_row['tweet']}")

## Conversational Tweets by Sentiment

In [None]:
# Print up to ten reproducibly sampled conversational tweets per sentiment,
# skipping any sentiment that has no conversational tweets.
sentiment_titles = {'neg': 'NEGATIVE', 'neu': 'NEUTRAL', 'pos': 'POSITIVE'}
rule = '=' * 100

for sentiment, sentiment_name in sentiment_titles.items():
    sentiment_tweets = df_conv[df_conv['label'] == sentiment]
    available = len(sentiment_tweets)
    if available == 0:
        continue

    print(f"\n{rule}")
    print(f"{sentiment_name} Conversational Tweets (n={available:,})")
    print(f"{rule}")

    sample = sentiment_tweets.sample(n=min(10, available), random_state=42)
    for position, (_, tweet_row) in enumerate(sample.iterrows(), start=1):
        print(f"\n{position}. {tweet_row['tweet']}")

## Tweet Length Analysis

In [None]:
# 2x2 grid of histograms: rows are the metric (words, characters), columns are
# the subset (news-like, conversational). A dashed line marks each mean.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

subsets = [
    ('News-like', df_news, '#e74c3c'),
    ('Conversational', df_conv, '#2ecc71'),
]
metrics = [
    ('word_count', 'Word Count', 'Words'),
    ('tweet_length', 'Character Length', 'Characters'),
]

for row_idx, (column, metric_title, unit) in enumerate(metrics):
    for col_idx, (subset_name, subset, shade) in enumerate(subsets):
        panel = axes[row_idx, col_idx]
        values = subset[column]
        panel.hist(values, bins=50, edgecolor='black', alpha=0.7, color=shade)
        panel.axvline(values.mean(), color='blue', linestyle='--', label='Mean')
        panel.set_title(f'{subset_name} Tweets - {metric_title}')
        panel.set_xlabel(unit)
        panel.set_ylabel('Frequency')
        panel.legend()

plt.tight_layout()
plt.show()

print("Average word count:")
for subset_name, subset, _ in subsets:
    print(f"  {subset_name}: {subset['word_count'].mean():.1f}")

## Export Conversational Subset

In [None]:
from pathlib import Path

output_path = "data/COVIDSenti/COVIDSenti_conversational_only.csv"

# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
# NOTE(review): the input is read from "../data/input_data/..." while this
# path is relative to the current working directory - confirm that this is
# the intended output location.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_conv.to_csv(output_path, index=False)

print(f"Saved {len(df_conv):,} conversational tweets to:")
print(f"  {output_path}")