In [None]:
# Cell 1: Install required packages
%pip install pymongo python-dotenv pandas seaborn matplotlib ipython boto3 anthropic

Configuration

In [1]:
# Cell 2: Configuration Setup
import os
from dotenv import load_dotenv
from data_extractor import MongoDBExtractor
from datetime import datetime, timedelta
import pandas as pd

# Load environment variables
load_dotenv()

# Initialize MongoDB extractor
extractor = MongoDBExtractor()
extractor.connect()

# 1. Selected users for analysis (subset of 3 from config.py)
test_users = [
    'ptr_dvd',      # Active Kohelet Forum member
    'SagiBarmak',   # Prominent voice
    #'KoheletForum'  # Official account
]
print("Users to analyze:", test_users)

# 2. Define analysis periods
pre_war_end = '2023-10-07'    # Day before the war
post_war_start = '2024-12-30'  # War start date
days_back = 90                 # Days to analyze for each period

# Create timestamp objects for reference
pre_war_end_date = datetime.strptime(pre_war_end, '%Y-%m-%d')
post_war_start_date = datetime.strptime(post_war_start, '%Y-%m-%d')

# 3. Additional parameters
MODEL_NAME = "anthropic.claude-3-haiku-20240307-v1:0"  # Current model from analyzer.py

# Directory structure for data organization
DATA_DIRS = {
    'raw': os.path.join('data', 'raw'),
    'pre_war': os.path.join('data', 'raw', 'pre_war'),
    'post_war': os.path.join('data', 'raw', 'post_war'),
    'analysis': os.path.join('data', 'analysis'),
    'cleaned': os.path.join('data', 'cleaned')
}

# Create necessary directories
for dir_path in DATA_DIRS.values():
    os.makedirs(dir_path, exist_ok=True)

print(f"\nAnalysis Configuration:")
print(f"Pre-war period: {pre_war_end_date - timedelta(days=days_back)} to {pre_war_end_date}")
print(f"Post-war period: {post_war_start_date - timedelta(days=days_back)} to {post_war_start_date}")
print(f"Days analyzed per period: {days_back}")
print(f"Model: {MODEL_NAME}")

Successfully connected to MongoDB
Users to analyze: ['ptr_dvd', 'SagiBarmak']

Analysis Configuration:
Pre-war period: 2023-07-09 00:00:00 to 2023-10-07 00:00:00
Post-war period: 2024-10-01 00:00:00 to 2024-12-30 00:00:00
Days analyzed per period: 90
Model: anthropic.claude-3-haiku-20240307-v1:0


Fetch data

In [2]:
# Cell 3: Fetch and Clean Data
from tweet_cleaner import TweetCleaner
print("\nFetching and cleaning data for both periods...")

# Initialize tweet cleaner with all parameters enabled
cleaner = TweetCleaner(min_words=7, remove_mentions=True, remove_urls=True)

# Process pre-war data
print("\nProcessing pre-war period data...")
pre_war_df = extractor.extract_tweets_by_date_range(
    reference_date=pre_war_end,
    days_back=days_back,
    usernames=test_users,
    period_label='pre_war'
)
pre_war_cleaned = cleaner.clean_tweets(pre_war_df, period_label='pre_war')
print(f"Pre-war tweets after cleaning: {len(pre_war_cleaned)}")

# Process post-war data
print("\nProcessing post-war period data...")
post_war_df = extractor.extract_tweets_by_date_range(
    reference_date=post_war_start,
    days_back=days_back,
    usernames=test_users,
    period_label='post_war'
)
post_war_cleaned = cleaner.clean_tweets(post_war_df, period_label='post_war')
print(f"Post-war tweets after cleaning: {len(post_war_cleaned)}")


Fetching and cleaning data for both periods...

Processing pre-war period data...
Successfully connected to MongoDB
Fetching tweets from 2023-07-09 00:00:00 to 2023-10-07 23:59:59
Using timestamps from 1688850000 to 1696712399
Fetched 190 tweets for ptr_dvd
Fetched 161 tweets for SagiBarmak
Saved raw data to: data\raw\pre_war\tweets_pre_war_20250112_144506.csv

Cleaning tweets...
- Removing URLs
- Removing @mentions
- Filtering tweets with less than 7 words

Tweet counts before and after cleaning:
----------------------------------------------------------------------
SagiBarmak           - original:  161, cleaned:  118 (removed:   43,   26.7%)
ptr_dvd              - original:  190, cleaned:  142 (removed:   48,   25.3%)
----------------------------------------------------------------------
Total tweets - original: 351, after cleaning: 260
Total removed: 91 (25.9%)

Saved cleaned tweets to: data\cleaned\pre_war\cleaned_pre_war_20250112_144507.csv
Pre-war tweets after cleaning: 260

Pro

Basic users analysis

In [4]:
# Cell 4: Analyze Individual Users
from analyzer import TweetAnalyzer
print("\nAnalyzing individual users for both periods...")

# Initialize analyzer
analyzer = TweetAnalyzer(batch_size=50, max_retries=3)

# Process pre-war period
print("\nAnalyzing pre-war period...")
pre_war_analyses = []
for username in pre_war_cleaned['author_username'].unique():
    user_tweets = pre_war_cleaned[pre_war_cleaned['author_username'] == username].to_dict('records')
    analysis = analyzer.analyze_user_tweets(username, user_tweets)
    pre_war_analyses.append(analysis)
pre_war_merged = analyzer.merge_user_analyses(pd.concat(pre_war_analyses), period_label='pre_war')
print(f"Completed pre-war analysis for {len(pre_war_merged)} users")

# Process post-war period
print("\nAnalyzing post-war period...")
post_war_analyses = []
for username in post_war_cleaned['author_username'].unique():
    user_tweets = post_war_cleaned[post_war_cleaned['author_username'] == username].to_dict('records')
    analysis = analyzer.analyze_user_tweets(username, user_tweets)
    post_war_analyses.append(analysis)
post_war_merged = analyzer.merge_user_analyses(pd.concat(post_war_analyses), period_label='post_war')
print(f"Completed post-war analysis for {len(post_war_merged)} users")


Analyzing individual users for both periods...

Analyzing pre-war period...

Analyzing tweets for @ptr_dvd
Total tweets: 142
Number of batches: 3
✓ Batch 1/3 completed
✓ Batch 2/3 completed
✓ Batch 3/3 completed

Completed analysis for @ptr_dvd: 3 batches processed

Analyzing tweets for @SagiBarmak
Total tweets: 118
Number of batches: 3
✓ Batch 1/3 completed
✓ Batch 2/3 completed
✓ Batch 3/3 completed

Completed analysis for @SagiBarmak: 3 batches processed

Analyzing tweets for @KoheletForum
Total tweets: 70
Number of batches: 2
✓ Batch 1/2 completed
✓ Batch 2/2 completed

Completed analysis for @KoheletForum: 2 batches processed

Merging analyses for @KoheletForum
Total batches to analyze: 2

Merging analyses for @SagiBarmak
Total batches to analyze: 3

Merging analyses for @ptr_dvd
Total batches to analyze: 3

Saved merged analysis to: data\analysis\pre_war\merged_analysis_pre_war_20250112_113226.csv
Completed pre-war analysis for 3 users

Analyzing post-war period...

Analyzing tw

Enhanced users analysis

In [3]:
# Cell 4: Analyze Individual Users
from analyzer import TweetAnalyzer
from analyzer_enhanced import EnhancedTweetAnalyzer
print("\nAnalyzing individual users for both periods...")

# Initialize analyzers
basic_analyzer = TweetAnalyzer(batch_size=50, max_retries=3)
enhanced_analyzer = EnhancedTweetAnalyzer(batch_size=50, max_retries=3)

# Process pre-war period
print("\nAnalyzing pre-war period...")

# Step 1: Basic Analysis
pre_war_analyses = []
for username in pre_war_cleaned['author_username'].unique():
    user_tweets = pre_war_cleaned[pre_war_cleaned['author_username'] == username].to_dict('records')
    analysis = basic_analyzer.analyze_user_tweets(username, user_tweets)
    pre_war_analyses.append(analysis)
pre_war_merged = basic_analyzer.merge_user_analyses(pd.concat(pre_war_analyses), period_label='pre_war')
print(f"Completed basic pre-war analysis for {len(pre_war_merged)} users")

# Step 2: Enhanced Analysis (passing both merged analysis and cleaned tweets)
pre_war_enhanced = enhanced_analyzer.merge_user_analyses_enhanced(
    df=pre_war_merged,
    tweets_df=pre_war_cleaned,
    period_label='pre_war'
)
print(f"Completed enhanced pre-war analysis")

# Process post-war period
print("\nAnalyzing post-war period...")

# Step 1: Basic Analysis
post_war_analyses = []
for username in post_war_cleaned['author_username'].unique():
    user_tweets = post_war_cleaned[post_war_cleaned['author_username'] == username].to_dict('records')
    analysis = basic_analyzer.analyze_user_tweets(username, user_tweets)
    post_war_analyses.append(analysis)
post_war_merged = basic_analyzer.merge_user_analyses(pd.concat(post_war_analyses), period_label='post_war')
print(f"Completed basic post-war analysis for {len(post_war_merged)} users")

# Step 2: Enhanced Analysis (passing both merged analysis and cleaned tweets)
post_war_enhanced = enhanced_analyzer.merge_user_analyses_enhanced(
    df=post_war_merged,
    tweets_df=post_war_cleaned,
    period_label='post_war'
)
print(f"Completed enhanced post-war analysis")


Analyzing individual users for both periods...

Analyzing pre-war period...

Analyzing tweets for @ptr_dvd
Total tweets: 142
Number of batches: 3
✓ Batch 1/3 completed
✓ Batch 2/3 completed
✓ Batch 3/3 completed

Completed analysis for @ptr_dvd: 3 batches processed

Analyzing tweets for @SagiBarmak
Total tweets: 118
Number of batches: 3
✓ Batch 1/3 completed
✓ Batch 2/3 completed
✓ Batch 3/3 completed

Completed analysis for @SagiBarmak: 3 batches processed

Merging analyses for @SagiBarmak
Total batches to analyze: 3

Merging analyses for @ptr_dvd
Total batches to analyze: 3

Saved merged analysis to: data\analysis\pre_war\merged_analysis_pre_war_20250112_144549.csv
Completed basic pre-war analysis for 2 users

Adding enhanced metrics for pre_war period...

Analyzing enhanced metrics for @SagiBarmak
Found 118 tweets for enhanced analysis

Analyzing enhanced metrics for @SagiBarmak
Total tweets: 118
Number of batches: 3
✓ Enhanced Batch 1/3 completed

Raw LLM response for batch 1:
{
 