# Trump Speech Analysis - Visualization Dashboard

Comprehensive visualizations using Plotly (interactive) and Matplotlib/Seaborn (publication-ready).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
from wordcloud import WordCloud
import json

# Apply seaborn's 'whitegrid' style to the Matplotlib/Seaborn figures below.
sns.set_style('whitegrid')
# IPython magic (not plain Python): render Matplotlib figures inline in the notebook.
%matplotlib inline


## Load Data


In [None]:
# Load feature data
# Later cells read `df` and `transformed_data` unconditionally, so both names
# are always bound here -- to empty containers when no matching file exists --
# which lets those cells skip their plots instead of raising NameError.
data_dir = Path('../data/transformed')
csv_files = list(data_dir.glob('speeches_features_complete_*.csv'))

if csv_files:
    # Several exports may exist; use the most recently modified one.
    latest_file = max(csv_files, key=lambda p: p.stat().st_mtime)
    df = pd.read_csv(latest_file)
    if 'date' in df.columns:
        # errors='coerce' turns unparseable dates into NaT instead of failing.
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
    print(f"Loaded {len(df)} speeches with {len(df.columns)} features")
else:
    # Empty frame: every `... in df.columns` guard downstream evaluates False.
    df = pd.DataFrame()
    print("No data found")

# Load transformed data for text analysis
json_files = list(data_dir.glob('speeches_nlp_features_*.json'))
if json_files:
    latest_json = max(json_files, key=lambda p: p.stat().st_mtime)
    with open(latest_json, 'r', encoding='utf-8') as f:
        transformed_data = json.load(f)
    print(f"Loaded {len(transformed_data)} transformed speeches")
else:
    # Empty list: the n-gram cell iterates over nothing and draws no chart.
    transformed_data = []
    print("No NLP feature data found")


## 1. Interactive Sentiment Timeline (Plotly)


In [None]:
# Interactive line chart of the compound sentiment score per speech, by date.
if 'sentiment_compound' in df.columns and 'date' in df.columns:
    # Rows without a usable date cannot be placed on the time axis.
    df_sorted = df.sort_values('date').dropna(subset=['date'])

    # px.line raises ValueError for hover columns that are not in the frame,
    # so only request 'title' when the column is actually present.
    hover_cols = [col for col in ['title'] if col in df_sorted.columns]

    fig = px.line(df_sorted, x='date', y='sentiment_compound',
                  title='Sentiment Timeline Across Trump Speeches',
                  labels={'date': 'Date', 'sentiment_compound': 'Sentiment (Compound)'},
                  hover_data=hover_cols or None)

    # Dashed zero line separates positive from negative compound scores.
    fig.add_hline(y=0, line_dash="dash", line_color="red", opacity=0.5)
    fig.update_layout(height=500)
    fig.show()


## 2. Word Cloud from All Speeches


In [None]:
# Generate word cloud from cleaned text
cleaned_dir = Path('../data/cleaned')
csv_files = list(cleaned_dir.glob('speeches_cleaned_*.csv'))

if csv_files:
    # Several exports may exist; use the most recently modified one.
    latest_cleaned = max(csv_files, key=lambda p: p.stat().st_mtime)
    df_text = pd.read_csv(latest_cleaned)

    if 'cleaned_text' in df_text.columns:
        # astype(str) protects str.join from cells pandas parsed as numbers.
        all_text = ' '.join(df_text['cleaned_text'].dropna().astype(str))
    else:
        all_text = ''

    if all_text.strip():
        wordcloud = WordCloud(width=1200, height=600, background_color='white',
                              colormap='viridis', max_words=100).generate(all_text)

        plt.figure(figsize=(15, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud - All Trump Speeches', fontsize=20, pad=20)
        plt.tight_layout()
        plt.show()
    else:
        # WordCloud.generate raises ValueError when it is given no words,
        # so report the situation instead of crashing the cell.
        print("No cleaned text available for word cloud")


## 3. Emotion Heatmap


In [None]:
# Heatmap of the emotion_* columns, one row per speech title.
emotion_cols = [col for col in df.columns if col.startswith('emotion_')]

if 'title' in df.columns and emotion_cols:
    # Index by title and strip the 'emotion_' prefix for readable tick labels.
    emotion_data = (
        df.set_index('title')[emotion_cols]
        .rename(columns=lambda col: col.replace('emotion_', '').title())
    )

    heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 10))
    sns.heatmap(
        emotion_data,
        cmap='RdYlGn',
        center=0,
        cbar_kws={'label': 'Emotion Score'},
        ax=heatmap_ax,
    )
    heatmap_ax.set_title('Emotion Heatmap Across Speeches', fontsize=16, pad=20)
    heatmap_ax.set_xlabel('Emotion')
    heatmap_ax.set_ylabel('Speech')
    heatmap_fig.tight_layout()
    plt.show()


## 4. N-gram Frequency Bar Charts


In [None]:
# Aggregate per-speech n-gram counts and chart the most frequent ones.
from collections import Counter


def ngram_label(gram):
    """Return a hashable, printable label for one n-gram.

    Strings are returned unchanged; any other iterable of tokens (for
    example a list decoded from a JSON array) is joined with single spaces.
    """
    if isinstance(gram, str):
        return gram
    return ' '.join(str(token) for token in gram)


def collect_ngram_counts(speeches, key):
    """Sum the n-gram counts stored under ``speech['ngrams'][key]``.

    Args:
        speeches: Iterable of dicts, each optionally holding an ``'ngrams'``
            mapping.
        key: Entry of the ``'ngrams'`` mapping to read, e.g. ``'2gram'``.
            Its value may be a sequence of ``(gram, count)`` pairs or a
            ``{gram: count}`` mapping.

    Returns:
        Counter mapping each n-gram label to its total over all speeches.
    """
    totals = Counter()
    for speech in speeches:
        # A missing or null 'ngrams' entry simply contributes nothing.
        ngrams_data = speech.get('ngrams') or {}
        pairs = ngrams_data.get(key) or []
        if isinstance(pairs, dict):
            pairs = pairs.items()
        for gram, count in pairs:
            totals[ngram_label(gram)] += count
    return totals


def plot_top_ngrams(top_ngrams, title, color='steelblue'):
    """Draw a horizontal bar chart of ``top_ngrams`` (label -> frequency).

    Bars appear in the mapping's iteration order, first entry on top.
    """
    plt.figure(figsize=(12, 6))
    plt.barh(list(top_ngrams.keys()), list(top_ngrams.values()), color=color)
    plt.xlabel('Frequency')
    plt.title(title, fontsize=14)
    # barh places the first item at the bottom; flip so it ends up on top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()


# Extract top n-grams from transformed data
if 'transformed_data' in globals():
    all_bigrams = collect_ngram_counts(transformed_data, '2gram')
    all_trigrams = collect_ngram_counts(transformed_data, '3gram')

    # Plot top bigrams
    if all_bigrams:
        top_bigrams = dict(all_bigrams.most_common(15))
        plot_top_ngrams(top_bigrams, 'Top 15 Bigrams Across All Speeches')

    # Plot top trigrams
    if all_trigrams:
        top_trigrams = dict(all_trigrams.most_common(15))
        plot_top_ngrams(top_trigrams, 'Top 15 Trigrams Across All Speeches',
                        color='darkorange')


## 5. Readability Trends


In [None]:
# Readability over time: two stacked subplots, one per readability metric.
if 'date' in df.columns and 'readability_flesch_kincaid_grade' in df.columns:
    df_sorted = df.sort_values('date').dropna(subset=['date'])

    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=('Flesch-Kincaid Grade Level', 'Flesch Reading Ease'),
    )

    fk_trace = go.Scatter(
        x=df_sorted['date'],
        y=df_sorted['readability_flesch_kincaid_grade'],
        mode='lines+markers',
        name='FK Grade',
    )
    fig.add_trace(fk_trace, row=1, col=1)

    # The reading-ease column is optional; without it the second subplot stays empty.
    if 'readability_flesch_reading_ease' in df.columns:
        ease_trace = go.Scatter(
            x=df_sorted['date'],
            y=df_sorted['readability_flesch_reading_ease'],
            mode='lines+markers',
            name='Reading Ease',
            line={'color': 'orange'},
        )
        fig.add_trace(ease_trace, row=2, col=1)

    fig.update_xaxes(title_text="Date")
    for subplot_row in (1, 2):
        fig.update_yaxes(title_text="Score", row=subplot_row, col=1)
    fig.update_layout(height=700, title_text="Readability Trends Over Time")
    fig.show()


## 6. Political Theme Distribution


In [None]:
# Pie chart of each keywords_* column's total across all speeches
# (the 'keywords_total' column is excluded).
keyword_cols = [
    col for col in df.columns
    if col != 'keywords_total' and col.startswith('keywords_')
]

if keyword_cols:
    # Sum every cluster column, then turn 'keywords_some_theme' into 'Some Theme'.
    keyword_totals = df[keyword_cols].sum().rename(
        lambda col: col.replace('keywords_', '').replace('_', ' ').title()
    )

    fig = px.pie(
        names=keyword_totals.index,
        values=keyword_totals.values,
        title='Distribution of Political Themes',
    )
    fig.update_traces(textinfo='percent+label', textposition='inside')
    fig.update_layout(height=600)
    fig.show()


## 7. Psychological Features - Pronoun Analysis


In [None]:
# Mean of every pronoun_* column, drawn as a horizontal bar chart sorted by value.
pronoun_cols = [col for col in df.columns if col.startswith('pronoun_')]

if pronoun_cols:
    # Average each column, then turn 'pronoun_first_person' into 'First Person'.
    pronoun_means = df[pronoun_cols].mean().rename(
        lambda col: col.replace('pronoun_', '').replace('_', ' ').title()
    )

    pronoun_fig, pronoun_ax = plt.subplots(figsize=(10, 6))
    pronoun_means.sort_values().plot.barh(ax=pronoun_ax, color='coral')
    pronoun_ax.set_xlabel('Average Count per Speech')
    pronoun_ax.set_title('Pronoun Usage Patterns', fontsize=14)
    pronoun_fig.tight_layout()
    plt.show()
