# Linguistic Analysis - Trump Speeches

A deep dive into the linguistic complexity, readability, and lexical diversity of the speeches.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json

# Apply seaborn's 'whitegrid' style to the plots drawn in this notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


## Load Feature Data


In [None]:
# Load features
# Look for feature exports matching the expected file-name pattern and load
# the most recently modified one into `df`.
data_dir = Path('../data/transformed')
csv_files = list(data_dir.glob('speeches_features_complete_*.csv'))

if csv_files:
    # The glob may match several files; pick the newest by modification time.
    latest_file = max(csv_files, key=lambda p: p.stat().st_mtime)
    print(f"Loading: {latest_file.name}")
    df = pd.read_csv(latest_file)
    print(f"Loaded {len(df)} speeches with {len(df.columns)} features")
else:
    # Always bind `latest_file` and `df`, even when nothing was found, so
    # later cells that read `df.columns` see an empty frame instead of
    # raising NameError (or silently reusing a stale `df` from a previous
    # run of this cell).
    latest_file = None
    df = pd.DataFrame()
    print("No feature data found. Please run feature engineering first.")


## Readability Metrics

Analyze Flesch-Kincaid, Gunning Fog, and other readability indices.


In [None]:
# Readability metrics
# Select every column whose name contains 'readability', draw a histogram for
# up to four of them on a 2x2 grid, then print summary statistics for all of
# the selected columns.
readability_cols = [c for c in df.columns if 'readability' in c]

if readability_cols:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()
    # Only as many columns as there are panels can be plotted.
    plotted_cols = readability_cols[:len(axes)]

    for ax, col in zip(axes, plotted_cols):
        # Drop missing values explicitly so only real observations are
        # binned and a column of missing values cannot upset the bin range.
        ax.hist(df[col].dropna(), bins=20, edgecolor='black', alpha=0.7)
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.set_title(col.replace('readability_', '').replace('_', ' ').title())

    # With fewer than four readability columns the remaining panels would be
    # drawn as empty axes; hide them instead.
    for ax in axes[len(plotted_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    print("\nReadability Statistics:")
    print(df[readability_cols].describe())
