In [4]:
import json
import pandas as pd
import altair as alt
import sys
sys.path.append("../scripts")
from data_processing import *
import geopandas as gpd
import unidecode


In [5]:
# Read the transcript JSON from disk (UTF-8, since the text is Spanish)
transcripts_path = "../data/processed/article_transcripts.json"
with open(transcripts_path, "r", encoding="utf-8") as fh:
    data = json.load(fh)

# Turn the loaded JSON structure into the DataFrame every later cell starts from.
# flatten_data is not defined in this notebook; presumably it comes from the
# data_processing star import.
df = flatten_data(data)

In [7]:
# ---- Shared look-and-feel for every chart in this notebook ----

# Palette: one named colour per role (hex value, common colour name)
color_president = "#9B2915"      # Rufous
color_journalist = "#254441"     # Dark slate gray
color_officials = "#BD8B9C"      # Puce
color_others = "#1EA896"         # Persian green
color_more = "#E9B872"           # Earth yellow
color_gray = "#4f4f4d"           # Medium gray

# Start from Altair's stock theme before layering the custom one on top
alt.themes.enable('default')

# Default chart size in pixels
base_props = {
    "width": 600,
    "height": 900
}

# Fonts used for chart titles and for axis labels/titles
TITLE_FONT = "Helvetica Neue"
LABEL_FONT = "Helvetica Neue"


def _mananeras_theme():
    """Return the Altair config for the notebook theme (fonts, text colour, no view border)."""
    title_cfg = {"fontSize": 22, "font": TITLE_FONT, "anchor": "middle", "color": "#333"}
    axis_cfg = {"labelFont": LABEL_FONT, "titleFont": LABEL_FONT, "labelColor": "#333"}
    view_cfg = {"strokeWidth": 0}
    return {"config": {"title": title_cfg, "axis": axis_cfg, "view": view_cfg}}


# Register the theme under its name, then make it the active one
alt.themes.register('mananeras_theme', _mananeras_theme)
alt.themes.enable('mananeras_theme')

ThemeRegistry.enable('mananeras_theme')

### 1. Conference length by weekday
The goal of this visualization is to analyze the length of the conferences by week and weekday.

In [8]:
# Frame behind the two heatmaps below. Those charts read its day_of_week,
# yearweek, words, date, x0, x1, title and n_conf columns.
# NOTE(review): get_daily_lengths is defined outside this notebook; the column
# list above is taken from how the charts use the result -- confirm against the helper.
df_1 = get_daily_lengths(df)

In [9]:
# Weekday names in calendar order; passed to the x channel as its sort order
cat_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Channels are defined one by one so the chart assembly below stays short
weekday_x = alt.X(
    "day_of_week:N",
    title=" ",
    sort=cat_order,
    axis=alt.Axis(labelAngle=360),
)
week_y = alt.Y("yearweek:N", title="Week (Year)", sort=None)
words_color = alt.Color(
    "words:Q",
    scale=alt.Scale(range=["#E1CBD3", color_president]),
    title="Words per day",
)

# Heatmap: one white-bordered rectangle per (week, weekday), shaded by word count
chart = (
    alt.Chart(df_1)
    .mark_rect(stroke="white")
    .encode(
        x=weekday_x,
        y=week_y,
        color=words_color,
        tooltip=["date:T", "words:Q", "yearweek:N"],
    )
    .properties(
        title="Length of Claudia Sheinbaum’s Conferences (Words per Day)",
        width=600,
        height=900,
    )
)
chart


### 2. Conference length by weekday (per conference)
The goal of this visualization is to analyze the length of the conferences by week and weekday. Unlike the previous chart, this one makes it possible to distinguish days with multiple conferences.

In [10]:
# Axis labels for the quantitative x. We need custom labels because a
# quantitative scale is used to create fixed-width day cells.
tick_vals = list(range(7))
# Vega expression that turns tick value 0..6 into a short weekday name
label_expr = "['Mon','Tue','Wed','Thu','Fri','Sat','Sun'][datum.value]"

# Heatmap with one rectangle per conference: each rectangle spans x0..x1 inside
# its weekday column, so days with several conferences show several cells.
# NOTE(review): x0/x1 are produced by get_daily_lengths -- confirm their layout there.
chart = (
    alt.Chart(df_1)
    .mark_rect(stroke='white')
    .encode(
        x=alt.X('x0:Q',  # left edge of the cell
                title=' ',
                scale=alt.Scale(domain=[-0.001, 6.999]),  # exactly 7 day cells
                axis=alt.Axis(values=tick_vals, labelExpr=label_expr)),
        x2='x1:Q',  # right edge of the cell
        y=alt.Y("yearweek:N", title="Week (Year)", sort=None),
        color=alt.Color('words:Q',
                        title='Words per conference',
                        scale=alt.Scale(range=["#E1CBD3", color_president])),
        tooltip=[
            alt.Tooltip('date:T', title='Date'),
            alt.Tooltip('title:N', title='Title'),
            alt.Tooltip('words:Q', title='Word count', format=','),
            alt.Tooltip('n_conf:Q', title='# conferences that day')
        ]
    )
    .properties(
        # The title used to say "Words per Day" (copied from the previous chart),
        # but each cell here is a single conference, as the colour legend states.
        title="Length of Claudia Sheinbaum’s Conferences (Words per Conference)",
        width=600,
        height=900
    )
)
chart

### 3. Top speakers at the conferences (by number of interventions)
I aim to visualize who the top speakers are during the Mañaneras, ranked by their number of interventions.

In [11]:
# The 10 speakers with the most interventions; the chart reads the speaker,
# count and pct_of_total columns of the result.
df_3 = get_top_speakers(df, 10)

# Hover details shown on each bar
speaker_tooltip = [
    alt.Tooltip("speaker:N", title="Speaker"),
    alt.Tooltip("count:Q", title="Count", format=","),
    alt.Tooltip("pct_of_total:Q", title="% of Total", format=".1%"),
]

# Horizontal bars, longest first (sort="-x" orders speakers by descending count)
chart_top_speakers = (
    alt.Chart(df_3)
    .mark_bar(color=color_journalist, cornerRadius=3)
    .encode(
        x=alt.X("count:Q", title="Number of Interventions"),
        y=alt.Y("speaker:N", sort="-x", title="Speaker"),
        tooltip=speaker_tooltip,
    )
    .properties(title="Top 10 Speakers by Number of Interventions", width=800, height=400)
)

# Numeric value printed just to the right of each bar end
labels = (
    alt.Chart(df_3)
    .mark_text(align="left", dx=5, color="#333")
    .encode(
        x="count:Q",
        y=alt.Y("speaker:N", sort="-x"),
        text=alt.Text("count:Q", format=","),
    )
)

# Layer the labels over the bars
chart_top_speakers + labels

### 4. Top speakers at the conferences (by number of words)
I aim to visualize who the top speakers are during the Mañaneras, this time taking into account both the length and the occurrence of their interventions (total words spoken).

In [12]:
# Top 10 speakers ranked by words spoken. The chart in the next cell reads the
# speaker_clean, total_words and pct_of_total columns of this frame.
df_4 = get_top_speakers_by_words(df, 10)

In [13]:
# Hover details shown on each bar
speaker_tooltip = [
    alt.Tooltip("speaker_clean:N", title="Speaker"),
    alt.Tooltip("total_words:Q", title="Number of words", format=","),
    alt.Tooltip("pct_of_total:Q", title="% of Total", format=".1%"),
]

# Horizontal bars, ordered by descending word total
chart_top_speakers = (
    alt.Chart(df_4)
    .mark_bar(color=color_journalist, cornerRadius=3)
    .encode(
        x=alt.X("total_words:Q", title="Number of words spoken"),
        y=alt.Y("speaker_clean:N", sort="-x", title="Speaker"),
        tooltip=speaker_tooltip,
    )
    .properties(title="Top 10 Speakers by Number of Words", width=800, height=400)
)

# Word total printed just to the right of each bar end
labels = (
    alt.Chart(df_4)
    .mark_text(align="left", dx=5, color="#333")
    .encode(
        x="total_words:Q",
        y=alt.Y("speaker_clean:N", sort="-x"),
        text=alt.Text("total_words:Q", format=","),
    )
)

# Layer the labels over the bars
chart_top_speakers + labels

### 5. Turn-taking structure of conferences by time. 
I want to explore how much the president lets journalists speak.

In [14]:
# Turn-taking statistics, one row per date as returned by get_turn_taking_stats.
# Dates are parsed (unparseable values become NaT) and rows are put in
# chronological order BEFORE smoothing: the rolling mean below works on row
# position, so it is only a moving average over time if the rows are date-sorted.
# The stable sort leaves an already-sorted frame untouched.
df_5 = (
    get_turn_taking_stats(df)
    .assign(date=lambda d: pd.to_datetime(d["date"], errors="coerce"))
    .sort_values("date", kind="stable")
)

# Mean of the ratio over the current and previous 6 rows; min_periods=1 lets the
# first rows average over whatever is available instead of producing NaN.
# NOTE(review): this is a 7-row window, which equals 7 calendar days only if
# there is exactly one row per consecutive day.
df_5["ratio_smooth"] = df_5["ratio_president_journalist"].rolling(7, min_periods=1).mean()


In [15]:
# Hover details for the raw (unsmoothed) series
ratio_tooltip = [
    alt.Tooltip("date:T", title="Date"),
    alt.Tooltip("president_turns:Q", title="President Turns"),
    alt.Tooltip("journalist_turns:Q", title="Journalist Turns"),
    alt.Tooltip("ratio_president_journalist:Q", title="Ratio", format=".2f"),
]

# Raw president/journalist turn ratio per date
chart_ratio = (
    alt.Chart(df_5)
    .mark_line(point=False, color=color_more)
    .encode(
        x=alt.X("date:T", title=" "),
        y=alt.Y("ratio_president_journalist:Q", title="President / Journalist Turn Ratio"),
        tooltip=ratio_tooltip,
    )
    .properties(
        title="Balance of Interventions: President vs Journalists",
        width=1000,
        height=350,
    )
)

# Smoothed series (ratio_smooth column), drawn thicker in the president colour
chart_ratio_smooth = (
    alt.Chart(df_5)
    .mark_line(color=color_president, strokeWidth=3)
    .encode(
        x="date:T",
        y=alt.Y("ratio_smooth:Q", title="7-Day Average Ratio"),
        tooltip=["date:T", alt.Tooltip("ratio_smooth:Q", format=".2f")],
    )
    .properties(
        title="7-Day Rolling Average of President/Journalist Turn Ratio",
        width=1000,
        height=350,
    )
)

# Overlay the smoothed line on the raw one
chart_ratio + chart_ratio_smooth

### 6. Turn-taking structure of conferences over time (share of turns)
I want to explore how much the president lets journalists speak. Here each category is expressed as a percentage of the total turns.

In [16]:
# Turns taken by anyone who is neither the president nor a journalist
df_5["others_turns"] = (
    df_5["total_turns"]
    .sub(df_5["president_turns"])
    .sub(df_5["journalist_turns"])
)

# Source column -> label shown in the chart legends
turn_labels = {
    "president_turns": "Presidenta",
    "journalist_turns": "Journalists",
    "others_turns": "Others",
}

# Reshape wide -> long for Altair: one row per (date, speaker category)
turns_long = pd.melt(
    df_5,
    id_vars=["date", "total_turns"],
    value_vars=list(turn_labels),
    var_name="speaker_type",
    value_name="n_turns",
)

# Each category's share of that date's total turns
turns_long["pct"] = turns_long["n_turns"].div(turns_long["total_turns"])

# Swap the raw column names for the display labels
turns_long["speaker_type"] = turns_long["speaker_type"].map(turn_labels)

In [17]:
# Fixed colour per speaker category, reused by the category charts
color_map = {
    "Presidenta": color_president,
    "Journalists": color_journalist,
    "Others": color_more
}

# Colour channel with the categories pinned to their colours
category_color = alt.Color(
    "speaker_type:N",
    title="Category",
    scale=alt.Scale(
        domain=list(color_map.keys()),
        range=list(color_map.values()),
    ),
)

# Hover details for each bar segment
share_tooltip = [
    alt.Tooltip("date:T", title="Date"),
    alt.Tooltip("speaker_type:N", title="Category"),
    alt.Tooltip("pct:Q", title="Share", format=".1%"),
    alt.Tooltip("n_turns:Q", title="Turns"),
]

# Stacked bars per date, normalised so every bar fills the full height
chart_turns_share = (
    alt.Chart(turns_long)
    .mark_bar()
    .encode(
        x=alt.X("date:T", title="Date"),
        y=alt.Y("pct:Q", title="Share of Speaking Turns", stack="normalize"),
        color=category_color,
        tooltip=share_tooltip,
    )
    .properties(
        title="Distribution of Speaking Turns by Category Over Time",
        width=1000,
        height=350,
    )
)
chart_turns_share

In [18]:
# Compute weekly share to avoid a lot of noise in daily data
weekly_share = (
    turns_long.assign(week=turns_long["date"].dt.isocalendar().week,
                      year=turns_long["date"].dt.year)
    .groupby(["year", "week", "speaker_type"], as_index=False)
    .agg(pct=("pct", "mean"))
)

weekly_share["yearweek"] = (
    weekly_share["year"].astype(str) + "-" + weekly_share["week"].astype(str).str.zfill(2)
)

chart_weekly = (
    alt.Chart(weekly_share)
    .mark_bar()
    .encode(
        x=alt.X("yearweek:N", title="Week"),
        y=alt.Y("pct:Q", title="Share of Turns", stack="normalize"),
        color=alt.Color("speaker_type:N",
                        title="Category",
                        scale=alt.Scale(domain=list(color_map.keys()),
                                        range=list(color_map.values()))),
        tooltip=["yearweek", "speaker_type:N", alt.Tooltip("pct:Q", format=".1%")]
    )
    .properties(title="Weekly Distribution of Speaking Turns by Category", width=1000, height=350)
)
chart_weekly

### 7. Average length by weekday
I am also interested in learning whether conferences tend to be longer on some weekdays than on others.

In [20]:
# Average conference length per weekday. The chart in the next cell reads the
# weekday and avg_words columns of this frame.
df_7 = get_avg_length_by_weekday(df)

In [21]:
# Weekday names in calendar order; passed to the x channel as its sort order
day_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]

# Mean of the per-weekday averages (each weekday counts once, regardless of
# how many conferences fell on it)
overall_mean = df_7["avg_words"].mean()

weekday_tooltip = [
    alt.Tooltip("weekday:N", title="Weekday"),
    alt.Tooltip("avg_words:Q", title="Avg. Words", format=",.0f"),
]

# One bar per weekday
chart_weekday_avg = (
    alt.Chart(df_7)
    .mark_bar(color=color_more, cornerRadius=3)
    .encode(
        x=alt.X("weekday:N", title=" ", sort=day_order, axis=alt.Axis(labelAngle=360)),
        y=alt.Y("avg_words:Q", title="Average Number of Words"),
        tooltip=weekday_tooltip,
    )
    .properties(
        title="Average Length of Claudia Sheinbaum’s Conferences by Weekday",
        width=600,
        height=400,
    )
)

# Single-row frame that positions both the reference rule and its caption
mean_frame = pd.DataFrame({"y": [overall_mean]})

# Dashed horizontal rule at the mean
mean_line = (
    alt.Chart(mean_frame)
    .mark_rule(color=color_gray, strokeDash=[4, 2])
    .encode(y="y:Q")
)

# Caption for the rule, nudged right and slightly up
mean_label = (
    alt.Chart(mean_frame)
    .mark_text(
        text=f"Overall mean ({overall_mean:,.0f} words)",
        align="left",
        dx=8,
        dy=-5,
        color=color_gray,
    )
    .encode(y="y:Q")
)

# Bars + rule + caption as one layered chart
chart_weekday_avg + mean_line + mean_label


### 8. Topic occurrence by week
I also want to learn how certain topics are covered by week.

In [22]:
# Define topics and associated keywords
# Maps each topic label (as displayed in the chart legend) to the list of
# keywords / short phrases that count as a mention of that topic.
# All keywords are written in lowercase and WITHOUT accents ("educacion",
# "migracion", ...).
# NOTE(review): presumably get_topics_by_week matches them against
# accent-stripped, lowercased text -- confirm against the helper.
# NOTE(review): several entries are very short or generic ("gn", "rio",
# "trabajo", "estados unidos"); whether they match as whole words or as
# substrings depends on get_topics_by_week -- verify to avoid over-counting.
topics = {
    # Education: schools, teachers, students, scholarships, institutions
    "Educación": [
        "escuela", "escuelas", "maestro", "maestros", "profesor", "profesores",
        "educacion", "educativo", "educativa", "estudiante", "estudiantes",
        "alumno", "alumnos", "universidad", "universidades", "colegio", "colegios",
        "campus", "beca", "becas", "formacion", "ensenanza", "aprendizaje",
        "docente", "docentes", "conalep", "tecnologico", "ipn", "unam", "politecnico"
    ],
    
    # Migration: migrants, borders, asylum, deportation
    "Migración": [
        "migrante", "migrantes", "migracion", "inmigrante", "inmigrantes",
        "frontera", "fronteras", "caravana", "caravanas", "asilo", "refugio",
        "refugiado", "refugiados", "deportacion", "deportado", "regularizacion",
        "estados unidos", "eeuu", "ee uu", "mexico-estados unidos", "cruce", "cruzar",
        "movilidad humana", "centroamerica", "venezolano", "haitiano"
    ],
    
    # Poverty: income, employment, welfare programmes, pensions
    "Pobreza": [
        "pobreza", "pobre", "pobres", "desigualdad", "marginalidad", "carencia",
        "ingreso", "ingresos", "salario", "salarios", "empleo", "trabajo", "trabajadores",
        "bienestar", "ayuda", "ayudas", "subsidio", "subsidios", "transferencia", "transferencias",
        "programa social", "programas sociales", "prospera", "oportunidades",
        "pensiones", "pension", "adultos mayores", "familias", "hogares", "comunidad",
        "economia popular"
    ],
    
    # Health: hospitals, health institutions, staff, vaccines, medicines
    "Salud": [
        "salud", "hospital", "hospitales", "clinica", "clinicas", "centro de salud",
        "imss", "issste", "insabi", "medico", "medicos", "doctor", "doctora",
        "enfermero", "enfermera", "enfermeras", "vacuna", "vacunas", "covid", "covid19",
        "pandemia", "enfermedad", "enfermedades", "atencion medica", "servicios medicos",
        "salubridad", "medicamento", "medicamentos", "prevencion", "campana de vacunacion"
    ],
    
    # Security: crime, police, armed forces, drug trafficking
    "Seguridad": [
        "seguridad", "violencia", "delincuencia", "delincuente", "delito", "delitos",
        "crimen", "criminal", "criminales", "policia", "policias", "guardia nacional",
        "gn", "ejercito", "militar", "militares", "marina", "sedena", "defensa", "homicidio",
        "feminicidio", "narco", "narcotrafico", "cartel", "carteles", "armas", "combate",
        "operativo", "detencion", "captura", "seguridad publica"
    ],
    
    # Environment: water, forests, energy, climate, pollution
    "Medio Ambiente": [
        "medio ambiente", "ambiente", "ecologia", "ecologico", "sustentable", "sostenible",
        "agua", "rio", "rios", "laguna", "lagunas", "bosque", "bosques", "selva", "selvas",
        "deforestacion", "reforestacion", "energia", "energias", "renovable", "solar", "eolica",
        "clima", "climatico", "cambio climatico", "contaminacion", "reciclaje", "naturaleza",
        "biodiversidad", "animales", "flora", "fauna", "aire limpio", "medioambiental"
    ]
}


# Weekly topic frame; the chart in the next cell reads its yearweek, topic and
# share_smooth columns.
df_8 = get_topics_by_week(df, topics)

In [23]:
# One line colour per topic; keys must match the topic labels in df_8
topic_colors = {
    "Educación": color_president,
    "Migración": color_journalist,
    "Pobreza": color_officials,
    "Salud": color_others,
    "Seguridad": color_more,
    "Medio Ambiente": "#E76F51",
}

# Legend with larger swatches and text so the six topics stay readable
topic_legend = alt.Legend(
    title="Topic",
    symbolStrokeWidth=4,
    symbolSize=150,
    labelFontSize=13,
    titleFontSize=14,
)

# Colour channel with topics pinned to their colours
topic_color = alt.Color(
    "topic:N",
    title="Topic",
    scale=alt.Scale(
        domain=list(topic_colors.keys()),
        range=list(topic_colors.values()),
    ),
    legend=topic_legend,
)

topic_tooltip = [
    alt.Tooltip("yearweek:N", title="Week"),
    alt.Tooltip("topic:N", title="Topic"),
    alt.Tooltip("share_pct:Q", title="Share (%)", format=".1f"),
]

# One line per topic; share_smooth is rescaled to percent inside the chart spec
chart_weekly_topics_smooth = (
    alt.Chart(df_8)
    .transform_calculate(share_pct="datum.share_smooth * 100")
    .mark_line(strokeWidth=3)
    .encode(
        x=alt.X("yearweek:N", title="Week"),
        y=alt.Y("share_pct:Q", title="Share of Speech (%)"),
        color=topic_color,
        tooltip=topic_tooltip,
    )
    .properties(
        title="Smoothed Weekly Topic Trends in Mañaneras",
        width=1200,
        height=400,
    )
)
chart_weekly_topics_smooth


### 9. What states in Mexico are mentioned the most?

In [24]:
# Mention counts per state; the chart reads the state and mentions columns
df_9 = count_state_mentions(df)

# Channels for the horizontal bar chart (states ordered by descending mentions)
mentions_x = alt.X("mentions:Q", title="Number of Mentions")
state_y = alt.Y("state:N", sort="-x", title="State")

chart_states = (
    alt.Chart(df_9)
    .mark_bar(color=color_more, cornerRadius=3)
    .encode(x=mentions_x, y=state_y, tooltip=["state:N", "mentions:Q"])
    .properties(
        title="Most Mentioned Mexican States in Mañaneras",
        width=600,
        height=400,
    )
)
chart_states


In [25]:
# Drop "Estado de Mexico" because its count dwarfs every other state and
# flattens the rest of the chart.
# TODO: still need to figure out why it is counted so much more than other states.
# Note: df_9 itself is rebound here, so any later cell reading df_9 (the map
# cell copies it) no longer sees this state.
keep_rows = df_9['state'].ne("Estado de Mexico")
df_9 = df_9.loc[keep_rows]

# Same horizontal bar chart as before, now on the filtered frame
mentions_x = alt.X("mentions:Q", title="Number of Mentions")
state_y = alt.Y("state:N", sort="-x", title="State")

chart_states = (
    alt.Chart(df_9)
    .mark_bar(color=color_more, cornerRadius=3)
    .encode(x=mentions_x, y=state_y, tooltip=["state:N", "mentions:Q"])
    .properties(
        title="Most Mentioned Mexican States in Mañaneras",
        width=600,
        height=400,
    )
)
chart_states

In [26]:
# Mexican map visualization -- Not sure if I like it
# State polygons are read from a GeoJSON hosted on GitHub, so this cell needs
# network access.
geo_url = "https://raw.githubusercontent.com/angelnmara/geojson/master/mexicoHigh.json"
mexico_gdf = gpd.read_file(geo_url)

# Build the same join key on both sides with normalize_name
# (presumably it irons out accent/case differences -- confirm in data_processing).
mexico_gdf["state_norm"] = mexico_gdf["name"].apply(normalize_name)

df_states = df_9.copy()
df_states["state_norm"] = df_states["state"].apply(normalize_name)

# Left join keeps every polygon. Polygons with no matching row in df_9 -- this
# includes any state filtered out of df_9 earlier -- get 0 mentions, and their
# "state" property stays empty.
merged = mexico_gdf.merge(df_states, on="state_norm", how="left").fillna({"mentions": 0})

# Choropleth over the GeoJSON features; feature attributes live under "properties."
chart_map = (
    alt.Chart(alt.Data(values=merged.__geo_interface__["features"]))
    .mark_geoshape(stroke="white", strokeWidth=0.5)
    .encode(
        color=alt.Color(
            "properties.mentions:Q",
            scale=alt.Scale(range=["#E1CBD3", color_president]),
            title="Presidential Mentions"
        ),
        tooltip=[
            # Use the polygon's own name: "properties.state" is empty for every
            # polygon without a match in df_9, which left the hover label blank.
            alt.Tooltip("properties.name:N", title="State"),
            alt.Tooltip("properties.mentions:Q", title="Presidential Mentions")
        ]
    )
    .properties(
        title="Mentions of Mexican States by the President",
        width=600,
        height=400
    )
    .project("mercator")
)
# Remove the frame drawn around the map view
chart_map.configure_view(stroke=None)
