<a href="https://colab.research.google.com/github/nash635/nash635/blob/master/notebooks/AlphaFold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import rcParams
import numpy as np

# Font setup for labels that may contain Chinese text: matplotlib tries the
# sans-serif families in the order listed. Disabling `axes.unicode_minus`
# makes minus signs render as a plain hyphen, which those fonts can draw.
rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'SimHei', 'Microsoft YaHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

# Model catalogue that drives the chart. One tuple per model:
#   (name, release year, release month,
#    total parameters in billions, active parameters in billions,
#    short technical note)
# Several notes are tagged "(estimated)"; treat those figures as estimates.
# NOTE(review): Kimi k1.5, MiniMax-01 and Ming-Omni all carry the same
# 456B total / 28B active figures -- confirm against published numbers.
models = [
    ("GPT-3", 2020, 6, 175, 175, "Dense Transformer"),
    ("PaLM", 2022, 4, 540, 540, "Pathways architecture"),
    ("GPT-4", 2023, 3, 1760, 1760, "Multimodal, MoE (estimated)"),
    ("LLaMA-2", 2023, 7, 70, 70, "Efficient pretraining, open source"),
    ("Claude 3", 2024, 3, 200, 200, "Constitutional AI (estimated)"),
    ("Gemini 1.5", 2024, 2, 1560, 1560, "Multimodal, 1M context (estimated)"),
    ("GPT-4o", 2024, 5, 200, 200, "Unified multimodal (estimated)"),
    ("Qwen2", 2024, 6, 72, 72, "Multilingual, scalable"),
    ("DeepSeek-V3", 2024, 12, 671, 37, "MoE, efficient training"),
    ("Qwen2.5", 2025, 1, 72, 72, "Long context + instruction tuning"),
    ("Kimi k1.5", 2025, 2, 456, 28, "RL scaling, long2short CoT"),
    ("MiniMax-01", 2025, 2, 456, 28, "Lightning Attention, 4M context"),
    ("Ming-Omni", 2025, 6, 456, 28, "Unified multimodal, MoE (estimated)"),
    ("Kimi K2", 2025, 7, 1000, 32, "MoE, Muon optimizer, agentic AI"),
    ("Ming-lite-omni v1.5", 2025, 7, 20, 3, "Enhanced video, speech synthesis"),
    ("DeepSeek-V3.1", 2025, 8, 671, 37, "Hybrid thinking mode, 128K context"),
    ("DeepSeek-V3.2-Exp", 2025, 9, 671, 37, "DSA sparse attention, 50% cost cut"),
    ("Ling-1T", 2025, 10, 1000, 100, "FP8 training, Evo-CoT (estimated)"),
    ("MiniMax M2", 2025, 10, 230, 10, "MoE, agent+code native, 2x faster"),
]

# Transpose the row-oriented table into one list per column, in tuple order.
names, years, months, total_params, active_params, features = map(
    list, zip(*models)
)

# Fractional-year x coordinate for each model; January falls exactly on the
# year boundary because the month is shifted to start at zero.
time_positions = [
    release_year + (release_month - 1) / 12
    for release_year, release_month in zip(years, months)
]

# Colour-code each model by the period in which it was released.
def _release_color(year, month):
    """Return the hex colour for a model released in `month` of `year`."""
    if year < 2025:
        return '#1f77b4'  # Blue: everything released before 2025
    if month <= 5:
        return '#d62728'  # Red: 2025, January through May
    if 6 <= month <= 8:
        return '#ff7f0e'  # Orange: 2025, June through August
    return '#2ca02c'  # Green: 2025, September onwards


colors = [_release_color(y, m) for y, m in zip(years, months)]

# Bubble chart of the model catalogue:
#   x      = release date as a fractional year
#   y      = total parameters in billions (log scale)
#   area   = active parameters
#   colour = release period
fig, ax = plt.subplots(figsize=(20, 14))

# Marker area tracks active parameters; the floor of 50 keeps the smallest
# models visible.
bubble_sizes = [max(p * 3, 50) for p in active_params]

scatter = ax.scatter(time_positions, total_params, s=bubble_sizes, c=colors,
                     edgecolors='black', alpha=0.7, zorder=3, linewidth=1.5)

# Each point gets a three-line label (name, parameter counts, technical note).
# Vertical offsets are multiplicative so they look uniform on the log y-axis.
# Models that share the exact same (x, y) coordinates would otherwise have
# their labels drawn on top of each other, so repeated coordinates alternate
# between the right and left side of the point and, from the third repeat on,
# are pushed further down. Labels of nearby-but-distinct points can still
# overlap; only exact duplicates are handled here.
label_dx = 0.08            # horizontal gap between point and label (years)
label_stack_factor = 1.8   # vertical step (log-axis factor) between stacked labels
labels_at_point = {}
for name, time_pos, total_p, active_p, feat in zip(
        names, time_positions, total_params, active_params, features):
    nth = labels_at_point.get((time_pos, total_p), 0)
    labels_at_point[(time_pos, total_p)] = nth + 1

    on_right = nth % 2 == 0
    text_x = time_pos + label_dx if on_right else time_pos - label_dx
    h_align = 'left' if on_right else 'right'
    y_scale = 1 / label_stack_factor ** (nth // 2)  # 1.0 for the first two labels

    # Model name just above the point
    ax.text(text_x, total_p * 1.15 * y_scale, name,
            va='bottom', ha=h_align, fontsize=9.5, fontweight='bold')
    # Parameter info below the name; a single figure when total == active
    param_text = f"{total_p}B total, {active_p}B active" if total_p != active_p else f"{total_p}B params"
    ax.text(text_x, total_p * 0.92 * y_scale, param_text,
            va='top', ha=h_align, fontsize=8, color='#555555', style='italic')
    # Technical features below the parameter info
    ax.text(text_x, total_p * 0.75 * y_scale, feat,
            va='top', ha=h_align, fontsize=7.5, color='#666666')

# Vertical guide line at the start of every year
for year in range(2020, 2026):
    ax.axvline(x=year, color='gray', linestyle='--', alpha=0.3, linewidth=0.8)

# Parameter counts span two orders of magnitude, hence the log y-axis
ax.set_yscale('log')

# Titles, axis labels and visible ranges
ax.set_title("LLM Architecture Evolution Timeline (2020-2025)\nBubble Size = Active Parameters",
             fontsize=18, fontweight='bold', pad=20)
ax.set_xlabel("Timeline (Year-Month)", fontsize=14, fontweight='bold')
ax.set_ylabel("Total Parameters (Billions, log scale)", fontsize=14, fontweight='bold')
ax.set_xlim(2019.8, 2026)
ax.set_ylim(15, 2500)

# X-axis ticks: one per year, plus approximate markers for the 2025 periods
ax.set_xticks([2020, 2021, 2022, 2023, 2024, 2025, 2025.5, 2025.83])
ax.set_xticklabels(['2020', '2021', '2022', '2023', '2024', '2025 Q1-Q2', '2025 Jun-Aug', '2025 Sep-Oct'],
                   rotation=15, ha='right')

# Y-axis ticks with human-readable parameter counts
ax.set_yticks([20, 50, 100, 200, 500, 1000, 2000])
ax.set_yticklabels(['20B', '50B', '100B', '200B', '500B', '1T', '2T'])

ax.grid(True, axis='both', linestyle='--', alpha=0.3)

# Legend for the release-period colours. It sits in the lower-left corner
# because the upper-left corner is occupied by the trends text box below;
# placing both there made the legend cover the text.
legend_elements = [
    mpatches.Patch(color='#1f77b4', label='2020-2024'),
    mpatches.Patch(color='#d62728', label='2025 Q1-Q2 Early (Jan-May)'),
    mpatches.Patch(color='#ff7f0e', label='2025 Mid-Year (Jun-Aug)'),
    mpatches.Patch(color='#2ca02c', label='2025 H2 (Sep-Oct)')
]
ax.legend(handles=legend_elements, loc='lower left', fontsize=11)

# Technical trends summary, anchored to the upper-left corner of the axes
trend_text = """Key Technical Trends:
• MoE sparse activation becomes mainstream
• FP8 mixed-precision training breakthrough
• Inference efficiency optimization (RL+new optimizers)
• Long context capability increase (4M tokens)
• Lightning/DSA attention mechanisms
• Unified multimodal architecture (understand+generate)
• Agentic AI as new competitive arena
• Cost efficiency revolution (10B active ≈ 230B total)"""

ax.text(0.02, 0.98, trend_text, transform=ax.transAxes,
        fontsize=9.5, bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.65),
        verticalalignment='top', linespacing=1.6)

plt.tight_layout()
plt.show()

# Text report that accompanies the chart. Every entry is one output line;
# a leading "\n" inside an entry produces the blank line that separates
# sections. Joining with newlines and printing once emits exactly the same
# text as printing each entry on its own.
report_lines = (
    "\n=== Corrected 2025 LLM Technology Innovation Timeline ===\n",
    "【IMPORTANT CORRECTIONS】",
    "• DeepSeek-V3.1: August 2025 (not January)",
    "• DeepSeek-V3.2-Exp: September 29, 2025 (not March)",
    "• Kimi K2: July 11, 2025 ✓",
    "• MiniMax M2: October 27, 2025 ✓",
    # January through May
    "\n【Q1-Q2 Early: Jan-May】Foundation Enhancement",
    "1. Qwen2.5 (Jan): Long context + instruction tuning",
    "2. Kimi k1.5 (Feb): RL scaling breakthrough, long2short CoT",
    "3. MiniMax-01 (Feb): Lightning Attention, 4M ultra-long context",
    # June through August
    "\n【Mid-Year: Jun-Aug】Multimodal + Agentic Breakthrough",
    "4. Ming-Omni (Jun): Unified multimodal (perception+generation)",
    "5. Kimi K2 (Jul 11):",
    "   - 1T total params, 32B active MoE architecture",
    "   - MuonClip optimizer: ultra-large-scale training stability",
    "   - Designed for Agentic AI",
    "   - 15.5T tokens, zero loss spike",
    "6. Ming-lite-omni v1.5 (Jul): 20.3B total, 3B active",
    "7. DeepSeek-V3.1 (Aug 21):",
    "   - Hybrid thinking mode: thinking + non-thinking",
    "   - 128K context window",
    "   - Enhanced tool calling and agent capabilities",
    # September and October
    "\n【H2: Sep-Oct】Trillion-Parameter + Extreme Efficiency",
    "8. DeepSeek-V3.2-Exp (Sep 29):",
    "   - DSA (DeepSeek Sparse Attention)",
    "   - 50% cost reduction vs V3.1",
    "   - Experimental model for next-gen architecture",
    "9. Ling-1T (Oct 9): 1T params, FP8 training, Evo-CoT",
    "10. MiniMax M2 (Oct 27):",
    "    - MoE 230B total, only 10B active",
    "    - 2x speed of Claude Sonnet 4.5",
    "    - Cost only 8%",
    "    - 205K context window",
    "    - #1 open-source model on Artificial Analysis",
    # Cross-cutting summary
    "\n【Core Technology Evolution】",
    "✓ Jun: Multimodal unification (understand+generate)",
    "✓ Jul: Agentic AI becomes standard, new optimizers break training bottlenecks",
    "✓ Aug: Hybrid thinking modes mature",
    "✓ Sep-Oct: Trillion-parameter + FP8 training mature, revolutionary cost efficiency",
    "✓ Full Year: MoE sparse activation, long context, code gen continue to evolve",
    "✓ Trend: From dialogue to action, from uni-modal to omni-modal, from capability to efficiency",
)
print("\n".join(report_lines))