# 01 - 数据探索 (Data Exploration)

本 Notebook 用于探索和分析掘金量化平台的行情数据。

内容包括：
1. 数据获取与基础统计
2. 价格走势可视化
3. 收益率分布分析
4. 波动率分析
5. 多股票对比与相关性分析

## 1. 环境准备

In [None]:
# 导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Apply the chart style FIRST. The seaborn style sheets define their own
# `font.sans-serif` list, so applying the style after the font override
# would replace the CJK-capable fonts configured below and Chinese labels
# would render as empty boxes.
plt.style.use('seaborn-v0_8-whitegrid')

# Fonts able to render Chinese text in titles, labels and legends.
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
# Draw the minus sign as an ASCII hyphen; the fonts above may not provide
# a glyph for the Unicode minus character.
matplotlib.rcParams['axes.unicode_minus'] = False

# GoldMiner (掘金) SDK
from gm.api import *

print("Environment ready!")

In [None]:
# Configure the SDK token.
# The GM_TOKEN environment variable is preferred so that a real token never
# has to be saved inside the notebook; when it is not set, the placeholder
# below is used (replace it with your own token).
import os

TOKEN = os.environ.get('GM_TOKEN', 'your_token_here')
set_token(TOKEN)

print("Token set successfully!")

## 2. 数据获取

In [None]:
# Fetch daily bars for a single stock
symbol = 'SHSE.600000'  # Shanghai Pudong Development Bank
start_date = '2023-01-01'
end_date = '2023-12-31'

df = history(
    symbol=symbol,
    frequency='1d',
    start_time=start_date,
    end_time=end_date,
    # `eob` (bar end time) is requested in addition to the price/volume
    # fields so the rows can be indexed by time instead of by row number.
    fields='eob,open,high,low,close,volume,amount',
    adjust=ADJUST_PREV,
    df=True
)

# Fail early with a clear message; the cells below index into the first and
# last rows and would otherwise stop with an opaque IndexError.
if df is None or len(df) == 0:
    raise ValueError(
        f"No data returned for {symbol} ({start_date} ~ {end_date})"
    )

# Use the bar timestamp as the index so that `df.index` is a real time axis
# for the "time range" printout and for every chart's x-axis.
df = df.set_index('eob').sort_index()

print(f"获取 {symbol} 数据: {len(df)} 条")
df.head()

In [None]:
# Basic descriptive statistics of the downloaded bars
banner = "=" * 50
print(banner)
print("数据基础信息")
print(banner)

# Covered range and number of rows
first_label, last_label = df.index[0], df.index[-1]
print(f"\n时间范围: {first_label} 至 {last_label}")
print(f"交易日数: {len(df)}")

# Price extremes and the first / last close
close_px = df['close']
first_close = close_px.iloc[0]
last_close = close_px.iloc[-1]
print("\n价格范围:")
print(f"  最高价: {df['high'].max():.2f}")
print(f"  最低价: {df['low'].min():.2f}")
print(f"  起始价: {first_close:.2f}")
print(f"  结束价: {last_close:.2f}")

# Return over the whole period, in percent (last close vs. first close)
period_return_pct = (last_close/first_close - 1)*100
print(f"\n区间收益: {period_return_pct:.2f}%")

# Average daily turnover
avg_volume = df['volume'].mean()
avg_amount = df['amount'].mean()
print(f"\n日均成交量: {avg_volume:,.0f}")
print(f"日均成交额: {avg_amount:,.0f}")

## 3. 价格走势可视化

In [None]:
# Two stacked panels sharing the x-axis: price (3/4 of the height) on top,
# volume (1/4) below.
fig, axes = plt.subplots(
    2, 1,
    figsize=(14, 8),
    sharex=True,
    gridspec_kw={'height_ratios': [3, 1]},
)
ax1, ax2 = axes

# Top panel: closing price line over a shaded low-high band
ax1.plot(df.index, df['close'], label='收盘价', color='blue', linewidth=1.5)
ax1.fill_between(df.index, df['low'], df['high'], alpha=0.2, color='blue')
ax1.set_ylabel('价格', fontsize=12)
ax1.set_title(f'{symbol} 价格走势 ({start_date} ~ {end_date})', fontsize=14)
ax1.legend(loc='upper left')
ax1.grid(True, alpha=0.3)

# Bottom panel: volume bars, red when close >= open, green otherwise
colors = [
    'red' if close_px >= open_px else 'green'
    for open_px, close_px in zip(df['open'], df['close'])
]
ax2.bar(df.index, df['volume'], color=colors, alpha=0.7)
ax2.set_ylabel('成交量', fontsize=12)
ax2.set_xlabel('日期', fontsize=12)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. 收益率分析

In [None]:
# Daily simple and log returns, stored as new columns on df
close_px = df['close']
df['returns'] = close_px.pct_change()
df['log_returns'] = np.log(close_px / close_px.shift(1))

# Work on the simple returns without NaN entries (the first row has no
# previous close to compare against).
returns = df['returns'].dropna()
mean_ret = returns.mean()
std_ret = returns.std()

banner = "=" * 50
print(banner)
print("收益率统计")
print(banner)

# Daily figures, then annualised with 252 trading days per year
print(f"\n日均收益率: {mean_ret*100:.4f}%")
print(f"日收益率标准差: {std_ret*100:.4f}%")
print(f"年化收益率: {mean_ret*252*100:.2f}%")
print(f"年化波动率: {std_ret*np.sqrt(252)*100:.2f}%")

# Best and worst single day
print(f"\n最大日涨幅: {returns.max()*100:.2f}%")
print(f"最大日跌幅: {returns.min()*100:.2f}%")

# Shape of the distribution (pandas reports kurtosis relative to a normal
# distribution, i.e. a normal sample gives roughly 0).
print(f"\n偏度: {returns.skew():.4f}")
print(f"峰度: {returns.kurtosis():.4f}")

In [None]:
# Return distribution: histogram with a fitted normal curve, plus a Q-Q plot
from scipy import stats

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: density histogram of daily returns
ax1.hist(returns, bins=50, color='steelblue', edgecolor='white', alpha=0.7, density=True)

# Overlay a normal density using the sample mean and standard deviation,
# evaluated on 100 points spanning the observed return range.
mu = returns.mean()
sigma = returns.std()
x = np.linspace(returns.min(), returns.max(), 100)
normal_density = stats.norm.pdf(x, mu, sigma)
ax1.plot(x, normal_density, 'r-', linewidth=2, label='正态分布')

# Vertical reference line at a zero return
ax1.axvline(x=0, color='black', linestyle='--', alpha=0.5)
ax1.set_xlabel('日收益率', fontsize=12)
ax1.set_ylabel('密度', fontsize=12)
ax1.set_title('收益率分布', fontsize=14)
ax1.legend()

# Right panel: Q-Q plot of the returns against a normal distribution
stats.probplot(returns, dist="norm", plot=ax2)
ax2.set_title('Q-Q 图', fontsize=14)

plt.tight_layout()
plt.show()

## 5. 波动率分析

In [None]:
# Rolling volatility: standard deviation of daily returns over each window,
# annualised with sqrt(252). Results are stored as `volatility_<window>`.
annualisation = np.sqrt(252)
window_colors = ((20, 'blue'), (60, 'red'))

for window, _ in window_colors:
    rolling_std = df['returns'].rolling(window=window).std()
    df[f'volatility_{window}'] = rolling_std * annualisation

# Plot both series, then a dashed horizontal line at each series' mean.
# The two passes keep the drawing order: both curves first, then both
# mean lines.
fig, ax = plt.subplots(figsize=(14, 5))

for window, color in window_colors:
    ax.plot(df.index, df[f'volatility_{window}'],
            label=f'{window}日波动率', color=color)
for window, color in window_colors:
    ax.axhline(y=df[f'volatility_{window}'].mean(),
               color=color, linestyle='--', alpha=0.5)

ax.set_ylabel('年化波动率', fontsize=12)
ax.set_xlabel('日期', fontsize=12)
ax.set_title('滚动波动率', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Average level of each rolling series, in percent
for window, _ in window_colors:
    print(f"平均{window}日波动率: {df[f'volatility_{window}'].mean()*100:.2f}%")

## 6. 多股票对比分析

In [None]:
# Fetch daily closes for several stocks
symbols = [
    'SHSE.600000',  # Shanghai Pudong Development Bank
    'SHSE.600036',  # China Merchants Bank
    'SHSE.601318',  # Ping An Insurance
]

dfs = {}
for sym in symbols:
    data = history(
        symbol=sym,
        frequency='1d',
        start_time=start_date,
        end_time=end_date,
        # `eob` (bar end time) is requested so each series can be indexed
        # by time rather than by row number.
        fields='eob,close',
        adjust=ADJUST_PREV,
        df=True
    )
    if data is None or len(data) == 0:
        # Report the skipped symbol instead of dropping it silently.
        print(f"未获取到 {sym} 数据，已跳过")
        continue
    dfs[sym] = data.set_index('eob')['close']
    print(f"获取 {sym}: {len(data)} 条")

# Combine into one table, one column per symbol. Because every series is
# indexed by timestamp, rows are aligned by date; a symbol with no bar on a
# given date gets NaN there instead of shifting its later prices onto the
# wrong rows.
combined = pd.DataFrame(dfs).sort_index()
combined.head()

In [None]:
# Normalised price comparison: rescale every series so it starts at 100.
# The base is each column's first *available* price (back-filling only to
# pick that value), so a symbol with no price in the very first row is still
# normalised instead of turning into an all-NaN column.
base_prices = combined.bfill().iloc[0]
normalized = combined / base_prices * 100

fig, ax = plt.subplots(figsize=(14, 6))

# One line per symbol
for col in normalized.columns:
    ax.plot(normalized.index, normalized[col], label=col, linewidth=1.5)

# Reference line at the common starting level
ax.axhline(y=100, color='black', linestyle='--', alpha=0.5)
ax.set_ylabel('归一化价格 (起点=100)', fontsize=12)
ax.set_xlabel('日期', fontsize=12)
ax.set_title('多股票价格走势对比', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Correlation of daily returns across symbols.
# `fill_method=None` stops pct_change from forward-filling missing prices;
# forward-filling would invent 0% returns on days a symbol has no price and
# distort the correlations. Rows where any symbol lacks a return are dropped.
returns_df = combined.pct_change(fill_method=None).dropna()
corr_matrix = returns_df.corr()

print("\n相关性矩阵:")
print(corr_matrix.round(4))

# Heat map on a fixed [-1, 1] colour scale
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(corr_matrix, cmap='RdYlGn', vmin=-1, vmax=1)

# Label the ticks with the code part of each symbol (text after the '.')
tick_labels = [s.split('.')[1] for s in corr_matrix.columns]
tick_positions = range(len(tick_labels))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(tick_labels, rotation=45)
ax.set_yticklabels(tick_labels)

# Write each coefficient in its cell (x is the column, y is the row)
for (i, j), value in np.ndenumerate(corr_matrix.to_numpy()):
    ax.text(j, i, f'{value:.2f}', ha='center', va='center', color='black')

ax.set_title('收益率相关性热力图', fontsize=14)
plt.colorbar(im)
plt.tight_layout()
plt.show()

## 7. 小结

本 Notebook 展示了基础的数据探索方法：

1. **数据获取**: 使用 `history()` 函数获取历史行情数据
2. **基础统计**: 价格范围、收益率、成交量等
3. **可视化**: 价格走势、成交量、收益率分布
4. **波动率分析**: 滚动波动率计算和可视化
5. **多股票分析**: 价格对比、相关性矩阵

**下一步**: 前往 `02_indicator_lab.ipynb` 学习技术指标的使用。