In [1]:
import numpy as np
import pandas as pd

## 时间序列

### 时间戳

- pd.date_range()

    注意点：start, end, periods, freq
    
    这四个参数中，必须指定其中三个，另一个参数会根据指定的参数自动推导。
    
    如果省略 freq 参数（即同时指定 start、end 和 periods），生成的日期会在 start 和 end 之间等间隔（线性）分布，且包含 start 和 end 两个端点。

- pd.period_range()

    注意点：start, end, periods
    
    这三个参数中，必须指定其中两个，另一个参数会根据指定的参数自动推导。
    

In [2]:
# Creating time values.
# A single point in time:
pd.Timestamp('2025-03-20')
# A time span. `freq` selects the resolution the period is expressed at:
# 'Y' -> year, 'M' -> month, 'D' -> day.
pd.Period('2025-03-20', freq='Y')

# Generate several timestamps in one call: periods=4 asks for four values,
# one per day, starting at `start`.
index = pd.date_range(start='2025-03-01', periods=4, freq='D')
# Daily periods from `start` through `end`.
index2 = pd.period_range(start='2025-3-01', end='2025-03-20', freq='D')
index2

# Use the timestamps as the index of a Series of random integers in [0, 10).
pd.Series(
    np.random.randint(0, 10, size=4),
    index=index,
)

2025-03-01    3
2025-03-02    8
2025-03-03    3
2025-03-04    7
Freq: D, dtype: int32

In [3]:
# Conversion helpers.
# When the input strings do not share one layout, pass format='mixed' so the
# format is inferred for each element individually.
pd.to_datetime(
    ['2030.03.14', '2030-3-14', '14/03/2030', '2030/3/14'],
    format='mixed',
)
# Epoch timestamp given in seconds -> datetime.
pd.to_datetime([1898675423], unit='s')
# The same instant given in milliseconds -> datetime.
dt = pd.to_datetime([1898675423000], unit='ms')
display(dt)
# Time arithmetic with pd.DateOffset: a bare DateOffset() moves by one day,
# keyword arguments choose a different amount.
dt + pd.DateOffset(hours=8)   # 8 hours later
dt + pd.DateOffset(days=8)    # 8 days later
dt - pd.DateOffset(hours=8)   # 8 hours earlier
dt + pd.DateOffset(days=-8)   # 8 days earlier

DatetimeIndex(['2030-03-02 09:50:23'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2030-02-22 09:50:23'], dtype='datetime64[ns]', freq=None)

In [4]:
# Indexing and slicing by timestamp.
# Start with 100 consecutive daily timestamps beginning on 2030-03-14.
index = pd.date_range(start='2030-03-14', periods=100, freq='D')
index

DatetimeIndex(['2030-03-14', '2030-03-15', '2030-03-16', '2030-03-17',
               '2030-03-18', '2030-03-19', '2030-03-20', '2030-03-21',
               '2030-03-22', '2030-03-23', '2030-03-24', '2030-03-25',
               '2030-03-26', '2030-03-27', '2030-03-28', '2030-03-29',
               '2030-03-30', '2030-03-31', '2030-04-01', '2030-04-02',
               '2030-04-03', '2030-04-04', '2030-04-05', '2030-04-06',
               '2030-04-07', '2030-04-08', '2030-04-09', '2030-04-10',
               '2030-04-11', '2030-04-12', '2030-04-13', '2030-04-14',
               '2030-04-15', '2030-04-16', '2030-04-17', '2030-04-18',
               '2030-04-19', '2030-04-20', '2030-04-21', '2030-04-22',
               '2030-04-23', '2030-04-24', '2030-04-25', '2030-04-26',
               '2030-04-27', '2030-04-28', '2030-04-29', '2030-04-30',
               '2030-05-01', '2030-05-02', '2030-05-03', '2030-05-04',
               '2030-05-05', '2030-05-06', '2030-05-07', '2030-05-08',
      

In [5]:
# Series labelled by `index`, whose values are the positions 0 .. len(index) - 1.
ts = pd.Series(data=range(index.size), index=index)
ts

2030-03-14     0
2030-03-15     1
2030-03-16     2
2030-03-17     3
2030-03-18     4
              ..
2030-06-17    95
2030-06-18    96
2030-06-19    97
2030-06-20    98
2030-06-21    99
Freq: D, Length: 100, dtype: int64

In [47]:
# Label lookup with date strings; a partial date selects everything it covers.
ts.loc['2030-03-15']  # the value for one day
ts.loc['2030-03']     # all values in March 2030
ts.loc['2030']        # all values in the year 2030

2030-03-14     0
2030-03-15     1
2030-03-16     2
2030-03-17     3
2030-03-18     4
              ..
2030-06-17    95
2030-06-18    96
2030-06-19    97
2030-06-20    98
2030-06-21    99
Freq: D, Length: 100, dtype: int64

In [48]:
# Slice between two date strings; the result includes both end dates.
ts.loc['2030-03-15':'2030-03-22']

2030-03-15    1
2030-03-16    2
2030-03-17    3
2030-03-18    4
2030-03-19    5
2030-03-20    6
2030-03-21    7
2030-03-22    8
Freq: D, dtype: int64

In [51]:
# Index with a pd.Timestamp object instead of a string.
stamp = pd.Timestamp('2030-03-22')
display(stamp)
ts.loc[stamp]
# Timestamp objects work as slice bounds as well.
ts.loc[pd.Timestamp('2030-03-15'):pd.Timestamp('2030-03-22')]

Timestamp('2030-03-22 00:00:00')

2030-03-15    1
2030-03-16    2
2030-03-17    3
2030-03-18    4
2030-03-19    5
2030-03-20    6
2030-03-21    7
2030-03-22    8
Freq: D, dtype: int64

In [52]:
# Select several labels at once by passing the DatetimeIndex that
# date_range() builds (10 consecutive days starting 2030-03-24).
ts.loc[pd.date_range(start='2030-03-24', periods=10, freq='D')]

2030-03-24    10
2030-03-25    11
2030-03-26    12
2030-03-27    13
2030-03-28    14
2030-03-29    15
2030-03-30    16
2030-03-31    17
2030-04-01    18
2030-04-02    19
Freq: D, dtype: int64

#### 常用属性

In [59]:
# Date components exposed by the DatetimeIndex.
ts.index  # the DatetimeIndex itself
ts.index.year  # calendar year of each timestamp
ts.index.month  # month number of each timestamp
ts.index.day  # day of the month of each timestamp
ts.index.dayofweek # day of the week as an integer, Monday=0 ... Sunday=6

Index([3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5,
       6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1,
       2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4,
       5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0,
       1, 2, 3, 4],
      dtype='int32')

#### 时间序列常用方法

- 对时间做一些移动/滞后、频率转换、采样等相关操作

In [6]:
# 365 consecutive daily timestamps starting 2030-03-01, each paired with a
# random integer in [0, 500). No seed is set in this cell, so the values
# differ from run to run.
index = pd.date_range(start='2030-3-1', periods=365, freq='D')
ts = pd.Series(
    np.random.randint(0, 500, size=len(index)),
    index=index,
)
ts

2030-03-01    202
2030-03-02    331
2030-03-03    206
2030-03-04      5
2030-03-05    283
             ... 
2031-02-24    437
2031-02-25    325
2031-02-26    479
2031-02-27     34
2031-02-28    386
Freq: D, Length: 365, dtype: int32

In [8]:
# shift() moves the values along the index while the index stays in place;
# positions left without a value become NaN.
ts.shift(periods=1)   # one position later (the default)
ts.shift(periods=2)   # two positions later
ts.shift(periods=-2)  # two positions earlier

2030-03-01    206.0
2030-03-02      5.0
2030-03-03    283.0
2030-03-04    451.0
2030-03-05    346.0
              ...  
2031-02-24    479.0
2031-02-25     34.0
2031-02-26    386.0
2031-02-27      NaN
2031-02-28      NaN
Freq: D, Length: 365, dtype: float64

In [11]:
# Frequency conversion with asfreq(): the series is conformed to a new index
# at the requested frequency.
ts.asfreq(pd.offsets.Week())      # weekly frequency
ts.asfreq(pd.offsets.MonthEnd())  # last day of each month
# Converting to a finer frequency adds timestamps that have no source value
# (everything that is not 00:00:00 here). They are NaN by default; fill_value
# sets the value used for them instead.
ts.asfreq(pd.offsets.Hour(), fill_value=0)  # hourly frequency

2030-03-01 00:00:00    202
2030-03-01 01:00:00      0
2030-03-01 02:00:00      0
2030-03-01 03:00:00      0
2030-03-01 04:00:00      0
                      ... 
2031-02-27 20:00:00      0
2031-02-27 21:00:00      0
2031-02-27 22:00:00      0
2031-02-27 23:00:00      0
2031-02-28 00:00:00    386
Freq: h, Length: 8737, dtype: int32

#### resample：根据日期维度进行数据聚合
- 可按秒（s）、分钟（min）、小时（h）、日（D）、周（W）、月末（ME）、年末（YE）等频率作为聚合的时间粒度

In [12]:
# Resampling: show the daily series that the following cells aggregate
ts

2030-03-01    202
2030-03-02    331
2030-03-03    206
2030-03-04      5
2030-03-05    283
             ... 
2031-02-24    437
2031-02-25    325
2031-02-26    479
2031-02-27     34
2031-02-28    386
Freq: D, Length: 365, dtype: int32

In [28]:
# resample() groups the series into time bins; .sum() then adds up each bin.
# Only the last statement's result is rendered as the cell output.
display(ts.resample('D'))  # shows the Resampler object itself, before any aggregation
ts.resample('2D').sum() # aggregate in 2-day bins
ts.resample('W').sum() # aggregate by week
ts.resample('3ME').sum() # aggregate in 3-month (quarter-length) bins
ts.resample('min').sum() # aggregate by minute
ts.resample('h').sum() # aggregate by hour
ts.resample('YE').sum() # aggregate by year
ts.resample('s').sum() # aggregate by second; on this one-year series the result has 31,449,601 rows

<pandas.core.resample.DatetimeIndexResampler object at 0x000002B308CA21B0>

2030-03-01 00:00:00    202
2030-03-01 00:00:01      0
2030-03-01 00:00:02      0
2030-03-01 00:00:03      0
2030-03-01 00:00:04      0
                      ... 
2031-02-27 23:59:56      0
2031-02-27 23:59:57      0
2031-02-27 23:59:58      0
2031-02-27 23:59:59      0
2031-02-28 00:00:00    386
Freq: s, Length: 31449601, dtype: int32

In [30]:
# Resampling a DataFrame: build a small frame with two numeric columns and a
# 'week' column holding 8 weekly dates generated from 2030-3-1.
d = {
    'price': [10, 11, 2, 44, 33, 44, 55, 66],
    'score': [40, 30, 20, 50, 60, 70, 80, 10],
    'week': pd.date_range(start='2030-3-1', periods=8, freq='W'),
}
df = pd.DataFrame(d)
df

Unnamed: 0,price,score,week
0,10,40,2030-03-03
1,11,30,2030-03-10
2,2,20,2030-03-17
3,44,50,2030-03-24
4,33,60,2030-03-31
5,44,70,2030-04-07
6,55,80,2030-04-14
7,66,10,2030-04-21


In [38]:
# Group the rows into month-end bins, using the 'week' column as the time axis.
by_month = df.resample('ME', on='week')
display(by_month)
# Monthly totals, written two equivalent ways.
by_month.sum()
by_month.apply("sum")
# Different aggregation per column: mean of 'price', sum of 'score'.
by_month.agg({'price': 'mean', 'score': 'sum'})

<pandas.core.resample.DatetimeIndexResampler object at 0x000002B31A90C9E0>

Unnamed: 0_level_0,price,score
week,Unnamed: 1_level_1,Unnamed: 2_level_1
2030-03-31,20.0,200
2030-04-30,55.0,160


### 时区

In [41]:
# Three daily timestamps without time-zone information, paired with samples
# drawn from the standard normal distribution (np.random.randn).
index = pd.date_range(start='2030-3-1 00:00', periods=3, freq='D')
ts = pd.Series(
    np.random.randn(len(index)),
    index=index,
)
ts

2030-03-01    0.082736
2030-03-02    0.607451
2030-03-03   -0.261791
Freq: D, dtype: float64

In [42]:
# 导包：tz：timezone（时区）
import pytz

In [44]:
# Names of the commonly used time zones known to pytz
pytz.common_timezones

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara',
 'Africa/Bamako',
 'Africa/Bangui',
 'Africa/Banjul',
 'Africa/Bissau',
 'Africa/Blantyre',
 'Africa/Brazzaville',
 'Africa/Bujumbura',
 'Africa/Cairo',
 'Africa/Casablanca',
 'Africa/Ceuta',
 'Africa/Conakry',
 'Africa/Dakar',
 'Africa/Dar_es_Salaam',
 'Africa/Djibouti',
 'Africa/Douala',
 'Africa/El_Aaiun',
 'Africa/Freetown',
 'Africa/Gaborone',
 'Africa/Harare',
 'Africa/Johannesburg',
 'Africa/Juba',
 'Africa/Kampala',
 'Africa/Khartoum',
 'Africa/Kigali',
 'Africa/Kinshasa',
 'Africa/Lagos',
 'Africa/Libreville',
 'Africa/Lome',
 'Africa/Luanda',
 'Africa/Lubumbashi',
 'Africa/Lusaka',
 'Africa/Malabo',
 'Africa/Maputo',
 'Africa/Maseru',
 'Africa/Mbabane',
 'Africa/Mogadishu',
 'Africa/Monrovia',
 'Africa/Nairobi',
 'Africa/Ndjamena',
 'Africa/Niamey',
 'Africa/Nouakchott',
 'Africa/Ouagadougou',
 'Africa/Porto-Novo',
 'Africa/Sao_Tome',
 'Africa/Tripoli',
 'Africa/Tunis',
 'Africa/Wi

In [45]:
# Attach a time zone: tz_localize labels the timestamps as UTC
# (the clock values stay the same; an offset of +00:00 is added).
tstz = ts.tz_localize('UTC')
tstz

2030-03-01 00:00:00+00:00    0.082736
2030-03-02 00:00:00+00:00    0.607451
2030-03-03 00:00:00+00:00   -0.261791
Freq: D, dtype: float64

In [46]:
# Convert the UTC-labelled series to Shanghai time (UTC+08:00); the same
# instants are shown with clock values eight hours later.
tstz.tz_convert('Asia/Shanghai')

2030-03-01 08:00:00+08:00    0.082736
2030-03-02 08:00:00+08:00    0.607451
2030-03-03 08:00:00+08:00   -0.261791
Freq: D, dtype: float64