在该notebook中，我们定义一些函数，搭建一些OOP架构，以及完成一些数据预处理。

In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import pmdarima as pm
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier

pd.options.mode.chained_assignment = None

In [None]:
def XOX(s, p):
    '''
    Convert a series of levels into period-over-period growth rates.

    s : array of levels
    p : look-back period (positive integer)

    Returns a float array with the same length as s. Element i (for i >= p)
    is (s[i] - s[i-p]) / s[i-p]; the first p elements are NaN. When p is at
    least len(s) there is nothing to compare against, so every element is NaN.
    Raises ValueError if p is not positive.
    '''
    s = np.asarray(s, dtype=float)
    if p <= 0:
        raise ValueError('look-back period p must be a positive integer')
    if p >= len(s):
        # Keep the output aligned with the input instead of returning p NaNs.
        return np.full(len(s), np.nan)
    base = s[:-p]
    return np.append(np.full(p, np.nan), (s[p:] - base) / base)

def YOY(s):
    '''
    Year-over-year growth rate of a monthly series.
    s : array of monthly data
    '''
    months_per_year = 12
    return XOX(s, months_per_year)

def MOM(s):
    '''
    Month-over-month growth rate of a monthly series.
    s : array of monthly data
    '''
    one_month = 1
    return XOX(s, one_month)

def DOD(s):
    '''
    Day-over-day growth rate of a daily series.
    s : array of daily data
    '''
    one_day = 1
    return XOX(s, one_day)

In [None]:
def lagging(s, l):
    '''
    Shift a time series backwards by l steps ( e.g. [1,2,3] -> [NA,1,2] ).

    s : array
    l : lagging period (non-negative integer)

    Returns a float array with the same length as s: l leading NaNs followed
    by the first len(s) - l values of s. l == 0 returns a copy of s, and
    l >= len(s) returns all NaN. Raises ValueError if l is negative.
    '''
    s = np.asarray(s, dtype=float)
    if l < 0:
        raise ValueError('lagging period l must be non-negative')
    if l == 0:
        # s[:-0] would be empty, so the zero-lag case is handled explicitly.
        return s.copy()
    if l >= len(s):
        return np.full(len(s), np.nan)
    return np.append(np.full(l, np.nan), s[:-l])

In [None]:
def to_pm(s):
    '''
    Extract the sign of each element ( e.g. [-3, 2, -2, 3] -> [-1, 1, -1, 1] ).

    s : array

    Returns a float array of the same shape as s holding 1 where the element
    is positive, -1 where it is negative and 0 otherwise. NaN compares false
    against 0 in both directions, so NaN entries map to 0.
    '''
    s = np.asarray(s)
    return np.where(s > 0, 1.0, np.where(s < 0, -1.0, 0.0))

In [None]:
def pearson(s1, s2):
    '''
    Pearson correlation coefficient between s1 and s2.

    Positions where either series is NaN are dropped from both before the
    coefficient is computed. Only the coefficient is returned; the p-value
    from scipy's pearsonr is discarded.
    '''
    # Import the submodule explicitly: `import scipy` alone does not
    # guarantee that `scipy.stats` has been loaded.
    from scipy import stats

    s1 = np.asarray(s1, dtype=float)
    s2 = np.asarray(s2, dtype=float)
    valid = ~(np.isnan(s1) | np.isnan(s2))
    return stats.pearsonr(s1[valid], s2[valid])[0]

In [None]:
def HP(series, lamb = 1000):
    '''
    Hodrick-Prescott filter.

    Casts the series to float, fills missing values with interpolate(), runs
    statsmodels' hpfilter with smoothing parameter lamb, and returns the
    trend component (the cycle component is discarded).
    '''
    filled = series.astype(float).interpolate()
    components = sm.tsa.filters.hpfilter(filled, lamb = lamb)
    # hpfilter returns (cycle, trend); only the trend is wanted here.
    return components[1]

In [None]:
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code; only load this file from a trusted source.
quotation = pd.read_pickle('fut_quotation.dat')  # load the commodity quotation data file

class CommodityQuotation:

    '''
    Price data for one commodity, built from the module-level `quotation` table.

    Attributes set by the constructor:
    code        : the product code the instance was built for
    daily       : daily DataFrame (columns renamed to Open/Close/High/Low/Volume)
    monthly     : monthly DataFrame holding the HP-filtered open price (Open_filter)
    monthly_mom : monthly DataFrame of month-over-month growth (Open_mom)
    monthly_yoy : monthly DataFrame of year-over-year growth (Open_yoy)
    '''

    def __init__(self, code, year_range = None):

        '''
        code : commodity code abbreviation ( e.g. "RB" )
        year_range : inclusive year interval to study ( e.g. [2011, 2021] );
                     None keeps every year
        '''

        self.code = code

        # self.daily : daily DataFrame
        # .copy() makes the selection an independent frame, so the column
        # assignments below never write through to `quotation`.
        daily = quotation[quotation.product_code == code].copy()
        daily['year'] = daily['trade_date'].dt.year
        if year_range is not None:
            daily = daily[(daily['year'] >= year_range[0])
                          & (daily['year'] <= year_range[1])]
        daily = daily.drop(columns=['pre_close', 'oi', 'oi_chg',
                                    'amount', 'product_code'])
        daily = daily.rename(columns={'open':'Open', 'close':'Close',
                                      'high':'High', 'low':'Low', 'vol':'Volume'})
        daily['Volume'] = daily.Volume.astype(int)
        daily.index = range(len(daily.index))
        self.daily = daily

        # self.monthly : monthly DataFrame
        monthly = self.daily.copy()
        monthly['Open_filter'] = HP(self.daily.Open)
        monthly['day'] = monthly['trade_date'].dt.day
        monthly['month'] = monthly['trade_date'].dt.month
        # After smoothing, keep the last value of each (year, month) group.
        monthly = monthly.groupby(['year', 'month']).last().reset_index()
        # Drop months whose last record falls before the 20th
        # (presumably incomplete months -- confirm with the data owner).
        monthly = monthly[~(monthly['day'] < 20)]
        monthly = monthly.drop(columns=['Open', 'High', 'Low', 'Close',
                                        'Volume', 'day'])
        monthly['Open_filter'] = monthly.Open_filter.astype(int)
        self.monthly = monthly

        # self.monthly_mom : month-over-month DataFrame
        self.monthly_mom = self._growth_frame(MOM, 'Open_mom')

        # self.monthly_yoy : year-over-year DataFrame
        self.monthly_yoy = self._growth_frame(YOY, 'Open_yoy')

    def _growth_frame(self, transform, column):
        '''
        Copy self.monthly, replace Open_filter with transform(Open_filter)
        stored under `column`, and drop rows containing NaN.
        '''
        frame = self.monthly.copy()
        frame[column] = transform(frame.Open_filter.values)
        frame = frame.drop(columns=['Open_filter'])
        frame.dropna(inplace = True)
        return frame

    def _plot_line(self, x, y, title_suffix, ylabel):
        '''
        Draw one 12x4 line chart titled self.code + title_suffix and show it.
        '''
        plt.figure(figsize=(12, 4))
        plt.plot(x, y)
        plt.title(self.code + title_suffix)
        plt.xlabel('Trade date')
        plt.ylabel(ylabel)
        plt.grid()
        plt.show()

    def plot_daily(self):
        '''
        Plot the daily open price.
        '''
        self._plot_line(self.daily.trade_date, self.daily.Open,
                        ' Daily', 'Open price')

    def plot_monthly(self):
        '''
        Plot the monthly filtered open price.
        '''
        self._plot_line(self.monthly.trade_date, self.monthly.Open_filter,
                        ' Monthly', 'Open price')

    def plot_monthly_mom(self):
        '''
        Plot the monthly month-over-month growth.
        '''
        self._plot_line(self.monthly_mom.trade_date, self.monthly_mom.Open_mom,
                        ' Monthly Month-Over-Month', 'Open price Change')

    def plot_monthly_yoy(self):
        '''
        Plot the monthly year-over-year growth.
        '''
        self._plot_line(self.monthly_yoy.trade_date, self.monthly_yoy.Open_yoy,
                        ' Monthly Year-Over-Year', 'Open price Change')

In [None]:
class Macro:

    '''
    Loader for one macro-data Excel sheet.

    period : monthly PeriodIndex from January of the first year to December
             of the last year in year_range
    data   : the sheet with its first data row and first column removed, a
             `date` column parsed from that first column, derived `year` and
             `month` columns, restricted to years inside year_range
    '''

    def __init__(self, filename, year_range):

        first_year = year_range[0]
        last_year = year_range[1]
        self.period = pd.period_range(str(first_year)+'-1', str(last_year)+'-12', freq='M')

        # Read the macro data file
        path = 'MacroData/' + filename + '.xls'
        sheet = pd.read_excel(path)

        # Skip the first data row, then turn the first column into `date`
        sheet = sheet.iloc[1:,:]
        sheet['date'] = pd.to_datetime(sheet.iloc[:,0])
        sheet = sheet.iloc[:,1:]

        sheet['year'] = sheet.date.dt.year
        sheet['month'] = sheet.date.dt.month

        not_too_early = sheet['year'] >= first_year
        not_too_late = sheet['year'] <= last_year
        self.data = sheet[not_too_early & not_too_late]

In [None]:
# Rebar (RB) futures data
# NOTE(review): `year_range` is not defined in the visible part of this file;
# presumably it is supplied by whatever runs this notebook -- confirm.
RB = CommodityQuotation('RB', year_range)

# Coke (J) futures data
J = CommodityQuotation('J', year_range)

In [None]:
# Rebar inventory data, averaged within each (year, month)
StorageSH = Macro('库存', year_range)
StorageSH.data['库存:螺纹钢(含上海全部仓库)'] = StorageSH.data['库存:螺纹钢(含上海全部仓库)'].astype(float)
# numeric_only=True restricts the monthly mean to numeric columns, so the
# datetime `date` column and any non-numeric columns are left out on every
# pandas version instead of leaking into the later merges.
StorageSH.data = StorageSH.data.groupby(['year', 'month']).mean(numeric_only=True).reset_index()
StorageSH.data['库存'] = StorageSH.data['库存:螺纹钢(含上海全部仓库)'].values
StorageSH.data = StorageSH.data.drop(columns = ['库存:螺纹钢(含上海全部仓库)'])

def _load_monthly_output(filename, yoy_column, value_column, label):
    '''
    Load one monthly production sheet and align it to the full month range.

    filename     : sheet name passed to Macro (without directory or extension)
    yoy_column   : year-over-year column to discard
    value_column : monthly value column to keep
    label        : short column name the values are stored under

    Returns the Macro object; its `data` has one row per month of
    `macro.period` with columns year, month and `label`. Months missing from
    the sheet are back-filled from the next available month.
    '''
    macro = Macro(filename, year_range)
    data = macro.data.drop(columns = [yoy_column])
    # Index by 'YYYY-MM' strings so the rows can be reindexed onto the full period
    data['date'] = data.date.dt.strftime('%Y-%m')
    data = data.set_index(['date']).sort_index().reindex(macro.period.astype(str))
    # Rows created by reindex have no year/month, so rebuild both from the period
    data['year'] = macro.period.year
    data['month'] = macro.period.month
    data[value_column] = data[value_column].astype('float').bfill()
    data[label] = data[value_column].values
    data = data.drop(columns = [value_column])
    macro.data = data.reset_index(drop = True)
    return macro

# Monthly refrigerator output
Fridge = _load_monthly_output('国内冰箱月产量', '产量:家用电冰箱:当月同比',
                              '产量:家用电冰箱:当月值', '冰箱')

# Monthly air-conditioner output
AC = _load_monthly_output('国内空调月产量', '产量:空调:当月同比',
                          '产量:空调:当月值', '空调')

# Monthly washing-machine output
Laundry = _load_monthly_output('国内洗衣机月产量', '产量:家用洗衣机:当月同比',
                               '产量:家用洗衣机:当月值', '洗衣机')

# Monthly automobile output
Vehicle = _load_monthly_output('国内汽车月产量 copy', '产量:汽车:当月同比',
                               '产量:汽车:当月值', '汽车')

# Commercial housing data: weekly figures averaged within each (year, month)
Housing = Macro('30大中城市商品房成交(周)', year_range)
Housing.data = Housing.data.drop(columns = ['30大中城市:商品房成交面积:当周值:环比', 
                                            '30大中城市:商品房成交面积:当周值:同比',
                                            '30大中城市:商品房成交套数:当周值:环比', 
                                            '30大中城市:商品房成交套数:当周值:同比'])
Housing.data['30大中城市:商品房成交面积:当周值'] = Housing.data['30大中城市:商品房成交面积:当周值'].astype(float)
Housing.data['30大中城市:商品房成交套数:当周值'] = Housing.data['30大中城市:商品房成交套数:当周值'].astype(float)
# numeric_only=True restricts the monthly mean to numeric columns, so the
# datetime `date` column is left out on every pandas version instead of
# leaking into the later merges.
Housing.data = Housing.data.groupby(['year', 'month']).mean(numeric_only=True).reset_index()
Housing.data = Housing.data.rename(columns={'30大中城市:商品房成交面积:当周值':'商品房月成交面积', 
                                            '30大中城市:商品房成交套数:当周值':'商品房月成交套数'})

# Government bond yield, total social financing, and M0/M1/M2 data
IR = Macro('信贷利率', year_range)
_ir = IR.data
# Smooth the 10-year yield first, then keep only the rows that have an M0 value
_ir['中债国债到期收益率_filter'] = HP(_ir['中债国债到期收益率:10年'], lamb = 100)
_ir = _ir.dropna(subset = ['M0'])
_ir['中债国债到期收益率'] = _ir['中债国债到期收益率_filter'].values
_ir['社会融资规模'] = _ir['社会融资规模:当月值'].values
for _money_col in ['M0', 'M1', 'M2']:
    _ir[_money_col] = _ir[_money_col].values
_ir = _ir.drop(columns=['社会融资规模:当月值', 
                        '中债国债到期收益率:10年', '中债国债到期收益率_filter', 'date'])
IR.data = _ir.reset_index(drop=True)

In [None]:
# Monthly DataFrame of absolute levels

# Rebar and coke prices are joined on (year, month); the overlapping columns
# receive the _x / _y suffixes.
RBMacro = RB.monthly.merge(J.monthly, on = ['year', 'month'])
# Each macro table is then merged on whatever columns it shares with the result so far.
for _source in (StorageSH, Fridge, AC, Laundry, Vehicle, Housing, IR):
    RBMacro = RBMacro.merge(_source.data)
RBMacro = RBMacro.drop(columns = ['trade_date_x', 'trade_date_y'])
RBMacro = RBMacro.reset_index(drop = True)
RBMacro = RBMacro.astype(float)

In [None]:
def corrMatrix(col1, col2, N):
    '''
    Correlation matrix between the month-over-month growth of RBMacro[col1]
    and lagged growth rates of RBMacro[col2].

    Returns an N x N DataFrame. Row 'len k' uses the k-period growth rate of
    col2 and column 'lag m' shifts that growth rate back by m periods before
    correlating it with col1's month-over-month growth.
    '''
    # Neither series depends on the loop indices, so compute them once.
    target = MOM(RBMacro[col1].values)
    levels = RBMacro[col2].values

    cm = np.zeros((N, N))
    for i in range(N):
        # The growth rate depends only on the look-back length i+1.
        growth = XOX(levels, i + 1)
        for j in range(N):
            cm[i, j] = pearson(target, lagging(growth, j + 1))
    return pd.DataFrame(cm).rename(lambda x: 'len ' + str(x+1), axis = 0) \
                           .rename(lambda x: 'lag ' + str(x+1), axis = 1)