# Factor Selection
## Growth Factor

In [125]:
from WindPy import w
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.stats.api import het_white, het_breuschpagan
from statsmodels.stats.sandwich_covariance import cov_hac
from statsmodels.tsa.seasonal import STL

In [2]:
w.start()

Wind.Cosmos.Base V1.7 compiled time is Apr 17 2023, BuildType:Release, CPUArch:X64, GCC Version:Apple LLVM 13.0.0 (clang-1300.0.29.30)
Welcome to use Wind Quant API for Python (WindPy)!

COPYRIGHT (C) 2021 WIND INFORMATION CO., LTD. ALL RIGHTS RESERVED.
IN NO CIRCUMSTANCE SHALL WIND BE RESPONSIBLE FOR ANY DAMAGES OR LOSSES CAUSED BY USING WIND QUANT API FOR Python.


.ErrorCode=0
.Data=[OK!]

In [82]:
start_date = "2008-12-01" # actually from 2010, extract data for preprocessing
end_date = "2024-05-31"

In [20]:
# takes a dict of indicators and their corresponding ticker in Wind to
# create a DataFrame containing the data extracted from Wind API
def extract_inds(inds):
    df = pd.DataFrame()
    for ind in inds:
        raw = w.edb(inds[ind], start_date, end_date)
        # Extract data from the Wind API response
        times = raw.Times
        data = raw.Data[0]
        # Convert times to pandas datetime format
        times = pd.to_datetime([str(time) for time in times], format="%Y-%m-%d")
        
        # Create a temporary DataFrame for the current indicator
        temp_df = pd.DataFrame(data, index=times, columns=[ind])
        
        # Combine the temporary DataFrame with the main DataFrame
        if df.empty:
            df = temp_df
        else:
            df = df.join(temp_df, how='outer')
    return df

In [28]:
def lag_data(df, periods=1):
    df_lagged = df.shift(periods)
    df_lagged = df_lagged.iloc[periods:]  # Remove the first rows
    return df_lagged

In [6]:
# STL seasonality processing
def apply_stl(df, columns, seasonal=13):
    df_adjusted = df.copy()
    for column in columns:
        stl = STL(df[column], seasonal=seasonal)
        result = stl.fit()
        df_adjusted[column] = result.seasonal  # Replace with seasonally adjusted component
    return df_adjusted

In [7]:
# YoY Difference Processing
def to_yoy(df, columns):
    df_yoy = df.copy()
    for column in columns:
        df_yoy[column] = df[column] - df[column].shift(12)
    df_yoy = df_yoy.iloc[12:]  # Remove the first 12 rows
    return df_yoy

In [109]:
# Get USD/CNH Exchange Rate
fx_data = w.edb("M0000185", "2009-01-01", end_date, Period="M")
# Parse the data
times = fx_data.Times
data = fx_data.Data[0]
times = pd.to_datetime([str(time) for time in times], format="%Y-%m-%d")

# Construct the DataFrame
df_forex = pd.DataFrame(data, index=times, columns=["USD/CNY Central Parity Rate"])
df_forex = to_yoy(df_forex, df_forex.columns)

# Display the DataFrame
df_forex

Unnamed: 0,USD/CNY Central Parity Rate
2010-01-29,-0.0110
2010-02-26,-0.0110
2010-03-31,-0.0096
2010-04-30,0.0013
2010-05-31,-0.0044
...,...
2024-01-31,0.3435
2024-02-29,0.1517
2024-03-29,0.2233
2024-04-30,0.1823


In [139]:
# for factor selection, regress on the exchang rate to observe Beta, T-Value, and R-Squared
# and record the median values after 1000 iterations
def regress(indicator_df, forex_df, target_column='USD/CNY Central Parity Rate', n_iterations=1000, min_period_length=24):
    # Initialize a DataFrame to store regression results
    df_forex.index = indicator_df.index
    results = []

    for i in range(n_iterations):
        # Resample with replacement
        sample_indices = np.random.choice(indicator_df.index, size=len(indicator_df), replace=True)
        sample_growth_df = indicator_df.loc[sample_indices]
        sample_forex_df = forex_df.loc[sample_indices]
        
        # Randomly select start and end dates
        start_idx = np.random.randint(0, len(sample_growth_df) - min_period_length)
        end_idx = start_idx + min_period_length
        
        # Subset the data to ensure period longer than 2 years
        sample_growth_period = sample_growth_df.iloc[start_idx:end_idx]
        sample_forex_period = sample_forex_df.iloc[start_idx:end_idx]
        
        # Perform regression for each indicator
        for indicator in indicator_df.columns:
            X = sample_growth_period[[indicator]]
            X = sm.add_constant(X)
            y = sample_forex_period[target_column]
            
            model = sm.OLS(y, X).fit(cov_type='HAC', cov_kwds={'maxlags':1})
            beta = model.params[indicator]
            t_value = model.tvalues[indicator]
            r_squared = model.rsquared
            
            # Append the results
            results.append({
                'Indicator': indicator,
                'Beta': beta,
                'T-Value': t_value,
                'R-Squared(%)': r_squared * 100  # Convert to percentage
            })
            
    # Convert results to DataFrame
    growth_results_df = pd.DataFrame(results)

    # Compute median values
    growth_median_results = growth_results_df.groupby('Indicator').median().reset_index()

    # Format the median results
    growth_median_results['Beta'] = growth_median_results['Beta'].map("{:.2f}".format)
    growth_median_results['T-Value'] = growth_median_results['T-Value'].map("{:.2f}".format)
    growth_median_results['R-Squared(%)'] = growth_median_results['R-Squared(%)'].map("{:.2f}%".format)

    return growth_median_results

In [9]:
# consider key growth indicators for China
china_growth_inds = {
    # production method
    # general macro indicators
    'GDP_China(%)': 'M0039354',
    'Industrial_Growth(%)': 'M0000545',
    'PMI': 'M0017126',
    'PMI_manufacture': 'M0017127',
    'PMI_new_orders': 'M0017128',
    # by industry
    'electricity(%)': 'S0027013',
    'concrete(%)': 'S0027703',
    'steel(%)': 'S0027375',
    'automobile(%)': 'S0027908',
    'railroad_cargo(%)': 'S0036034',
    # expenditure method
    # investments
    'fixed_asset(%)': 'M0000273',
    'real_estate(%)': 'S0029657',
    'infrastructure(%)': 'M5440435',
    'manufacture(%)': 'M0000357',
    # consumption
    'retail(%)': 'M0001428',
    'automobile_sales(%)': 'S6114593',
    'tractor_sales(%)': 'S6002167',
    'commercial_RE_area(%)': 'S0073300',
    'commercial_RE_revenue(%)': 'S0049591',
    # net exports
    'import_export(%)': 'M0000605',
    'export(%)': 'M0000607',
    'import(%)': 'M0000609',
    # income method
    'govt_revenue(%)': 'M0046169',
    'industrials_biz_income(%)': 'M0000555',
    'industrials_tot_profits(%)': 'M0000557'
}
# for standardizing the PMI data
china_pmi_cols = ['PMI', 'PMI_manufacture', 'PMI_new_orders']
# indicate the data cleaning method for each indicator
china_methods = [0, ]

In [66]:
# actual step retrieving data from Wind API
df_china_growth = extract_inds(china_growth_inds)
df_china_growth = lag_data(df_china_growth)  # lag data by 1 period
df_china_growth = to_yoy(df_china_growth, china_pmi_cols)
df_china_growth = df_china_growth.bfill().interpolate(method="linear")  # interpolate missing values
df_china_growth = apply_stl(df_china_growth, df_china_growth.columns[1:], seasonal=13)
df_china_growth.head(10)


Unnamed: 0,GDP_China(%),Industrial_Growth(%),PMI,PMI_manufacture,PMI_new_orders,electricity(%),concrete(%),steel(%),automobile(%),railroad_cargo(%),...,automobile_sales(%),tractor_sales(%),commercial_RE_area(%),commercial_RE_revenue(%),import_export(%),export(%),import(%),govt_revenue(%),industrials_biz_income(%),industrials_tot_profits(%)
2010-01-31,11.9,-0.09697,2.67693,4.250446,4.295375,0.601227,-3.171591,0.625182,19.132004,1.014852,...,8.980903,7.264556,1.598908,3.587245,-5.436913,-7.867885,-2.066218,-7.074421,1.271387,4.131011
2010-02-28,12.2,2.78352,1.274089,2.060007,1.695931,6.630864,8.592328,5.1014,26.032968,1.212978,...,23.171006,46.17735,4.171527,7.164816,3.980726,-1.650477,14.438124,3.291982,-1.650102,2.322865
2010-03-31,12.2,0.323882,0.148882,0.227521,-0.081883,-1.960675,-5.5962,1.990873,-7.204441,0.227684,...,-4.601357,-19.095841,5.233529,8.981636,-0.4924,0.768446,-3.225484,1.876411,-0.776474,6.137579
2010-04-30,12.2,0.471551,-0.634509,-1.42066,-0.819768,0.754225,1.236998,2.104911,-3.118176,0.805208,...,-0.590846,12.352918,3.482161,4.557782,1.135167,-1.911502,4.450236,2.227174,-0.722639,-5.372916
2010-05-31,10.8,-0.273816,-0.712248,-1.000702,-1.176908,0.779088,0.088304,3.276847,-8.572519,-0.218718,...,-7.026914,0.38701,0.731982,1.376766,-0.202149,0.511089,-1.812951,1.657076,-0.592664,-2.811661
2010-06-30,10.8,-0.407686,-0.606715,-0.730296,-1.275922,0.575272,0.129788,1.702735,-7.269195,-0.427904,...,-6.265093,4.947086,-1.558555,-2.483254,4.114401,5.766158,1.343908,1.082689,-0.014208,-0.17499
2010-07-31,10.8,-0.729037,-0.614818,-0.85097,-1.010721,-0.935082,0.042277,-1.700767,-7.755009,-1.470858,...,-5.99801,-6.973331,-2.541324,-4.41225,-0.999995,2.650109,-5.773855,0.13351,-1.131388,-6.994809
2010-08-31,9.9,-0.719035,-0.419007,-0.808744,-0.629003,-0.843911,0.44964,-1.843696,-7.86935,-2.053152,...,-7.70756,-11.450918,-3.347716,-5.273329,-2.195537,1.002463,-6.217931,-0.417073,-0.480287,-4.060486
2010-09-30,9.9,-0.515715,-0.41654,-0.764131,-0.333367,-0.352534,-1.426724,-3.093636,-4.937814,-1.706694,...,-4.119795,-11.718382,-3.411548,-5.254498,1.174948,2.134809,-0.341625,-0.734189,0.271311,-0.970916
2010-10-31,9.9,-0.310241,-0.045951,-0.080899,0.172197,-1.706833,-0.997469,-2.974243,-2.319353,0.474085,...,-1.914242,-9.96274,-2.042651,-3.079432,-2.87076,-2.21034,-3.801352,-0.909689,0.434366,-0.140897


In [None]:
us_growth_inds = {
        'GDP_US': 'G1112986'
}

In [140]:
china_growth_ctrb = regress(df_china_growth, df_forex)
china_growth_ctrb

Unnamed: 0,Indicator,Beta,T-Value,R-Squared(%)
0,GDP_China(%),-0.05,-3.53,21.87%
1,Industrial_Growth(%),-0.0,-0.08,3.27%
2,PMI,0.0,0.03,1.70%
3,PMI_manufacture,-0.0,-0.01,1.57%
4,PMI_new_orders,0.0,0.04,1.53%
5,automobile(%),0.0,0.0,1.23%
6,automobile_sales(%),-0.0,-0.06,2.89%
7,commercial_RE_area(%),-0.0,-0.03,2.60%
8,commercial_RE_revenue(%),-0.0,-0.01,2.28%
9,concrete(%),-0.0,-0.01,2.17%
