In [4]:

#The level of statistical significance is often expressed as a p-value between 0 and 1. 
#The smaller the p-value, the stronger the evidence that you should reject the null hypothesis.

# p-value less than 0.05 (typically ≤ 0.05)
        #statistically significant
        #evidence against the null hypothesis
        #if the null hypothesis were true, a result at least this extreme would occur less than 5% of the time
        #reject the null hypothesis
        #accept the alternative hypothesis
        
 # p-value higher than 0.05 (> 0.05)     
    # not statistically significant 
    # insufficient evidence against the null hypothesis (not proof that the null is true)
    #retain (i.e. fail to reject) the null hypothesis
    #do not accept the alternative hypothesis
    

In [5]:
# example in the book
# We see that with a value of only 0.444, the t-ratio
# is very small, which indicates that the corresponding null hypothesis H0 : a = 0 is likely not to be
# rejected. Turning to the slope estimate for 'ret_future', the t-ratio is high with 146.543 suggesting that
# H0 : b = 0 is to be rejected against the alternative hypothesis of H1 : b ≠ 0.

In [6]:
# The p-values presented
# in the fourth column, 'P>|t|', confirm our expectations: the p-value for the constant is considerably
# larger than 0.1, meaning that the corresponding t-statistic is not even significant at a 10% level; in
# comparison, the p-value for the slope coefficient is zero to, at least, three decimal places. Thus, the
# null hypothesis for the slope coefficient is rejected at the 1% level.

In [7]:
#Futures is set
#as explanatory variable and Spot is the dependent variable.

In [3]:
import os

import numpy as np
import pandas as pd


In [4]:
import statsmodels.formula.api as smf

In [9]:
# Folder that holds 'SandPhedge.xls'. The original machine-specific location is kept as
# the default so existing runs behave the same; set the SANDPHEDGE_DATA_DIR environment
# variable to point the notebook at the data on any other machine.
_data_dir_override = os.environ.get('SANDPHEDGE_DATA_DIR')
# os.path.join(dir, '') guarantees the trailing separator that `path + filename` relies on.
path = (os.path.join(_data_dir_override, '') if _data_dir_override
        else 'C:\\Users\\Rameez PC\\Documents\\GitHub\\Econometrics\\hypothesis testing a\\')

In [10]:
# Read the spot/futures price workbook, using its first column as the row index.
file = pd.read_excel(
    path + 'SandPhedge.xls',
    index_col=0,
)

In [11]:
# Preview the first rows to check that the Spot and Futures columns loaded as expected.
file.head()

Unnamed: 0_level_0,Spot,Futures
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-02-01,1106.73,1106.9
2002-03-01,1147.39,1149.2
2002-04-01,1076.92,1077.2
2002-05-01,1067.14,1067.5
2002-06-01,989.82,990.1


In [12]:

# We can follow the steps described above and specify
# 'Spot' as the dependent variable and 'Futures' as the independent variable.

In [13]:
# Levels regression: Spot is the dependent variable and Futures the explanatory variable.
formula = 'Spot ~ Futures'
# β denotes the slope coefficient on Futures; the null hypothesis is H0: β = 1.
hypotheses = 'Futures = 1'

In [14]:
# Repeat the regression procedure: fit the OLS model described by `formula` to the price data.
results = smf.ols(formula=formula, data=file).fit()
# The test statistic for this hypothesis could be calculated by hand, but it is easier
# to let Python do the work through the fitted model's statsmodels `f_test` method.
f_test = results.f_test(hypotheses)
print(f_test)

<F test: F=array([[5.42447367]]), p=0.02136355027890708, df_denom=133, df_num=1>


In [15]:
# First we find the test statistics: 'F=array([[
# 5.42447367]])', which states that the value of the F-test is around 5.42. The corresponding p-value
# is 0.02, stated in the next position. As it is smaller than 0.05, we can reject the null hypothesis
# that the coefficient estimate is equal to 1 at the 5% significance level. The last two numbers present
# the denominator degrees of freedom (observations minus estimated parameters) and the numerator
# degrees of freedom (the number of restrictions tested) respectively.

In [16]:
# To generate a new data series of continuously compounded returns, we define a new function
# in Python to achieve this calculation (see In [4]). Specifically, we create a user-defined function
# called LogDiff where the input parameter is a Pandas column. To calculate the log difference,
# we first obtain a column which lags one period. This can be done by typing the command x.shift(1).
# Then, we take the log of the ratio of the original series to the lagged series. The
# function log comes from the NumPy library, so we can use this function by entering the command
# np.log(x/x.shift(1)). Next, we want to output the result expressed as a percentage. Therefore, the
# column is scaled by multiplying it by 100. It is worth noting that the first price observation will
# be lost when return series is computed. However, the first data point will still be displayed in the
# DataFrame as nan since Python keeps the length of the DataFrame intact. To avoid this pitfall, we
# employ the Pandas dropna() function to remove it. Finally, we return the newly-calculated column.
# We have to call the function itself outside the function content. This can be done by typing the
# command LogDiff(file['Spot']). Meanwhile, we also build the data DataFrame from file, which
# stores the prices data. A newly created DataFrame can be defined by using the Pandas DataFrame
# function. In the bracket of the DataFrame function, we specify two column names: ret_spot and
# ret_future respectively. To fill out their values, the LogDiff function is called to compute the new
# series. Once finished, we can repeat the process as stated above to print the finalised DataFrame. As
# can be seen in Out [4], the newly created DataFrame starts from March 2002 instead of February
# 2002 and the column names have changed from Spot and Futures to ret_spot and ret_future.

In [17]:
def LogDiff(x):
    """Return the continuously compounded percentage changes of a series.

    Each value is ``100 * ln(x_t / x_{t-1})``. Missing results are removed,
    so the first observation (which has no lagged value) never appears in
    the output.
    """
    lagged = x.shift(1)
    pct_log_change = 100*np.log(x/lagged)
    return pct_log_change.dropna()

In [18]:
# Collect the percentage log-return series of both price columns in one DataFrame.
data = pd.DataFrame(
    {
        'ret_spot': LogDiff(file['Spot']),
        'ret_future': LogDiff(file['Futures']),
    }
)

In [19]:
# Returns regression: ret_spot is the dependent variable and ret_future the explanatory variable.
formula = 'ret_spot ~ ret_future'
# Null hypothesis H0: the slope coefficient on ret_future equals 1.
hypotheses = 'ret_future = 1'

In [20]:
# Fit the OLS model described by `formula` to the return series.
results = smf.ols(formula=formula, data=data).fit()
# F-test of the linear restriction stated in `hypotheses`.
f_test = results.f_test(hypotheses)
print(f_test)

<F test: F=array([[1.54521196]]), p=0.21604712494294365, df_denom=132, df_num=1>


In [21]:
# With an F-statistic of about 1.55 and a corresponding p-value of about 0.22, we fail to reject the
# null hypothesis even at the 10% significance level (and therefore also at the 5% and 1% levels).