In [113]:
import pandas as pd
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.stattools import adfuller
import numpy as np

In [138]:
#importing dataset
companies_with_cap = pd.read_csv('../data/companies_after_2005.csv', encoding='latin1')
df = pd.read_csv('../data/new_dataset.csv', encoding='latin1')

In [139]:
#merging datasets to get companies' capitalization label
companies_with_cap.rename(columns={'Symbol': 'symbol'}, inplace=True)
df = df.merge(companies_with_cap[['symbol', 'capitalization']], on='symbol', how='left')
#dropping the NaN values before running the test
df.dropna(inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,index,symbol,date,svi,edgar,price,volume,WeeklyReturns,returns,capitalization
1,1,1,MSFT,2005-01-09,33079,3220,26.12,379712121.0,-2.062242,-2.0,high
2,2,2,MSFT,2005-01-16,20892,3030,25.65,266617523.0,-1.799387,-2.0,high
3,3,3,MSFT,2005-01-23,34820,3539,26.18,409844550.0,2.066277,2.0,high
4,4,4,MSFT,2005-01-30,26115,4112,26.32,347830186.0,0.534759,1.0,high
5,5,5,MSFT,2005-02-06,27856,3502,25.97,360145305.0,-1.329787,-1.0,high


In [116]:
#Function for the stationary test for each company
def check_stationarity_for_each_company(data, company_column, column1, column2, alpha=0.05):
    """
    Perform Augmented Dickey-Fuller test for stationarity on two columns of a time series for each company.

    Parameters:
    - data: DataFrame containing time series data.
    - company_column: Name of the column containing company identifiers.
    - column1: Name of the first column to test for stationarity.
    - column2: Name of the second column to test for stationarity.
    - alpha: Significance level for the test.

    Returns:
    - stationarity_results: DataFrame containing ADF test results and stationarity indicator for each company and variable.
    """

    # Initialize an empty DataFrame to store results
    stationarity_results = pd.DataFrame(columns=[company_column, f"{column1}_ADF_Statistic", f"{column1}_P-Value",
                                                  f"{column2}_ADF_Statistic", f"{column2}_P-Value", f"{column1}_Stationary", f"{column2}_Stationary"])

    # Iterate over unique companies
    for company in data[company_column].unique():
        company_data = data[data[company_column] == company]

        # Extract the time series data for each column
        ts1 = company_data[column1]
        ts2 = company_data[column2]

        # Perform ADF test for each column
        result1 = adfuller(ts1)
        result2 = adfuller(ts2)

        # Add results to the DataFrame
        stationarity_results = stationarity_results.append({
            company_column: company,
            f"{column1}_ADF_Statistic": result1[0],
            f"{column1}_P-Value": result1[1],
            f"{column2}_ADF_Statistic": result2[0],
            f"{column2}_P-Value": result2[1],
            f"{column1}_Stationary": 1 if result1[1] <= alpha else 0,
            f"{column2}_Stationary": 1 if result2[1] <= alpha else 0
        }, ignore_index=True)

    return stationarity_results

  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationa

In [None]:
#checking stationarity of the time series 
check_stationarity_for_each_company(df, 'symbol', 'volume', 'WeeklyReturns')

In [None]:
#checking stationarity of the time series
check_stationarity_for_each_company(df, 'symbol', 'svi', 'edgar')

In [140]:
#applying one-step differencing 
columns_to_difference = ['svi', 'edgar', 'price', 'volume', 'WeeklyReturns', 'returns']

# Create a new DataFrame to store the final results
df_final = pd.DataFrame()

# Iterate over each symbol and apply differencing
for symbol, group in df.groupby('symbol'):
    df_diff = group.copy()
    df_diff[columns_to_difference] = group[columns_to_difference].diff().dropna()
    df_final = pd.concat([df_final, df_diff], ignore_index=True)

# Keep non-numerical columns in the final DataFrame
non_numerical_columns = ['Unnamed: 0', 'index', 'symbol', 'date', 'capitalization']
df_final[non_numerical_columns] = df[non_numerical_columns]

# Drop NaN values resulting from differencing
df_final = df_final.dropna()

In [137]:
#checking the stationarity again
check_stationarity_for_each_company(df, 'symbol', 'svi', 'edgar', alpha=0.05)

  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationarity_results = stationarity_results.append({
  stationa

Unnamed: 0,symbol,svi_ADF_Statistic,svi_P-Value,edgar_ADF_Statistic,edgar_P-Value,svi_Stationary,edgar_Stationary
0,MSFT,-1.741218,0.410011,-3.525858,0.007347,0,1
1,AAPL,-2.403381,0.140785,-1.832679,0.364363,0,0
2,NVDA,-0.749656,0.833465,-1.195702,0.675442,0,0
3,AMZN,-1.665947,0.448737,-2.070649,0.256479,0,0
4,GOOG,-1.168456,0.687016,-3.762062,0.003320,0,1
...,...,...,...,...,...,...,...
285,ADI,-0.097238,0.949713,-1.433386,0.566170,0,0
286,PH,-1.625990,0.469591,-2.265015,0.183500,0,0
287,EOG,-3.152770,0.022887,-2.217694,0.199886,1,0
288,VLO,-1.331729,0.614534,-2.024949,0.275742,0,0


In [None]:
#checking the stationarity again
check_stationarity_for_each_company(df, 'symbol', 'volume', 'WeeklyReturns', alpha=0.05)

In [141]:
#updating df
df = df_final
df

Unnamed: 0.1,Unnamed: 0,index,symbol,date,svi,edgar,price,volume,WeeklyReturns,returns,capitalization
1,1.0,1.0,MSFT,2005-01-09,-11200.0,102.0,-0.25000,728200.0,1.772422,2.0,high
2,2.0,2.0,MSFT,2005-01-16,8400.0,29.0,-0.19000,-1301300.0,0.262227,0.0,high
3,3.0,3.0,MSFT,2005-01-23,8400.0,73.0,0.84000,-1659100.0,4.758649,5.0,high
4,4.0,4.0,MSFT,2005-01-30,-2800.0,45.0,1.27000,1122700.0,1.769926,2.0,high
5,5.0,5.0,MSFT,2005-02-06,0.0,335.0,-0.15000,6275300.0,-6.289393,-7.0,high
...,...,...,...,...,...,...,...,...,...,...,...
278685,278685.0,376548.0,HES,2007-10-14,16345.0,-2134.0,1.01998,-3596352.0,2.939887,3.0,high
278686,278686.0,376549.0,HES,2007-10-21,-3269.0,-1737.0,-0.22999,959397.0,-0.907903,-1.0,high
278687,278687.0,376550.0,HES,2007-10-28,6538.0,1646.0,-8.42000,2444590.0,-5.923049,-6.0,high
278688,278688.0,376551.0,HES,2007-11-04,-6538.0,2177.0,4.67000,5289693.0,9.685275,10.0,high


In [142]:
#granger test function
def test_results_int(df, lag_num, verbose):
    results_table = []

    for symbol in df['symbol'].unique():
        # Extract data for the current company
        company_data = df[df['symbol'] == symbol]

        # Granger causality test for returns and svi
        returns_svi_test = grangercausalitytests(company_data[['WeeklyReturns', 'svi']], lag_num, verbose=verbose)
        returns_svi_p_values = returns_svi_test[lag_num][0]['ssr_ftest'][1]
        returns_svi_conclusion = 1 if returns_svi_p_values < 0.05 else 0

        # Granger causality test for returns and edgar
        returns_edgar_test = grangercausalitytests(company_data[['WeeklyReturns', 'edgar']], lag_num, verbose=verbose)
        returns_edgar_p_values = returns_edgar_test[lag_num][0]['ssr_ftest'][1]
        returns_edgar_conclusion = 1 if returns_edgar_p_values < 0.05 else 0

        # Granger causality test for volume and svi
        volume_svi_test = grangercausalitytests(company_data[['volume', 'svi']], lag_num, verbose=verbose)
        volume_svi_p_values = volume_svi_test[lag_num][0]['ssr_ftest'][1]
        volume_svi_conclusion = 1 if volume_svi_p_values < 0.05 else 0

        # Granger causality test for volume and edgar
        volume_edgar_test = grangercausalitytests(company_data[['volume', 'edgar']], lag_num, verbose=verbose)
        volume_edgar_p_values = volume_edgar_test[lag_num][0]['ssr_ftest'][1]
        volume_edgar_conclusion = 1 if volume_edgar_p_values < 0.05 else 0

        # Extract capitalization index for the current company
        capitalization = company_data['capitalization'].values[0]

        # Append the results to the table
        results_table.append({
            'Symbol': symbol,
            'Capitalization': capitalization,
            'Returns-SVI Test Results': returns_svi_p_values,
            'Returns-SVI Conclusion': returns_svi_conclusion,
            'Returns-EDGAR Test Results': returns_edgar_p_values,
            'Returns-EDGAR Conclusion': returns_edgar_conclusion,
            'Volume-SVI Test Results': volume_svi_p_values,
            'Volume-SVI Conclusion': volume_svi_conclusion,
            'Volume-EDGAR Test Results': volume_edgar_p_values,
            'Volume-EDGAR Conclusion': volume_edgar_conclusion,
        })

    # Convert the results to a DataFrame
    results_df = pd.DataFrame(results_table)

    return results_df

In [143]:
#running the function
results = test_results_int(df, 12, False)



In [122]:
#saving results csv
results.to_csv('../data-analysis/granger results.csv')

In [144]:
#separating high cap companies from low cap companies
resultshigh = results[results['Capitalization'] == 'high']
resultslow = results[results['Capitalization'] == 'low']

In [145]:
#calculating percentage for each type of relationship
percentRS = results['Returns-SVI Conclusion'].sum(axis=0)/len(results)
percentRE = results['Returns-EDGAR Conclusion'].sum(axis=0)/len(results)
percentVS = results['Volume-SVI Conclusion'].sum(axis=0)/len(results)
percentVE = results['Volume-EDGAR Conclusion'].sum(axis=0)/len(results)
percentRShigh = resultshigh['Returns-SVI Conclusion'].sum(axis=0)/len(resultshigh)
percentREhigh = resultshigh['Returns-EDGAR Conclusion'].sum(axis=0)/len(resultshigh)
percentVShigh = resultshigh['Volume-SVI Conclusion'].sum(axis=0)/len(resultshigh)
percentVEhigh = resultshigh['Volume-EDGAR Conclusion'].sum(axis=0)/len(resultshigh)
percentRSlow = resultslow['Returns-SVI Conclusion'].sum(axis=0)/len(resultslow)
percentRElow = resultslow['Returns-EDGAR Conclusion'].sum(axis=0)/len(resultslow)
percentVSlow = resultslow['Volume-SVI Conclusion'].sum(axis=0)/len(resultslow)
percentVElow = resultslow['Volume-EDGAR Conclusion'].sum(axis=0)/len(resultslow)
print (percentRS, percentRE, percentVS, percentVE, percentRShigh, percentREhigh, percentVShigh, 
       percentVEhigh, 
       percentRSlow, percentRElow, percentVSlow, percentVElow)

0.12758620689655173 0.05172413793103448 0.2413793103448276 0.1724137931034483 0.12757201646090535 0.04526748971193416 0.2551440329218107 0.16872427983539096 0.1276595744680851 0.0851063829787234 0.1702127659574468 0.19148936170212766
