In [1]:
# Install the third-party dependencies with the %pip magic so they land in the
# environment of the running kernel (a bare `!pip3` may target another Python).
%pip install yfinance scipy pandas



In [2]:
import pandas as pd
from datetime import datetime, timedelta
from collections import Counter
import yfinance as yf
import scipy
import operator

In [3]:
# Load the per-post analysis results; the path is relative to the notebook's
# working directory.
datapath = "dataset/method1_analysis_result_kaggle.csv"
df = pd.read_csv(filepath_or_buffer=datapath)

In [4]:
def get_post_freq(timestamps, interval: timedelta, start_time: datetime = None, end_time: datetime = None) -> pd.DataFrame:
    """Count how many timestamps fall into each fixed-width time bucket.

    Buckets are aligned on ``start_time``: bucket ``k`` covers
    ``[start_time + k * interval, start_time + (k + 1) * interval)`` and is
    labelled by its left edge.

    Args:
        timestamps: iterable of strings formatted as ``'%Y-%m-%d %H:%M:%S'``.
        interval: width of one bucket.
        start_time: inclusive lower bound; defaults to the earliest timestamp.
        end_time: exclusive upper bound; defaults to one second after the
            latest timestamp.

    Returns:
        A DataFrame with columns ``timestamp`` (bucket start) and
        ``frequency`` (number of posts), ordered by time. Zero rows are added
        at the range bounds and directly next to every non-empty bucket that
        borders a gap; a long gap is therefore represented by its two edge
        buckets only, not by one row per empty bucket. If there are no
        timestamps inside the range, an empty DataFrame with the same two
        columns is returned.

    Raises:
        ValueError: if a timestamp does not match the expected format.
    """
    columns = ["timestamp", "frequency"]

    # Parse first, then sort: ordering real datetimes does not rely on the
    # strings happening to sort chronologically.
    parsed = sorted(datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S') for stamp in timestamps)
    if not parsed:
        return pd.DataFrame(columns=columns)

    if start_time is None:
        start_time = parsed[0]
    if end_time is None:
        end_time = parsed[-1] + timedelta(seconds=1)

    # Snap every in-range timestamp to the left edge of its bucket.
    bucket_starts = [
        start_time + interval * ((moment - start_time) // interval)
        for moment in parsed
        if start_time <= moment < end_time
    ]
    if not bucket_starts:
        return pd.DataFrame(columns=columns)

    # Counter keeps first-seen order, and the input is sorted, so the result
    # is already chronological.
    freq_list = list(Counter(bucket_starts).items())

    if freq_list[-1][0] < end_time - interval:  # add end bound
        freq_list.append((end_time - interval, 0))
    if freq_list[0][0] > start_time:  # add start bound
        freq_list.insert(0, (start_time, 0))

    # Walk backwards so that insertions never shift the indices still to visit.
    for i in range(len(freq_list) - 2, -1, -1):
        gap = freq_list[i + 1][0] - freq_list[i][0]
        if gap > 2 * interval:
            # Wide gap: mark only its two edges with a zero.
            freq_list.insert(i + 1, (freq_list[i + 1][0] - interval, 0))
            freq_list.insert(i + 1, (freq_list[i][0] + interval, 0))
        elif gap == 2 * interval:
            # Exactly one empty bucket in between.
            freq_list.insert(i + 1, (freq_list[i][0] + interval, 0))

    return pd.DataFrame(freq_list, columns=columns)

In [5]:
# Keep only the posts that carry a label for the chosen ticker, then split
# them by label value (0 / 1 / 2 feed the neutral / positive / negative frames).
target_ticker_symbol = "GME"
df = df[df[target_ticker_symbol].notna()]
df_neutral = df.loc[df[target_ticker_symbol] == 0]
df_positive = df.loc[df[target_ticker_symbol] == 1]
df_negative = df.loc[df[target_ticker_symbol] == 2]

# Daily post counts: overall and per label.
interval = timedelta(days=1)
df_freq, df_freq_neutral, df_freq_positive, df_freq_negative = (
    get_post_freq(frame["timestamp"], interval)
    for frame in (df, df_neutral, df_positive, df_negative)
)

# get stock price data
# The download window starts on the day of the first post and ends 20 days
# after the day of the last post.
ticker = yf.Ticker(target_ticker_symbol)
_sorted_timestamps = sorted(df["timestamp"])
start_date = _sorted_timestamps[0].split(" ")[0]
end_date = _sorted_timestamps[-1].split(" ")[0]
end_date = (datetime.strptime(end_date, "%Y-%m-%d") + timedelta(days=20)).strftime("%Y-%m-%d")
df_stock = ticker.history(start=start_date, end=end_date, interval="1d")

### Correlation between post frequency and the open-price difference / volume, with an n-day shift

In [6]:
# stock_data must contain at least one row later than end_time + shift: the
# open-price difference of the last compared day needs the following day's open.
def get_correlation(frequency_data, stock_data, start_time, end_time, mode="all", shift=timedelta(days=0)):
    """Spearman correlation of post frequency with open-price change and volume.

    Posts in ``[start_time, end_time)`` are paired with the stock rows in
    ``[start_time + shift, end_time + shift)``. Stock data has no rows for
    non-trading days, so every post count (after moving it by ``shift``) is
    accumulated into the first stock row dated on or after it.

    Args:
        frequency_data: DataFrame with a datetime ``timestamp`` column and a
            ``frequency`` column, ordered by time.
        stock_data: DataFrame indexed by a DatetimeIndex with ``Open`` and
            ``Volume`` columns (the shape returned by ``yfinance`` history).
        start_time: inclusive start of the post window.
        end_time: exclusive end of the post window.
        mode: how the next-day open difference is transformed before ranking:
            ``"all"`` uses its absolute value, ``"positive"`` the difference
            itself, ``"negative"`` its negation.
        shift: lag applied to the stock window relative to the post window.

    Returns:
        ``(freq_start_time, freq_end_time, stock_start_time, stock_end_time,
        open_correlation, volume_correlation)`` where the first four are the
        bounds of the data actually compared and the last two are the results
        of ``scipy.stats.spearmanr``.

    Raises:
        ValueError: for an unknown ``mode`` or when a window selects too
            little data to correlate.
    """
    # A bare `import scipy` does not guarantee that the stats submodule is loaded.
    from scipy import stats

    open_funcs = {"all": abs, "positive": operator.pos, "negative": operator.neg}
    if mode not in open_funcs:
        raise ValueError("Wrong mode")
    open_func = open_funcs[mode]

    if len(stock_data) < 2:
        raise ValueError("Stock data should contain at least 2 rows to infer its sampling interval")

    in_freq_window = (frequency_data["timestamp"] >= start_time) & (frequency_data["timestamp"] < end_time)
    selected_frequency_data = frequency_data.loc[in_freq_window]

    # Compare on timezone-naive values so plain datetimes can be used as bounds.
    naive_index = stock_data.index.tz_localize(None)
    # NOTE(review): the step is taken from the first two rows only; if they
    # straddle a non-trading day the window is extended by more than one day.
    stock_step = stock_data.index[1] - stock_data.index[0]
    in_stock_window = (naive_index >= start_time + shift) & (naive_index < end_time + shift + stock_step)
    selected_stock_data = stock_data.loc[in_stock_window]

    if len(selected_frequency_data) == 0:
        raise ValueError(f"No frequency data in range of [{start_time}, {end_time})")
    if len(selected_stock_data) == 0:
        raise ValueError(f"No stock data in range of [{start_time+shift}, {end_time+shift})")
    elif len(selected_stock_data) == 1:
        raise ValueError(f"Stock data should contain at least 2 data in range of [{start_time+shift}, {end_time+shift})")

    # The last selected stock row only supplies the "next open" of the row
    # before it, so it is not a comparison day itself.
    stock_dates = [stamp.date() for stamp in selected_stock_data.index[:-1]]

    # Move the post dates by `shift` so they line up with the shifted stock window.
    freq_dates = [(stamp + shift).date() for stamp in selected_frequency_data["timestamp"]]
    freq_counts = selected_frequency_data["frequency"].tolist()

    # Fold every post count into the first stock day on or after its date.
    new_freq = []
    cursor = 0
    for stock_date in stock_dates:
        day_total = 0
        while cursor < len(freq_dates) and freq_dates[cursor] <= stock_date:
            day_total += freq_counts[cursor]
            cursor += 1
        new_freq.append(day_total)

    # Difference between the next row's open and this row's open, taken from
    # the selected window (not from the full price history).
    opens = selected_stock_data["Open"].tolist()
    new_open = [open_func(following - current) for current, following in zip(opens[:-1], opens[1:])]

    open_correlation = stats.spearmanr(new_freq, new_open)
    volume_correlation = stats.spearmanr(new_freq, selected_stock_data["Volume"].iloc[:-1])

    freq_start_time = selected_frequency_data["timestamp"].iloc[0].to_pydatetime()
    freq_end_time = selected_frequency_data["timestamp"].iloc[-1].to_pydatetime()
    stock_start_time = selected_stock_data.index[0].tz_localize(None).to_pydatetime()
    stock_end_time = selected_stock_data.index[-2].tz_localize(None).to_pydatetime()

    return freq_start_time, freq_end_time, stock_start_time, stock_end_time, open_correlation, volume_correlation

In [7]:
# Correlate the frequency of all posts over the first 20 days of the data set,
# using the absolute open-price change and no shift.
start_time = datetime.strptime(start_date, "%Y-%m-%d")
end_time = start_time + timedelta(days=20)
(
    freq_start_time,
    freq_end_time,
    stock_start_time,
    stock_end_time,
    open_correlation,
    volume_correlation,
) = get_correlation(
    frequency_data=df_freq,
    stock_data=df_stock,
    start_time=start_time,
    end_time=end_time,
    mode="all",
    shift=timedelta(days=0),
)

In [8]:
# Report the compared ranges, then coefficient and p-value for both correlations.
print(
    f"Compare frequency data in time range of [{freq_start_time}, {freq_end_time}] with stock data in range of [{stock_start_time}, {stock_end_time}]",
    f"The correlation between frequency and Open difference is {open_correlation[0]} with p-value {open_correlation[1]}",
    f"The correlation between frequency and Volume is {volume_correlation[0]} with p-value {volume_correlation[1]}",
    sep="\n",
)

Compare frequency data in time range of [2021-01-28 09:08:18, 2021-02-16 09:08:18] with stock data in range of [2021-01-28 00:00:00, 2021-02-16 00:00:00]
The correlation between frequency and Open difference is 0.8406593406593407 with p-value 0.00031907124069371434
The correlation between frequency and Volume is 0.7857142857142857 with p-value 0.0014541896038438224


In [9]:
# Same 20-day window, restricted to positive posts and compared against the
# signed open-price change.
start_time = datetime.strptime(start_date, "%Y-%m-%d")
end_time = start_time + timedelta(days=20)
(
    freq_start_time,
    freq_end_time,
    stock_start_time,
    stock_end_time,
    open_correlation,
    volume_correlation,
) = get_correlation(
    frequency_data=df_freq_positive,
    stock_data=df_stock,
    start_time=start_time,
    end_time=end_time,
    mode="positive",
    shift=timedelta(days=0),
)

In [10]:
# Report the compared ranges, then coefficient and p-value for both correlations.
print(
    f"Compare positive frequency data in time range of [{freq_start_time}, {freq_end_time}] with stock data in range of [{stock_start_time}, {stock_end_time}]",
    f"The correlation between frequency and Open difference is {open_correlation[0]} with p-value {open_correlation[1]}",
    f"The correlation between frequency and Volume is {volume_correlation[0]} with p-value {volume_correlation[1]}",
    sep="\n",
)

Compare positive frequency data in time range of [2021-01-28 09:08:18, 2021-02-16 09:08:18] with stock data in range of [2021-01-28 00:00:00, 2021-02-16 00:00:00]
The correlation between frequency and Open difference is -0.2087912087912088 with p-value 0.493621819356885
The correlation between frequency and Volume is 0.7252747252747254 with p-value 0.005023237655349918


In [11]:
# Same 20-day window, restricted to negative posts and compared against the
# negated open-price change.
start_time = datetime.strptime(start_date, "%Y-%m-%d")
end_time = start_time + timedelta(days=20)
(
    freq_start_time,
    freq_end_time,
    stock_start_time,
    stock_end_time,
    open_correlation,
    volume_correlation,
) = get_correlation(
    frequency_data=df_freq_negative,
    stock_data=df_stock,
    start_time=start_time,
    end_time=end_time,
    mode="negative",
    shift=timedelta(days=0),
)

In [12]:
# Report the compared ranges, then coefficient and p-value for both correlations.
print(
    f"Compare negative frequency data in time range of [{freq_start_time}, {freq_end_time}] with stock data in range of [{stock_start_time}, {stock_end_time}]",
    f"The correlation between frequency and Open difference is {open_correlation[0]} with p-value {open_correlation[1]}",
    f"The correlation between frequency and Volume is {volume_correlation[0]} with p-value {volume_correlation[1]}",
    sep="\n",
)

Compare negative frequency data in time range of [2021-01-28 09:09:21, 2021-02-16 09:09:21] with stock data in range of [2021-01-28 00:00:00, 2021-02-16 00:00:00]
The correlation between frequency and Open difference is 0.16483516483516483 with p-value 0.590478988652005
The correlation between frequency and Volume is 0.7912087912087913 with p-value 0.0012750239017411194
