In [None]:
import re
import os
import nltk

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from textblob import TextBlob
from bertopic import BERTopic
from wordcloud import WordCloud

# Download necessary NLTK data (if not already done)
'''
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
'''

In [None]:
# Load the tweet dataset that the rest of the notebook works on.
Trump_df = pd.read_csv("../Data/Tweat_data/Trump_election_market_time.csv", encoding="utf-8")

In [None]:
# Report the frame's dimensions, then preview its first five rows
print(Trump_df.shape)
Trump_df.head()

In [None]:
def clean_text(text):
    """Normalize one tweet for topic modelling.

    Steps, in order: drop every token containing the uppercase letters "RT",
    drop URLs, strip punctuation/symbols, collapse whitespace, tokenize with
    NLTK and remove stop words (English list plus 'great' and
    'realdonaldtrump').

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing value) is treated as empty.

    Returns:
        The remaining tokens joined by single spaces; ``''`` for non-string
        input.
    """
    # Missing values would make re.sub raise TypeError, so short-circuit them.
    if not isinstance(text, str):
        return ''

    # Remove RT (retweet): every word that contains "RT" is removed, which also
    # hits ordinary upper-case words such as "NORTH".
    text = re.sub(r'\b\w*RT\w*\b', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove punctuation, symbols and special characters. Digits and
    # underscores count as word characters (\w), so they are kept.
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize text
    words = word_tokenize(text)

    # Build the stop-word set once and reuse it: this function runs once per
    # row via DataFrame.apply, and rebuilding the set on each call is wasted work.
    stop_words = getattr(clean_text, '_stop_words', None)
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        stop_words.update(['great', 'realdonaldtrump'])
        clean_text._stop_words = stop_words

    # Drop stop words (case-insensitively) and join the rest back into a string
    return ' '.join(word for word in words if word.lower() not in stop_words)



# Add a cleaned version of every tweet's text as a new column, then re-check the frame.
Trump_df["cleaned_text"] = Trump_df["text"].apply(clean_text)
print(Trump_df.shape)
Trump_df.head()

In [None]:
# Fit BERTopic on the cleaned tweets using the "all-MiniLM-L6-v2" embedding model.
# NOTE(review): no random seed is fixed in this cell, so topic ids and sizes may
# differ between runs — confirm, and pin one if reproducibility matters.
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")
docs = Trump_df["cleaned_text"].to_list()
topics, probs = topic_model.fit_transform(docs)

# Store each tweet's assigned topic id and count the tweets per topic.
Trump_df['topic'] = topics
Trump_df['topic'].value_counts()


In [None]:
# Summary table of the fitted topics.
topic_model.get_topic_info()

### CASE1: reduce topics

In [None]:
## CASE1: reduce topics
# No target topic count is passed, so the library default applies.
# NOTE(review): `.topics_` and `.get_topic_info()` are called on the return value, so it
# is a model object. Recent BERTopic releases appear to update `topic_model` in place and
# return it, which would make both names refer to the same model — confirm for the
# installed version.
reduced_new_topics = topic_model.reduce_topics(docs)
# Length of the assignment list (presumably one entry per document), then the size of each topic.
print(len(reduced_new_topics.topics_))
pd.DataFrame(reduced_new_topics.topics_).value_counts()

In [None]:
# Summary table of the topics after reduction.
reduced_new_topics.get_topic_info()

In [None]:
import plotly.io as pio
# Make 'iframe' the default renderer for Plotly figures shown below.
pio.renderers.default='iframe'

In [None]:
# Build the topic overview figure for the reduced model and display it.
fig = reduced_new_topics.visualize_topics()
fig.show()

In [None]:
# Bar-chart view for the values 0 through 9 (presumably topic ids — confirm).
reduced_new_topics.visualize_barchart(range(0, 10)).show()

In [None]:
# Track how often each topic occurs over time, using 20 time bins.
# The timestamps have to line up one-to-one with `docs`, which was built from
# Trump_df["cleaned_text"], so they are taken from that same frame. (The previous
# version read them from `df`, a name that is never defined at notebook level.)
# NOTE(review): assumes Trump_df has a 'date' column — confirm against the CSV.
topics_over_time = topic_model.topics_over_time(docs=docs,
                                                timestamps=Trump_df['date'].to_list(),
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)

topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

In [None]:
# Per-document table returned by the model. It is stored under its own name so that
# `docs` remains the list of cleaned tweets and this cell can be re-run safely
# (previously `docs` was overwritten with the resulting table).
document_info = topic_model.get_document_info(docs)
document_info

In [None]:
# Plot the gradient of each topic's frequency over time for topic ids 1..15.
fig, ax = plt.subplots(figsize=(12, 8))

for topic_id in range(1, 15 + 1):
    topic_data = topics_over_time[topics_over_time['Topic'] == topic_id]

    # np.gradient needs at least two points; skip topics that are missing or
    # appear in a single time bin instead of aborting the whole plot.
    if len(topic_data) < 2:
        continue

    gradient = np.gradient(topic_data['Frequency'].to_numpy())  # gradient of the frequency
    ax.plot(topic_data['Timestamp'], gradient, label=f'Topic {topic_id}')

ax.set_title('Frequency Gradient per Topic over Time')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Frequency Gradient')
# The per-topic labels are only visible once a legend is drawn.
if ax.get_legend_handles_labels()[0]:
    ax.legend()
plt.show()


### Frequency and Gradient of Topics

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_total_frequency_with_gradient(topics_over_time, top_n_topics=20):
    """Plot the summed topic frequency per timestamp together with its gradient.

    Args:
        topics_over_time: DataFrame with 'Topic', 'Timestamp' and 'Frequency'
            columns.
        top_n_topics: Frequencies of topic ids 1..top_n_topics (inclusive) are
            summed.
            NOTE(review): ids start at 1, so topic 0 is never included —
            confirm this is intended for a "top n" selection.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Sum the frequencies of the selected topics for every timestamp.
    # groupby returns the timestamps in ascending order.
    wanted = topics_over_time['Topic'].isin(range(1, top_n_topics + 1))
    df = (
        topics_over_time[wanted]
        .groupby('Timestamp', as_index=False)['Frequency'].sum()
        .rename(columns={'Frequency': 'Total Frequency'})
    )

    fig, ax = plt.subplots(figsize=(12, 8))

    # Labelled so that the legend lists this series as well as the gradient.
    ax.plot(df['Timestamp'], df['Total Frequency'], label='Total Frequency')

    # np.gradient requires at least two samples; with fewer there is no slope to draw.
    if len(df) >= 2:
        gradient = np.gradient(df['Total Frequency'].to_numpy())
        ax.plot(df['Timestamp'], gradient, label='Gradient', linestyle='--')

    ax.set_title('Total Frequency and Gradient of Topics over Time')
    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Total Frequency / Gradient')
    ax.legend()
    plt.show()

# Call the function above: it sums the topic frequencies per timestamp, computes the gradient and plots both.
plot_total_frequency_with_gradient(topics_over_time)


### Tweets + SPY open value & volume

In [None]:
# Merge the topic frequencies with the per-date open_avg / volume data and plot them
def plot_total_frequency_with_spy(topics_over_time, daily_data, top_n_topics=20):
    """Plot summed topic frequency and its gradient against SPY volume and open average.

    Relies on the notebook-level helper ``find_closest_date``, which must be
    defined before this function is called.

    Args:
        topics_over_time: DataFrame with 'Topic', 'Timestamp' and 'Frequency'
            columns.
        daily_data: DataFrame with 'Date', 'volume' and 'open_avg' columns.
        top_n_topics: Frequencies of topic ids 1..top_n_topics (inclusive) are
            summed.
            NOTE(review): ids start at 1, so topic 0 is never included —
            confirm this is intended for a "top n" selection.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Sum the frequencies of the selected topics for every timestamp.
    # groupby returns the timestamps in ascending order.
    wanted = topics_over_time['Topic'].isin(range(1, top_n_topics + 1))
    df = (
        topics_over_time[wanted]
        .groupby('Timestamp', as_index=False)['Frequency'].sum()
        .rename(columns={'Frequency': 'Total Frequency'})
    )

    # Attach the SPY row of the same date, or of the closest date found by the helper.
    df['Date'] = pd.to_datetime(df['Timestamp']).dt.date
    date_set = set(daily_data['Date'])
    df['Closest Date'] = df['Date'].apply(lambda x: find_closest_date(x, date_set))

    merged_df = pd.merge(df, daily_data, left_on='Closest Date', right_on='Date', how='left')

    # One figure only: an extra plt.figure() call here used to leave an empty
    # figure behind in the notebook output.
    fig, ax1 = plt.subplots(figsize=(14, 8))

    ax1.plot(merged_df['Timestamp'], merged_df['Total Frequency'], color='tab:blue', label='Total Frequency')
    ax1.set_xlabel('Timestamp')
    ax1.set_ylabel('Total Frequency', color='tab:blue')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    # Gradient of the total frequency; np.gradient needs at least two samples.
    if len(merged_df) >= 2:
        gradient = np.gradient(merged_df['Total Frequency'].to_numpy())
        ax1.plot(merged_df['Timestamp'], gradient, color='tab:orange', linestyle='--', label='Gradient')

    ax2 = ax1.twinx()
    ax2.bar(merged_df['Timestamp'], merged_df['volume'], color='tab:red', alpha=0.1, label='Volume', width=3)
    ax2.set_ylabel('Volume', color='tab:red')
    ax2.tick_params(axis='y', labelcolor='tab:red')

    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))  # keep this axis apart from ax2's
    ax3.plot(merged_df['Timestamp'], merged_df['open_avg'], color='tab:green', label='SPY Open Avg')
    ax3.set_ylabel('SPY Open Avg', color='tab:green')
    ax3.tick_params(axis='y', labelcolor='tab:green')

    # Set the title before tight_layout so the layout leaves room for it.
    ax1.set_title('Total Frequency, SPY Open Average, and Volume over Time')
    fig.tight_layout()
    fig.legend(loc='upper left')
    plt.show()



# Call the function
# NOTE: `daily_data` and `find_closest_date` (used inside the function) are first defined
# in the next code cell of this notebook, so that cell has to be run before this call works.
plot_total_frequency_with_spy(topics_over_time, daily_data)

### Gradient 끼리만

In [None]:

# NOTE(review): `spy_data` is not created in any of the cells shown here; it has to be
# loaded before this cell runs.

# Convert the 'timestamp' column to datetimes and derive each row's calendar date
spy_data['Timestamp'] = pd.to_datetime(spy_data['timestamp'])
spy_data['Date'] = spy_data['Timestamp'].dt.date

# One row per date: first/last open, highest high, lowest low and total volume
daily_data = spy_data.groupby('Date').agg(
    open_first=('open', 'first'),
    open_last=('open', 'last'),
    high=('high', 'max'),
    low=('low', 'min'),
    volume=('volume', 'sum'),
)

# Average of the day's first and last open value
daily_data['open_avg'] = daily_data.loc[:, ['open_first', 'open_last']].mean(axis=1)

# Turn the 'Date' index back into an ordinary column
daily_data = daily_data.reset_index()

# Find the closest date that has SPY data when the requested date has none
def find_closest_date(target_date, date_set):
    """Return ``target_date`` if it is in ``date_set``, else the nearest earlier date that is.

    Args:
        target_date: ``datetime.date`` to look up.
        date_set: Set of ``datetime.date`` values for which data exists.

    Returns:
        The matching date, found by stepping back one day at a time.

    Raises:
        ValueError: If ``date_set`` is empty or holds no date on or before
            ``target_date``.
    """
    # Imported here because the notebook's import cell does not provide timedelta.
    from datetime import timedelta

    if not date_set:
        raise ValueError('date_set is empty')

    # Lower bound for the backwards walk; without it the loop would keep
    # stepping back until the date type overflows.
    earliest = min(date_set)
    one_day = timedelta(days=1)

    while target_date not in date_set:
        if target_date < earliest:
            raise ValueError('no date on or before the requested date exists in date_set')
        target_date -= one_day
    return target_date

# Merge the topic frequencies with the per-date open_avg / volume data and plot their gradients
def plot_gradients_with_spy(topics_over_time, daily_data, top_n_topics=20):
    """Plot mean-centred gradients of SPY open average, topic frequency and SPY volume.

    Each series is drawn on its own y-axis, and every axis is made symmetric
    around zero so the zero lines coincide.

    Relies on the notebook-level helper ``find_closest_date``.

    Args:
        topics_over_time: DataFrame with 'Topic', 'Timestamp' and 'Frequency'
            columns.
        daily_data: DataFrame with 'Date', 'open_avg' and 'volume' columns.
        top_n_topics: Frequencies of topic ids 1..top_n_topics (inclusive) are
            summed.
            NOTE(review): ids start at 1, so topic 0 is never included —
            confirm this is intended for a "top n" selection.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Raises:
        ValueError: If fewer than two timestamps are available, since no
            gradient can be computed then.
    """
    # Sum the frequencies of the selected topics for every timestamp.
    # groupby returns the timestamps in ascending order.
    wanted = topics_over_time['Topic'].isin(range(1, top_n_topics + 1))
    df = (
        topics_over_time[wanted]
        .groupby('Timestamp', as_index=False)['Frequency'].sum()
        .rename(columns={'Frequency': 'Total Frequency'})
    )
    if len(df) < 2:
        raise ValueError('at least two timestamps are required to compute gradients')

    # Attach the SPY row of the same date, or of the closest date found by the helper.
    df['Date'] = pd.to_datetime(df['Timestamp']).dt.date
    date_set = set(daily_data['Date'])
    df['Closest Date'] = df['Date'].apply(lambda x: find_closest_date(x, date_set))

    merged_df = pd.merge(df, daily_data, left_on='Closest Date', right_on='Date', how='left')

    # Absolute change of open_avg across a window of up to 9 rows centred on each
    # row. Near the edges the window is shorter, so the change is scaled up by
    # 9 / window length. Plain arrays make the row access explicitly positional.
    open_avg = merged_df['open_avg'].to_numpy()
    n_rows = len(open_avg)
    open_avg_gradient = np.zeros(n_rows)
    for i in range(n_rows):
        start_idx = max(0, i - 4)
        end_idx = min(n_rows, i + 5)
        window = end_idx - start_idx
        change = np.abs(open_avg[end_idx - 1] - open_avg[start_idx])
        open_avg_gradient[i] = change * (9 / window)

    gradient = np.gradient(merged_df['Total Frequency'].to_numpy())
    volume_gradient = np.gradient(merged_df['volume'].to_numpy())

    # The plotted series are the mean-centred versions.
    open_centered = open_avg_gradient - np.mean(open_avg_gradient)
    frequency_centered = gradient - np.mean(gradient)
    volume_centered = volume_gradient - np.mean(volume_gradient)

    def _set_symmetric_ylim(ax, values):
        # Limits come from the values actually drawn; deriving them from the
        # uncentred data could cut off part of the centred line.
        limit = np.nanmax(np.abs(values))
        if np.isfinite(limit) and limit > 0:
            ax.set_ylim(-limit, limit)

    # One figure only: an extra plt.figure() call here used to leave an empty
    # figure behind in the notebook output.
    fig, ax1 = plt.subplots(figsize=(14, 8))

    ax1.plot(merged_df['Timestamp'], open_centered, color='tab:purple', linestyle='--', label='Open Avg Gradient')
    ax1.set_xlabel('Timestamp')
    ax1.set_ylabel('Open Avg Gradient', color='tab:purple')
    ax1.tick_params(axis='y', labelcolor='tab:purple')

    ax2 = ax1.twinx()
    ax2.plot(merged_df['Timestamp'], frequency_centered, color='tab:orange', linestyle='--', label='Frequency Gradient')
    ax2.set_ylabel('Frequency Gradient', color='tab:orange')
    ax2.tick_params(axis='y', labelcolor='tab:orange')

    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 120))
    ax3.plot(merged_df['Timestamp'], volume_centered, color='tab:cyan', linestyle='--', label='Volume Gradient')
    ax3.set_ylabel('Volume Gradient', color='tab:cyan')
    ax3.tick_params(axis='y', labelcolor='tab:cyan')

    # Align the zero position of all three y-axes.
    _set_symmetric_ylim(ax1, open_centered)
    _set_symmetric_ylim(ax2, frequency_centered)
    _set_symmetric_ylim(ax3, volume_centered)

    # Set the title before tight_layout so the layout leaves room for it.
    ax1.set_title('Open Avg Gradient, Frequency Gradient, and Volume Gradient over Time')
    fig.tight_layout()
    fig.legend(loc='upper left')
    plt.show()


# Call the function
plot_gradients_with_spy(topics_over_time, daily_data)


### Gradient(open avg로 모든 날의 gradient를 생략없이 합산)

In [None]:

# NOTE(review): `spy_data` is not created in any of the cells shown here; it has to be
# loaded before this cell runs.

# Convert the 'timestamp' column to datetimes and derive each row's calendar date
spy_data['Timestamp'] = pd.to_datetime(spy_data['timestamp'])
spy_data['Date'] = spy_data['Timestamp'].dt.date

# One row per date (the date stays in the index here): first/last open,
# highest high, lowest low and total volume
daily_data = spy_data.groupby('Date').agg(
    open_first=('open', 'first'),
    open_last=('open', 'last'),
    high=('high', 'max'),
    low=('low', 'min'),
    volume=('volume', 'sum'),
)

# Average of the day's first and last open value
daily_data['open_avg'] = daily_data.loc[:, ['open_first', 'open_last']].mean(axis=1)

# 'difference' column: absolute gap between the day's first and last open value
daily_data['difference'] = (daily_data['open_first'] - daily_data['open_last']).abs()

# Find the closest date that has SPY data when the requested date has none
def find_closest_date(target_date, date_set):
    """Return ``target_date`` if it is in ``date_set``, else the nearest earlier date that is.

    Args:
        target_date: ``datetime.date`` to look up.
        date_set: Set of ``datetime.date`` values for which data exists.

    Returns:
        The matching date, found by stepping back one day at a time.

    Raises:
        ValueError: If ``date_set`` is empty or holds no date on or before
            ``target_date``.
    """
    # Imported here because the notebook's import cell does not provide timedelta.
    from datetime import timedelta

    if not date_set:
        raise ValueError('date_set is empty')

    # Lower bound for the backwards walk; without it the loop would keep
    # stepping back until the date type overflows.
    earliest = min(date_set)
    one_day = timedelta(days=1)

    while target_date not in date_set:
        if target_date < earliest:
            raise ValueError('no date on or before the requested date exists in date_set')
        target_date -= one_day
    return target_date

# Merge the topic frequencies with the per-date 'difference' data and plot the gradients
def plot_gradients_with_spy(topics_over_time, daily_data, top_n_topics=20):
    """Plot windowed SPY open movement, topic-frequency gradient and SPY volume gradient.

    This definition replaces the function of the same name defined in an
    earlier cell of this notebook. Unlike that one, it reads the dates from
    the index of ``daily_data`` and sums the 'difference' column over a window
    instead of comparing the window's end points.

    Relies on the notebook-level helper ``find_closest_date``.

    Args:
        topics_over_time: DataFrame with 'Topic', 'Timestamp' and 'Frequency'
            columns.
        daily_data: DataFrame indexed by date, with 'difference' and 'volume'
            columns.
        top_n_topics: Frequencies of topic ids 1..top_n_topics (inclusive) are
            summed.
            NOTE(review): ids start at 1, so topic 0 is never included —
            confirm this is intended for a "top n" selection.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Raises:
        ValueError: If fewer than two timestamps are available, since no
            gradient can be computed then.
    """
    # Sum the frequencies of the selected topics for every timestamp.
    # groupby returns the timestamps in ascending order.
    wanted = topics_over_time['Topic'].isin(range(1, top_n_topics + 1))
    df = (
        topics_over_time[wanted]
        .groupby('Timestamp', as_index=False)['Frequency'].sum()
        .rename(columns={'Frequency': 'Total Frequency'})
    )
    if len(df) < 2:
        raise ValueError('at least two timestamps are required to compute gradients')

    # Attach the SPY row of the same date, or of the closest date found by the helper.
    df['Date'] = pd.to_datetime(df['Timestamp']).dt.date
    date_set = set(daily_data.index)
    df['Closest Date'] = df['Date'].apply(lambda x: find_closest_date(x, date_set))

    merged_df = pd.merge(df, daily_data, left_on='Closest Date', right_index=True, how='left')

    # Sum of 'difference' over a window of up to 9 rows centred on each row.
    # Near the edges the window is shorter, so the sum is scaled up by
    # 9 / window length. .iloc makes the window explicitly positional.
    difference = merged_df['difference']
    n_rows = len(merged_df)
    open_avg_gradient = np.zeros(n_rows)
    for i in range(n_rows):
        start_idx = max(0, i - 4)
        end_idx = min(n_rows, i + 5)
        window = end_idx - start_idx
        window_sum = difference.iloc[start_idx:end_idx].sum()
        open_avg_gradient[i] = window_sum * (9 / window)

    gradient = np.gradient(merged_df['Total Frequency'].to_numpy())
    volume_gradient = np.gradient(merged_df['volume'].to_numpy())

    # One figure only: an extra plt.figure() call here used to leave an empty
    # figure behind in the notebook output.
    fig, ax1 = plt.subplots(figsize=(14, 8))

    ax1.plot(merged_df['Timestamp'], open_avg_gradient, color='tab:purple', linestyle='--', label='Open Avg Gradient')
    ax1.set_xlabel('Timestamp')
    ax1.set_ylabel('Open Avg Gradient', color='tab:purple')
    ax1.tick_params(axis='y', labelcolor='tab:purple')

    ax2 = ax1.twinx()
    ax2.plot(merged_df['Timestamp'], gradient, color='tab:orange', linestyle='--', label='Frequency Gradient')
    ax2.set_ylabel('Frequency Gradient', color='tab:orange')
    ax2.tick_params(axis='y', labelcolor='tab:orange')

    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 120))
    ax3.plot(merged_df['Timestamp'], volume_gradient, color='tab:cyan', linestyle='--', label='Volume Gradient')
    ax3.set_ylabel('Volume Gradient', color='tab:cyan')
    ax3.tick_params(axis='y', labelcolor='tab:cyan')

    # Set the title before tight_layout so the layout leaves room for it.
    ax1.set_title('Open Avg Gradient, Frequency Gradient, and Volume Gradient over Time')
    fig.tight_layout()
    fig.legend(loc='upper left')
    plt.show()

# Call the function
# NOTE: this uses the `daily_data` built in this cell, which keeps the dates in its index.
plot_gradients_with_spy(topics_over_time, daily_data)
