In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import plotly.express as px

import sys
sys.path.append('../')
from utils import get_comments2sentiments_per_minutes, bitcoin_sentiment_scatter_norm

In [2]:
bitcoin_data = pd.read_csv('bitcoin_data.csv')
bitcoin_data['bitcoin_norm'] = (bitcoin_data['bitcoin'] - bitcoin_data['bitcoin'].min()) / (bitcoin_data['bitcoin'].max() - bitcoin_data['bitcoin'].min())
comments_data = pd.read_csv('bitcoin_reddit_comments.csv')
sentiments = get_comments2sentiments_per_minutes(comments_data, minutes=10)
sentiments['compound_norm'] = (sentiments['compound_mean'] + 1 ) * 0.5

# Merge sentiments with bitcoin_data on the closest date
sentiments['date'] = pd.to_datetime(sentiments['date'])
bitcoin_data['date'] = pd.to_datetime(bitcoin_data['date'])

# Find the closest date in bitcoin_data for each date in sentiments
sentiments['closest_date'] = sentiments['date'].apply(lambda x: bitcoin_data.iloc[(bitcoin_data['date'] - x).abs().argsort()[:1]]['date'].values[0])

# Merge the dataframes on the closest date
sentiments = sentiments.merge(bitcoin_data[['date', 'bitcoin_norm']], left_on='closest_date', right_on='date', suffixes=('', '_bitcoin'))

# Drop the extra date columns
sentiments.drop(columns=['closest_date', 'date_bitcoin'], inplace=True)

In [3]:
comments_data.head()

Unnamed: 0,title,body,author,url,created_utc,upvotes,type,comments,neg,neu,pos,compound
0,OKX Referral Code : FEEKICKBACK 🤍 (15% Fee Dis...,OKX Referral Code : FEEKICKBACK\n\nRegister wi...,Ok_Conflict9643,https://www.reddit.com/r/referralcodes/comment...,2024-12-04 14:08:16,1,title,0.0,0.147,0.54,0.313,0.9538
1,"""Unlocking Value: Saylor on $MSFT 💰🔑""",. 🚨 BREAKING: 🇺🇸 Michael Saylor thinks Microso...,Warm-Echo5540,https://www.reddit.com/r/BitunixCrypto/comment...,2024-12-04 14:05:47,1,title,0.0,0.0,0.8,0.2,0.7901
2,Trezor vs Coldcard (x-post from /r/Bitcoin),,ASICmachine,https://www.reddit.com/r/Bitcoin/comments/1h6g...,2024-12-04 14:04:20,1,title,0.0,0.0,1.0,0.0,0.0
3,Paul Atkins as new SEC Chair!! (x-post from /r...,,ASICmachine,https://www.reddit.com/r/Bitcoin/comments/1h6g...,2024-12-04 14:04:20,1,title,0.0,0.0,1.0,0.0,0.0
4,LazyLambo (x-post from /r/Bitcoin),,ASICmachine,https://www.reddit.com/r/Bitcoin/comments/1h6g...,2024-12-04 14:04:20,1,title,0.0,0.0,1.0,0.0,0.0


In [4]:
bitcoin_data[['currency', 'bitcoin', 'date']].head()

Unnamed: 0,currency,bitcoin,date
0,usd,95539,2024-12-04 14:00:06
1,usd,95692,2024-12-04 14:10:06
2,usd,95640,2024-12-04 14:20:07
3,usd,95799,2024-12-04 14:30:06
4,usd,96340,2024-12-04 14:40:06


In [5]:
# Ensure 'created_utc' is in datetime format
comments_data['created_utc'] = pd.to_datetime(comments_data['created_utc'])

# Group by hour and calculate the mean number of comments
comments_per_hour = comments_data.set_index('created_utc').resample('H')['comments'].mean()

# Compute the average number of comments per hour
average_comments_per_hour = comments_per_hour.mean()

print(f"Average number of comments per hour: {average_comments_per_hour}")

Average number of comments per hour: 0.4725352805765616


  comments_per_hour = comments_data.set_index('created_utc').resample('H')['comments'].mean()


In [8]:
fig = px.scatter(sentiments, x='compound_norm', y='bitcoin_norm', title='Bitcoin price vs Sentiment', labels={'compound_norm': 'Sentiment', 'bitcoin_norm': 'Bitcoin price'})
fig.show()
# fig.update_traces(marker=dict(size=3))

In [9]:
fig = px.scatter(sentiments, x='date', y=['bitcoin_norm', 'compound_norm'], title='Bitcoin price vs Sentiment', labels={'compound_norm': 'Sentiment', 'bitcoin_norm': 'Bitcoin price'})
fig.update_traces(mode='lines')
fig.show()


In [10]:
sentiments['bitcoin_norm'].corr(sentiments['compound_norm'], method='pearson')

-0.03323810276784893

In [11]:
sentiments['bitcoin_norm'].corr(sentiments['compound_norm'], method='spearman')

-0.03438086852584593

In [12]:
sentiments['compound_norm'].corr(sentiments['bitcoin_norm'], method='spearman')

-0.034380868525845924

In [13]:
sentiments['compound_norm'].shift(15).corr(sentiments['bitcoin_norm'], method='pearson')

-0.012260883494005639

In [14]:
granger_results = grangercausalitytests(sentiments[['bitcoin_norm', 'compound_norm']], maxlag=30)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.4705  , p=0.2264  , df_denom=250, df_num=1
ssr based chi2 test:   chi2=1.4882  , p=0.2225  , df=1
likelihood ratio test: chi2=1.4838  , p=0.2232  , df=1
parameter F test:         F=1.4705  , p=0.2264  , df_denom=250, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.7669  , p=0.4656  , df_denom=247, df_num=2
ssr based chi2 test:   chi2=1.5648  , p=0.4573  , df=2
likelihood ratio test: chi2=1.5600  , p=0.4584  , df=2
parameter F test:         F=0.7669  , p=0.4656  , df_denom=247, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=2.9098  , p=0.0352  , df_denom=244, df_num=3
ssr based chi2 test:   chi2=8.9798  , p=0.0296  , df=3
likelihood ratio test: chi2=8.8229  , p=0.0317  , df=3
parameter F test:         F=2.9098  , p=0.0352  , df_denom=244, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=2.1602  , p=0.0741  

In [15]:
sentiments['compound_norm_shifted'] = sentiments['compound_norm'].shift(15)
fig = px.scatter(sentiments, x='date', y=['bitcoin_norm', 'compound_norm_shifted'], title='Bitcoin price vs Shifted Sentiment', labels={'compound_norm_shifted': 'Shifted Sentiment', 'bitcoin_norm': 'Bitcoin price'})
fig.update_traces(mode='lines')
fig.show()

In [19]:
# Group by day and sum the 'people' column
sentiments['people'] = sentiments[['positive_count','negative_count', 'neutral_count']].sum(axis=1)
daily_people = sentiments.groupby(sentiments['date'].dt.date)['people'].sum().reset_index()
# Plot the bar chart
fig = px.bar(daily_people, x='date', y='people', title='People per Day', color_discrete_sequence=['orange'], text='people')
fig.show()

In [22]:
sentiments['people_norm'] = (sentiments['people'] - sentiments['people'].min()) / (sentiments['people'].max() - sentiments['people'].min())
fig = px.scatter(sentiments, x='date', y=['bitcoin_norm', 'people_norm'], title='Bitcoin price vs Shifted Sentiment', labels={'compound_norm_shifted': 'Shifted Sentiment', 'bitcoin_norm': 'Bitcoin price'})
fig.update_traces(mode='lines')
fig.show()

In [None]:
import statsmodels.api as sm

model_with_people = sm.OLS()

In [None]:
px.scatter(sentiments, x='date', y=['people', 'bitcoin_norm'], title='People per Day')

In [47]:
# sentiments['compound_norm_shifted'].iloc[:20]

In [None]:
# Calculate Pearson and Spearman correlation
bitcoin_norm = bitcoin_data['bitcoin_norm']
correlation_pearson = bitcoin_norm.corr(sentiment_norm, method='pearson')
correlation_spearman = bitcoin_norm.corr(sentiment_norm, method='spearman')


In [54]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [60]:
X = sentiments['compound_norm']
y = sentiments['bitcoin_norm']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X = sm.add_constant(X)
X_train = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train).fit()

In [None]:
model.summary() 

In [None]:
# sentiments_shifted = sentiments.copy()
# sentiments_shifted.dropna(inplace=True)
# X = sentiments['compound_norm_shifted']
# y = sentiments_shifted['bitcoin_norm']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Assume `data` is your DataFrame with a time series index
train_size = int(len(sentiments) * 0.8)  # 80% for training, 20% for testing
train, test = sentiments[:train_size], sentiments[train_size:]
X_train = train['compound_norm']
y_train = train['bitcoin_norm'] 
X_test = test['compound_norm']
y_test = test['bitcoin_norm']


X_train_with_constant = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_with_constant).fit()
model.summary()

In [None]:
train_size = int(len(sentiments) * 0.8)  # 80% for training, 20% for testing
train, test = sentiments[:train_size], sentiments[train_size:]

X_train = train[['compound_norm', 'people']]
y_train = train['bitcoin_norm'] 
X_test = test[['compound_norm', 'people']]
y_test = test['bitcoin_norm']


X_train_with_constant = sm.add_constant(X_train)
modelp = sm.OLS(y_train, X_train_with_constant).fit()
modelp.summary()

In [75]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# X_train_reshaped = X_train.values.reshape(-1, 1)
rfr = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=5).fit(X_train, y_train)
rfr.score(X_train, y_train)

In [None]:
rfr.feature_importances_

In [None]:
rfr.score(X_test, y_test)

In [160]:
gbitcoin = pd.read_csv('bitcoin_google_search.csv')#['Category: All categories']
gbitcoin.rename(columns={'bitcoin': 'bitcoin_search'}, inplace=True)

In [None]:
# Convert 'Time' in gbitcoin to datetime if not already done
gbitcoin['Time'] = pd.to_datetime(gbitcoin['Time'])

# Find the closest date in sentiments for each Time in gbitcoin
gbitcoin['closest_date'] = gbitcoin['Time'].apply(lambda x: sentiments.iloc[(sentiments['date'] - x).abs().argsort()[:1]]['date'].values[0])

# Merge the dataframes on the closest date
merged_data = pd.merge_asof(gbitcoin.sort_values('closest_date'), sentiments.sort_values('date'), left_on='closest_date', right_on='date', direction='nearest')

# Drop the extra date columns if necessary
merged_data.drop(columns=['closest_date', 'date'], inplace=True)

print(merged_data)
# Forward fill missing values in the merged data
merged_data.ffill(inplace=True)

print(merged_data)

In [None]:
# Convert 'Time' in gbitcoin to datetime
gbitcoin['Time'] = pd.to_datetime(gbitcoin['Time'])

# Find the closest date in sentiments for each Time in gbitcoin
gbitcoin['closest_date'] = gbitcoin['Time'].apply(lambda x: sentiments.iloc[(sentiments['date'] - x).abs().argsort()[:1]]['date'].values[0])
# Forward fill missing values in the merged data
merged_data.ffill(inplace=True)
# Merge the dataframes on the closest date
merged_data = gbitcoin.merge(sentiments, left_on='closest_date', right_on='date', suffixes=('_gbitcoin', '_sentiments'))

# Drop the extra date columns
merged_data.drop(columns=['closest_date', 'date'], inplace=True)

print(merged_data)

In [27]:
gsearch = pd.read_csv('bitcoin_google_search.csv')
# Convert 'Time' in gsearch to datetime if not already done
gsearch['Time'] = pd.to_datetime(gsearch['Time'])

# Find the closest date in sentiments for each Time in gsearch
gsearch['closest_date'] = gsearch['Time'].apply(lambda x: sentiments.iloc[(sentiments['date'] - x).abs().argsort()[:1]]['date'].values[0])

# Merge the dataframes on the closest date
merged_gsearch_sentiments = pd.merge_asof(gsearch.sort_values('closest_date'), sentiments.sort_values('date'), left_on='closest_date', right_on='date', direction='nearest')

# Drop the extra date columns if necessary
merged_gsearch_sentiments.drop(columns=['closest_date', 'date'], inplace=True)

merged_gsearch_sentiments

Unnamed: 0,Time,bitcoin,positive_count,negative_count,neutral_count,compound_mean,compound_norm,bitcoin_norm,compound_norm_shifted,people,people_norm
0,2024-12-04 23:00:00,28,1,0,0,0.961000,0.980500,0.485444,,1,0.000000
1,2024-12-05 00:00:00,23,1,0,0,0.961000,0.980500,0.485444,,1,0.000000
2,2024-12-05 01:00:00,21,0,1,0,-0.897000,0.051500,0.448000,,1,0.000000
3,2024-12-05 02:00:00,48,1,0,0,0.361200,0.680600,0.458444,,1,0.000000
4,2024-12-05 03:00:00,100,1,0,0,0.361200,0.680600,0.458444,,1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
164,2024-12-11 15:00:00,24,1,0,2,0.106067,0.553033,0.433444,0.390233,3,0.030769
165,2024-12-11 14:00:00,21,1,0,2,0.106067,0.553033,0.433444,0.390233,3,0.030769
166,2024-12-11 22:00:00,29,1,0,2,0.106067,0.553033,0.433444,0.390233,3,0.030769
167,2024-12-11 17:00:00,25,1,0,2,0.106067,0.553033,0.433444,0.390233,3,0.030769


Unnamed: 0,positive_count,negative_count,neutral_count,compound_mean,date,compound_norm,bitcoin_norm,compound_norm_shifted,people,people_norm
0,5,0,4,0.322856,2024-12-04 14:10:00,0.661428,0.124111,,9,0.123077
1,2,1,2,0.132160,2024-12-04 15:20:00,0.566080,0.144556,,5,0.061538
2,1,0,0,0.531900,2024-12-04 16:10:00,0.765950,0.110333,,1,0.000000
3,4,3,3,0.132160,2024-12-04 16:30:00,0.566080,0.059667,,10,0.138462
4,1,0,0,0.334600,2024-12-04 17:20:00,0.667300,0.046111,,1,0.000000
...,...,...,...,...,...,...,...,...,...,...
249,8,1,2,0.484609,2024-12-11 11:10:00,0.742305,0.411222,0.362250,11,0.153846
250,0,1,0,-0.381800,2024-12-11 11:20:00,0.309100,0.429000,0.545388,1,0.000000
251,2,0,4,0.157267,2024-12-11 11:40:00,0.578633,0.419556,0.648000,6,0.076923
252,2,0,3,0.258900,2024-12-11 12:10:00,0.629450,0.424667,0.615740,5,0.061538


In [None]:
gbitcoin

In [None]:
sentiments

In [None]:
# Mock data creation for Bitcoin Norm and Sentiment Norm (replace with actual data)
np.random.seed(42)
dates = pd.date_range(start="2024-12-05", end="2024-12-11", freq='H')
bitcoin_norm = pd.Series(np.sin(np.linspace(0, 10, len(dates))) + np.random.normal(0, 0.1, len(dates)), index=dates)
sentiment_norm = pd.Series(np.cos(np.linspace(0, 10, len(dates))) + np.random.normal(0, 0.1, len(dates)), index=dates)

# Calculate Pearson and Spearman correlation
correlation_pearson = bitcoin_norm.corr(sentiment_norm, method='pearson')
correlation_spearman = bitcoin_norm.corr(sentiment_norm, method='spearman')

# Print Correlation results
print(f"Pearson Correlation: {correlation_pearson}")
print(f"Spearman Correlation: {correlation_spearman}")

# Lag Analysis
lagged_correlations = {}
for lag in range(-10, 11):  # Lags from -10 to +10
    shifted_sentiment = sentiment_norm.shift(lag)
    correlation = bitcoin_norm.corr(shifted_sentiment)
    lagged_correlations[lag] = correlation

# Plot lagged correlations
plt.figure(figsize=(10, 6))
plt.plot(list(lagged_correlations.keys()), list(lagged_correlations.values()), marker='o')
plt.axhline(0, color='gray', linestyle='--')
plt.title('Lagged Correlation between Bitcoin Norm and Sentiment Norm')
plt.xlabel('Lag')
plt.ylabel('Correlation')
plt.show()

# Granger Causality Test
data = pd.concat([bitcoin_norm, sentiment_norm], axis=1)
data.columns = ['Bitcoin', 'Sentiment']
granger_results = grangercausalitytests(data.dropna(), maxlag=10)

# Linear Regression Analysis
X = sentiment_norm.values.reshape(-1, 1)
y = bitcoin_norm.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Regression Results
print(f"Regression Coefficient: {model.coef_[0]}")
print(f"Intercept: {model.intercept_}")

# Plotting Regression
plt.figure(figsize=(10, 6))
plt.scatter(sentiment_norm, bitcoin_norm, alpha=0.5, label="Data Points")
plt.plot(sentiment_norm, model.predict(X), color='red', label="Regression Line")
plt.title("Regression Analysis: Bitcoin Norm vs Sentiment Norm")
plt.xlabel("Sentiment Norm")
plt.ylabel("Bitcoin Norm")
plt.legend()
plt.show()
