# Media analysis

### Notebook description

This notebook aims to analyze the correlation between all the media of other notebooks and Bitcoin price over years.

### Data overview

The dataset used in the notebook's charts is the result of a merge of numerous public datasets and automated crawling, those datasets' sources are listed in the README file.

Each content has been classified using fast-classifier from Flair framework (read the README for details about the license).

## Basic Analysis

To achieve uniformity between charts, a project-wise colors palette has been used

In [1]:
from palette import palette
base_path = './Datasets/'

In [2]:
news_daily_csv = base_path + 'news_daily_info.csv'
reddit_daily_csv = base_path + 'reddit_daily_info.csv'
twitter_daily_csv = base_path + 'twitter_daily_info.csv'
telegram_daily_csv = base_path + 'telegram_daily_info.csv'

In [3]:
import pandas as pd

news_raw_df = pd.read_csv(news_daily_csv)
reddit_raw_df = pd.read_csv(reddit_daily_csv)
twitter_raw_df = pd.read_csv(twitter_daily_csv)
telegram_raw_df = pd.read_csv(telegram_daily_csv)

dfs = [news_raw_df, reddit_raw_df, twitter_raw_df, telegram_raw_df]
_ = [df.dropna(inplace=True) for df in dfs]

In [4]:
def avg_sentiment(group) -> float:
    total = sum(group['count'])
    sentiment = sum(group['count'] * group['signed_score'])/total
    return sentiment

def score_to_label(score) -> str:
    if score == 0:
        return 'NEUTRAL'
    return 'POSITIVE' if score > 0 else 'NEGATIVE'

def normalize(value: float, range_min: float, range_max: float) -> float:
    return (value-range_min)/(range_max-range_min)

def normalize_series(series, series_min=None, series_max=None) -> pd.Series:
    if series_min is None:
        series_min = min(series)
        
    if series_max is None:
        series_max = max(series)
    return series.apply(lambda x: normalize(x, series_min, series_max))

In [5]:
for df in dfs:
    df['label'] = df['label'].apply(lambda x: x.replace('"', ''))
    df['signed_score'] = df['conf'] * df['label'].apply(lambda x: 1 if x == 'POSITIVE' else -1)

Common dates range is calculated intersecting the market dataframe with news one

In [6]:
market_daily_csv = base_path + '/market_daily_info.csv'
market_dates = pd.read_csv(market_daily_csv).dropna()['date']

dates_min = max([min(market_dates), max([min(df['date']) for df in dfs])])
dates_max = min([max(market_dates), min([max(df['date']) for df in dfs])])

dates = market_dates

for df in dfs:
    dates = pd.concat([dates, df['date']])
    
dates = dates.drop_duplicates().sort_values()
dates = dates[(dates_min <= dates) & (dates <= dates_max)]


In [7]:
news_raw_df = news_raw_df[(news_raw_df['date'] >= dates_min) & (news_raw_df['date'] <= dates_max)]
reddit_raw_df = reddit_raw_df[(reddit_raw_df['date'] >= dates_min) & (reddit_raw_df['date'] <= dates_max)]
twitter_raw_df = twitter_raw_df[(twitter_raw_df['date'] >= dates_min) & (twitter_raw_df['date'] <= dates_max)]
telegram_raw_df = telegram_raw_df[(telegram_raw_df['date'] >= dates_min) & (telegram_raw_df['date'] <= dates_max)]

In [8]:
market_daily_csv = base_path+ 'market_daily_info.csv'
market_raw_df = pd.read_csv(market_daily_csv)
market_raw_df = market_raw_df.dropna()
market_df = pd.DataFrame(dates, columns=['date'])
market_df = market_df.merge(market_raw_df, on='date')
market_df['mid_price'] = (market_df['high'] + market_df['low'])/2
market_df['norm_mid_price'] = normalize_series(market_df['mid_price'])
market_df = market_df[(dates_min <= market_df['date']) & (market_df['date']<= dates_max)]

In [9]:
def to_wide(df: pd.DataFrame) -> df:
    date_grouped = df.groupby('date')
    temp = pd.DataFrame(index=df['date'].drop_duplicates())
    temp['sentiment'] = date_grouped.apply(avg_sentiment)
    temp['normalized_sentiment'] = normalize_series(temp['sentiment'], -1, 1)
    temp['label'] = temp['sentiment'].apply(score_to_label)
    temp['count'] = date_grouped.apply(lambda x: sum(x['count']))
    temp['normalized_count'] = normalize_series(temp['count'], 0)
    negatives = df[df['label'] == 'NEGATIVE'][['date', 'count']]
    negatives.columns= ['date', 'negatives']
    positives = df[df['label'] == 'POSITIVE'][['date', 'count']]
    positives.columns= ['date', 'positives']
    temp = temp.merge(negatives, on='date')
    temp = temp.merge(positives, on='date')
    return temp

                                       
news_wide = to_wide(news_raw_df)
reddit_wide = to_wide(reddit_raw_df)
twitter_wide = to_wide(twitter_raw_df)
telegram_wide = to_wide(telegram_raw_df)

In [10]:
import altair as alt

alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [11]:
summary = pd.DataFrame(dates, columns=['date'])

news_summary = news_wide[['date', 'sentiment', 'normalized_count', 'count']]
news_summary.columns = ['date', 'news_sentiment', 'news_normalized_count', 'news_count']

reddit_summary = reddit_wide[['date', 'sentiment', 'normalized_count', 'count']]
reddit_summary.columns = ['date', 'reddit_sentiment', 'reddit_normalized_count', 'reddit_count']

twitter_summary = twitter_wide[['date', 'sentiment', 'normalized_count', 'count']]
twitter_summary.columns = ['date', 'twitter_sentiment', 'twitter_normalized_count', 'twitter_count']

telegram_summary = telegram_wide[['date', 'sentiment', 'normalized_count', 'count']]
telegram_summary.columns = ['date', 'telegram_sentiment', 'telegram_normalized_count', 'telegram_count']

summary = summary.merge(news_summary, on='date')
summary = summary.merge(reddit_summary, on='date')
summary = summary.merge(twitter_summary, on='date')
summary = summary.merge(telegram_summary, on='date')
summary = summary.fillna(0).sort_values(by='date')
summary = summary[summary['twitter_count'] < 100000] #remove twitter outlier (spammers)
summary['sentiment'] = (summary['news_sentiment'] + summary['reddit_sentiment'] + summary['twitter_sentiment'] + summary['telegram_sentiment'])/4
summary['norm_sent'] = normalize_series(summary['sentiment'], -1, 1)
summary['label'] = summary['sentiment'].apply(lambda x: 'POSITIVE' if x>0 else 'NEGATIVE')
summary['count'] = summary['news_count'] + summary['reddit_count'] + summary['twitter_count'] + summary['telegram_count']
summary = summary.dropna()
market_df = market_df[(market_df['date'] >= min(summary['date'])) & (market_df['date'] <= max(summary['date']))]

### Weekly volume

In [12]:
news_summary.loc[:,'month'] = news_summary.loc[:,'date'].apply(lambda x: x[:-3])
reddit_summary.loc[:,'month'] = reddit_summary.loc[:,'date'].apply(lambda x: x[:-3])
twitter_summary.loc[:,'month'] = twitter_summary.loc[:,'date'].apply(lambda x: x[:-3])
telegram_summary.loc[:,'month'] = telegram_summary.loc[:,'date'].apply(lambda x: x[:-3])

news_weekly_volume = news_summary[['month', 'news_count']].groupby(by='month', as_index=False).mean()
reddit_weekly_volume = reddit_summary[['month', 'reddit_count']].groupby(by='month', as_index=False).mean()
twitter_weekly_volume = twitter_summary[['month', 'twitter_count']].groupby(by='month', as_index=False).mean()
telegram_weekly_volume = telegram_summary[['month', 'telegram_count']].groupby(by='month', as_index=False).mean()

plot_title = alt.TitleParams('News weekly volume', subtitle='Average volume per week')
news_chart = alt.Chart(news_weekly_volume, title=plot_title).mark_area().encode(alt.X('yearmonth(month):T', title='Date'),
                                                                                 alt.Y('news_count', title='Volume'),
                                                                                 color=alt.value(palette['news']))

plot_title = alt.TitleParams('Reddit weekly volume', subtitle='Average volume per week')
reddit_chart = alt.Chart(reddit_weekly_volume, title=plot_title).mark_area().encode(alt.X('yearmonth(month):T', title='Date'),
                                                                                 alt.Y('reddit_count', title='Volume'),
                                                                                 color=alt.value(palette['reddit']))

plot_title = alt.TitleParams('Twitter weekly volume', subtitle='Average volume per week')
twitter_chart = alt.Chart(twitter_weekly_volume, title=plot_title).mark_area().encode(alt.X('yearmonth(month):T', title='Date'),
                                                                                 alt.Y('twitter_count', title='Volume'),
                                                                                 color=alt.value(palette['twitter']))

plot_title = alt.TitleParams('Telegram weekly volume', subtitle='Average volume per week')
telegram_chart = alt.Chart(telegram_weekly_volume, title=plot_title).mark_area().encode(alt.X('yearmonth(month):T', title='Date'),
                                                                                 alt.Y('telegram_count', title='Volume'),
                                                                                 color=alt.value(palette['telegram']))

(news_chart | reddit_chart) & (twitter_chart | telegram_chart)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


To find which time ranges have fewer data, it's better tu use a staked chart

In [13]:
long_weekly_volume = news_weekly_volume
long_weekly_volume = long_weekly_volume.merge(reddit_weekly_volume, on='month', how='outer')
long_weekly_volume = long_weekly_volume.merge(twitter_weekly_volume, on='month', how='outer')
long_weekly_volume = long_weekly_volume.merge(telegram_weekly_volume, on='month', how='outer')
long_weekly_volume = long_weekly_volume.dropna()
long_weekly_volume.columns = ['month', 'news', 'Reddit', 'Twitter', 'Telegram']
long_weekly_volume = long_weekly_volume.melt('month', value_name='volume', var_name='Media')

media = ['news', 'Reddit', 'Twitter', 'Telegram']

colors = alt.Color('Media', scale=alt.Scale(domain=media, 
                                            range=[palette[media.lower()] for media in media]))

legend_selector = selection = alt.selection_multi(fields=['Media'], bind='legend')

plot_title = alt.TitleParams('Weekly volume', subtitle='Average volume per week')
stacked = alt.Chart(long_weekly_volume, title=plot_title).mark_area().encode(
    x=alt.X('yearmonth(month):T', title='Date'),
    y=alt.Y('volume', title='Volume'),
    color=colors,
    opacity=alt.condition(legend_selector, alt.value(1), alt.value(0.01)),
).add_selection(legend_selector)



stacked

Since news volume is on another scale compared to others media, it's not visible in the chart.

### Basic data exploration

#### Sentiment

In [14]:
sent_rounded = summary[['norm_sent']].copy()
sent_rounded['norm_sent'] = sent_rounded['norm_sent'].apply(lambda x: round(x, 2))

alt.Chart(sent_rounded, title='Sentiment dispersion').mark_boxplot(color=palette['neutral_1']).encode(alt.X('norm_sent', title='Normalized sentiment')).properties(height=200)

Merging all the media this IQR is larger.

In [15]:
from scipy.stats import norm
import numpy as np

x = np.linspace(-5, 5, 1000)
df = pd.DataFrame({'x': x, 'y': norm.pdf(x)})
normal_distribution = alt.Chart(df).mark_line(color=palette['smooth_neutral']).encode(alt.X('x', title=None, axis=None),
                                                      alt.Y('y', title=None, axis=None))

sent_dist = sent_rounded.groupby('norm_sent', as_index=False).size()
sent_dist.columns = ['norm_sent', 'count']

(alt.Chart(sent_dist, title='Sentiment distribution').mark_area().encode(alt.X('norm_sent', title='Normalized sentiment'), alt.Y('count', title='Count'), color=alt.value(palette['neutral_1'])) + normal_distribution).resolve_scale(x='independent', y='independent')

The sentiment distribution is really similar to a normal distribution, so the Pearson correlation is appliable.

#### Volume

In [16]:
alt.Chart(summary, title='Volume dispersion').mark_boxplot(color=palette['neutral_1']).encode(alt.X('count', title=None)).properties(height=200)

Having an IQR far from 0, there are few days without data.

In [17]:
volumes = summary[['count']]

volumes_dist = volumes.groupby('count', as_index=False).size()
volumes_dist.columns = ['volume', 'count']

alt.Chart(volumes_dist, title='Volume distribution').mark_area().encode(alt.X('volume', title='Volume'), alt.Y('count', title='Count'), color=alt.value(palette['neutral_1']))

## Sentiment analysis

In [18]:
domain = [0, 1]
color_range = [palette['negative'], palette['positive']]

time_selector = alt.selection(type='interval', encodings=['x'])

gradient = alt.Color('norm_sent', scale=alt.Scale(domain=domain, range=color_range), title='Normalized sentiment')

price_chart = alt.Chart(market_df).mark_line(color=palette['strong_price']).encode(
    x=alt.X('yearmonthdate(date):T',
           scale=alt.Scale(domain=time_selector),
           title=None),
    y=alt.Y('mid_price', title='Mid price')
)
                      
plot_title = alt.TitleParams('Normalized sentiment vs Bitcoin price', subtitle='0:= negative, 1:= positive')
histogram = alt.Chart(summary, title=plot_title).mark_bar().encode(alt.X('yearmonthdate(date):T',
                                                       bin=alt.Bin(maxbins=100, extent=time_selector),
                                                       scale=alt.Scale(domain=time_selector),
                                                       axis=alt.Axis(labelOverlap='greedy', labelSeparation=6)),
                                                 alt.Y('norm_sent',
                                                      scale=alt.Scale(domain=[0,1]),
                                                      title='Normalized sentiment'),
                                                 color=gradient)
dummy_df = summary[['date']].copy()
dummy_df['value'] = 0.5

dummy_chart = alt.Chart(dummy_df).mark_line(color=palette['smooth_neutral']).encode(alt.X('yearmonthdate(date):T',
                                                       scale=alt.Scale(domain=time_selector),
                                                       axis=None),
                                                 alt.Y('value',
                                                      scale=alt.Scale(domain=[0,1]),
                                                      axis=None))


selection_plot = alt.Chart(summary).mark_bar().encode(alt.X('yearmonthdate(date):T',
                                                       bin=alt.Bin(maxbins=100),
                                                            title='Date',
                                                            axis=alt.Axis(labelOverlap='greedy', labelSeparation=6)),
                                                       alt.Y('norm_sent', title=None),
                                                       color=gradient).add_selection(time_selector).properties(height=50)

((histogram + dummy_chart + price_chart).resolve_scale(y='independent') & selection_plot).configure_axisRight(titleColor=palette['strong_price'])

Bar binning has some problems plotting the sentiment if it's in [-1, 1], for that reason the interactive version uses normalized sentiment and the static one uses original sentiment values.

In [19]:
domain = [-1, 1]

gradient = alt.Color('sentiment', scale=alt.Scale(domain=domain, range=color_range), title='Sentiment')

dummy_df = summary[['date']].copy()
dummy_df['value'] = 0

dummy_chart = alt.Chart(dummy_df).mark_line(color=palette['smooth_neutral']).encode(alt.X('yearmonthdate(date):T',
                                                       scale=alt.Scale(domain=time_selector)),
                                                 alt.Y('value',
                                                      scale=alt.Scale(domain=[-1,1]),
                                                      axis=None))

price_chart = alt.Chart(market_df).mark_line(color=palette['strong_price']).encode(
    x=alt.X('yearmonthdate(date):T'),
    y=alt.Y('mid_price', title='Mid price')
)

plot_title = alt.TitleParams('Static sentiment vs Bitcoin price', subtitle='-1:= negative, 1:= positive')
histogram = alt.Chart(summary, title=plot_title).mark_bar().encode(alt.X('yearmonthdate(date):T', title='Date'),
                                                 alt.Y('sentiment',
                                                      scale=alt.Scale(domain=[-1,1]),
                                                      title='Sentiment'),
                                                 color=gradient)




(histogram + dummy_chart + price_chart).resolve_scale(y='independent').configure_axisRight(titleColor=palette['strong_price'])

Plotting sentiment in [-1, 1] permit to understand immediately if price direction is the same as the sentiment one.

The general trend is towards 0, this means that the average sentiment of media is increasing.

In [20]:
summary

Unnamed: 0,date,news_sentiment,news_normalized_count,news_count,reddit_sentiment,reddit_normalized_count,reddit_count,twitter_sentiment,twitter_normalized_count,twitter_count,telegram_sentiment,telegram_normalized_count,telegram_count,sentiment,norm_sent,label,count
0,2015-10-28,0.037133,0.001838,2,-0.389571,0.061645,951,-0.803895,0.001010,331,-0.073420,0.000373,2,-0.307438,0.346281,NEGATIVE,1286
1,2015-11-03,0.058476,0.034007,37,-0.363241,0.101057,1559,-0.789405,0.001471,482,-0.002780,0.001119,6,-0.274238,0.362881,NEGATIVE,2084
2,2015-11-04,0.017482,0.034007,37,-0.409419,0.126856,1957,-0.803633,0.001718,563,-0.419466,0.000746,4,-0.403759,0.298121,NEGATIVE,2561
3,2015-11-05,-0.198915,0.040441,44,-0.420320,0.093408,1441,-0.842414,0.001923,630,-0.743590,0.001305,7,-0.551310,0.224345,NEGATIVE,2122
4,2015-11-06,-0.282049,0.031250,34,-0.348655,0.078304,1208,-0.817812,0.001431,469,0.191028,0.000373,2,-0.314372,0.342814,NEGATIVE,1713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2548,2019-11-08,-0.169603,0.030331,33,-0.357342,0.073767,1138,-0.060336,0.174412,57145,0.181239,0.024800,133,-0.101510,0.449245,NEGATIVE,58449
2549,2019-11-09,0.202578,0.036765,40,-0.409573,0.058274,899,-0.059944,0.142988,46849,0.259104,0.024986,134,-0.001959,0.499021,NEGATIVE,47922
2550,2019-11-10,0.084496,0.036765,40,-0.403994,0.056135,866,-0.109934,0.146977,48156,0.034920,0.015663,84,-0.098628,0.450686,NEGATIVE,49146
2551,2019-11-11,-0.306819,0.033088,36,-0.412823,0.065664,1013,-0.073760,0.184420,60424,-0.002829,0.027224,146,-0.199058,0.400471,NEGATIVE,61619


In [28]:
domain = [-1, 1]
range_ = ['red', 'green']

sent_summ = summary[['date', 'news_sentiment', 'reddit_sentiment', 'twitter_sentiment', 'telegram_sentiment']]

sent_summ.columns = ['date', 'news', 'Reddit', 'Twitter', 'Telegram']

sent_long_summ = sent_summ.melt('date', value_name='Sentiment', var_name='Media')

color = alt.Color('Sentiment', scale=alt.Scale(domain=domain, range=range_))

sentiment_heatmap = alt.Chart(sent_long_summ, title='Media sentiment comparison').mark_rect().encode(x=alt.X('yearmonth(date):O',
                                                                                 axis=alt.Axis(labelOverlap='greedy', labelAngle=0), title='Date'),
                                                     y=alt.Y('Media:O'),
                                                     color=color).properties(width=800)

sentiment_heatmap

As seen in the previous chart, the sentiment is increasing; but with this heatmap, we can see why.

Reddit and Telegram sentiments are slightly increasing, also the Twitter sentiment is increasing but moving from approximately -1 to values near 0.

News are the only "outliers", but it makes sense, news should reflect the truth (but, as seen in the dedicated notebook, it's not), and Bitcoin price goes up and down over time.

#### Correlation

To measure the correlation between sentiment and price, two approaches will be used:
- TLCC (Time Lagged Cross-Correlation): a measure of the correlation of the whole time series given a list of time offsets
- Windowed TLCC: the time series are lagged as in the first case, but the correlation is calculated for each window; this is useful to understand correlation "direction" (so the time series' roles) over time.

##### TLCC

In [22]:
methods = ['pearson', 'kendall', 'spearman']
offsets = list(range(-150, 151)) # list of days offset to test

correlations = []

sent_vs_price = pd.DataFrame(summary['date'], columns=['date'])
sent_vs_price['sent'] = summary['norm_sent']
sent_vs_price = sent_vs_price.merge(market_df[['date', 'mid_price']], on='date')

for method in methods:
    method_correlations = [(method, offset, sent_vs_price['sent'].corr(sent_vs_price['mid_price'].shift(-offset), method=method))
                           for offset in offsets]
    correlations.extend(method_correlations)
        
correlations_df = pd.DataFrame(correlations, columns=['method', 'offset', 'correlation'])

spearman_correlations = correlations_df[correlations_df['method'] == 'spearman']

max_corr = max(spearman_correlations['correlation'])
max_corr_offset = spearman_correlations[spearman_correlations['correlation'] == max_corr]['offset'].iloc[0]

min_corr = min(spearman_correlations['correlation'])
min_corr_offset = spearman_correlations[spearman_correlations['correlation'] == min_corr]['offset'].iloc[0]

max_corr_text = f'Max correlation ({round(max_corr, 3)}) with an offset of {max_corr_offset} days'
min_corr_text = f'Min correlation ({round(min_corr, 3)}) with an offset of {min_corr_offset} days'

plot_title = alt.TitleParams('Media sentiment correlations', subtitle=['Positive offset: looking future prices', max_corr_text, min_corr_text])
corr_chart = alt.Chart(correlations_df, title=plot_title).mark_line().encode(alt.X('offset', title='Offset days'),
                                                          alt.Y('correlation', title='Correlation'),
                                                          alt.Color('method', title='Method'))

corr_chart

In this case, Pearson correlation could be considered reliable, but its trend is similar to the others two.

Correlation is low, for this reason also the heatmap should be mostly orange/red

##### WTLCC

For semplicity, the next chart will visualiza WTLCC using Spearman correlation only.

In [23]:
from math import ceil

def get_window(series: pd.Series, window) -> pd.Series:
    return series.iloc[window[0]: window[1]]
    

def windowed_corr(first: pd.Series, second: pd.Series) -> list:
    windows = [(window * window_size, (window * window_size)+window_size) for window in range(ceil(len(second)/window_size))]
    windows_corr = [get_window(first, window).corr(get_window(second, window), method = 'spearman') for window in windows]
    return windows_corr, windows

offsets = list(range(-66, 67, 4)) # reduced offsets for better visualization
window_size = 120 # one window = one quarter

windowed_correlations = []

for offset in offsets:
    windows_corr, windows = windowed_corr(sent_vs_price['sent'], sent_vs_price['mid_price'].shift(-offset))
    for window, window_corr in enumerate(windows_corr):
        windowed_correlations.append((window, window_corr, offset))
    
    
windowed_correlations_df = pd.DataFrame(windowed_correlations, columns=['window', 'correlation', 'offset'])


plot_title = alt.TitleParams('Windowed lagged correlation sentiment/price', subtitle=['Positive offset: looking future prices',
                                                                                      '-1:= price as master, 1:= sentiment as master'])
color = alt.Color('correlation', scale=alt.Scale(domain=[-1, 1], range=[palette['negative'], palette['positive']]), title='Correlation')
alt.Chart(windowed_correlations_df, height=800, width=800, title=plot_title).mark_rect().encode(alt.X('window:O', title=f'Window ({window_size} days)'), alt.Y('offset:O', title='Offset days'), color)


As predictable, also the heatmap shows a correlation near 0 in most of the windows for most of the offsets.

## Volume analysis

Another aspect of data is the volume, in other words: is relevant that the people speak well or bad about Bitcoin or it's enough that people speak?

In [24]:
long_summary = summary[['date', 'news_count', 'reddit_count', 'twitter_count', 'telegram_count']]
long_summary = long_summary.dropna()
long_summary.columns = ['date', 'news', 'Reddit', 'Twitter', 'Telegram']
long_summary = long_summary.melt('date', value_name='Volume', var_name='Media')

In [25]:


media = ['news', 'Reddit', 'Twitter', 'Telegram']

colors = alt.Color('Media', scale=alt.Scale(domain=media, 
                                            range=[palette[media.lower()] for media in media]))

legend_selector = selection = alt.selection_multi(fields=['Media'], bind='legend')

time_selector = alt.selection(type='interval', encodings=['x'])

dummy_df = pd.DataFrame({'date': [min(summary['date']), max(summary['date'])], 'count': [0, 0]})
zero_line = alt.Chart(dummy_df).mark_line(color='grey').encode(x=alt.X('yearmonthdate(date):T'), y=alt.Y('count', title='Volume'))

price = alt.Chart(market_df).mark_line(color=palette['price']).encode(
    x=alt.X('yearmonthdate(date):T',
           scale=alt.Scale(domain=time_selector),
           title=None),
    y=alt.Y('mid_price', title='Mid price')
)
        
plot_title = alt.TitleParams('Volume vs Bitcoin price')

#histogram_reg = histogram.transform_regression('date', 'count', method='poly', order=9).mark_line(color=palette['strong_neutral_1'])

stacked = alt.Chart(long_summary).mark_bar().encode(
    x=alt.X('yearmonthdate(date):T',
           bin=alt.Bin(maxbins=100, extent=time_selector),
           scale=alt.Scale(domain=time_selector)),
    y=alt.Y('Volume'),
    color=colors,
    opacity=alt.condition(legend_selector, alt.value(1), alt.value(0.01))
).add_selection(legend_selector)

volume_chart = stacked  + zero_line

price_reg = price.transform_regression('date', 'mid_price', method='poly', order=9).mark_line(color=palette['strong_price'])


price_chart = price

selection_plot = alt.Chart(long_summary).mark_bar().encode(alt.X('yearmonthdate(date):T',
                                                       bin=alt.Bin(maxbins=100),
                                                            title='Date',
                                                            axis=alt.Axis(labelOverlap='greedy', labelSeparation=6)),
                                                       alt.Y('Volume', title=None),
                                                          color=colors,).add_selection(time_selector).properties(height=50)




(alt.layer(volume_chart, price_chart).resolve_scale(y='independent') & selection_plot).configure_axisRight(titleColor=palette['price'])

It's interesting to note that when the price is high, also the volume is high.

In addition, the rate Reddit/Twitter decreases over time, suggesting that the community left Reddit, or simply that the Reddit crawling rate was not consistent.

#### Correlation

##### TLCC

In [26]:
methods = ['pearson', 'kendall', 'spearman']
offsets = list(range(-150, 151)) # list of days offset to test

correlations = []

volume_vs_price = pd.DataFrame(summary['date'], columns=['date'])
volume_vs_price['volume'] = summary['count']
volume_vs_price = volume_vs_price.merge(market_df[['date', 'mid_price']], on='date')

for method in methods:
    method_correlations = [(method, offset, volume_vs_price['volume'].corr(sent_vs_price['mid_price'].shift(-offset), method=method))
                           for offset in offsets]
    correlations.extend(method_correlations)
        
correlations_df = pd.DataFrame(correlations, columns=['method', 'offset', 'correlation'])

spearman_correlations = correlations_df[correlations_df['method'] == 'spearman']

max_corr = max(spearman_correlations['correlation'])
max_corr_offset = spearman_correlations[spearman_correlations['correlation'] == max_corr]['offset'].iloc[0]

min_corr = min(spearman_correlations['correlation'])
min_corr_offset = spearman_correlations[spearman_correlations['correlation'] == min_corr]['offset'].iloc[0]

max_corr_text = f'Max correlation ({round(max_corr, 3)}) with an offset of {max_corr_offset} days'
min_corr_text = f'Min correlation ({round(min_corr, 3)}) with an offset of {min_corr_offset} days'

plot_title = alt.TitleParams('Media volume correlations', subtitle=['Positive offset: looking future prices', max_corr_text, min_corr_text])
corr_chart = alt.Chart(correlations_df, title=plot_title).mark_line().encode(alt.X('offset', title='Offset days'),
                                                          alt.Y('correlation', title='Correlation'),
                                                          alt.Color('method', title='Method'))

corr_chart

Correlations reach high good values, much better than sentiment correlations.
Interesting, despite the volume distribution, is not a normal distribution, the Pearson trend is, more or less, the same as the others two.

#### WLTCC

In [27]:
from math import ceil

def get_window(series: pd.Series, window) -> pd.Series:
    return series.iloc[window[0]: window[1]]
    

def windowed_corr(first: pd.Series, second: pd.Series) -> list:
    windows = [(window * window_size, (window * window_size)+window_size) for window in range(ceil(len(second)/window_size))]
    windows_corr = [get_window(first, window).corr(get_window(second, window), method = 'spearman') for window in windows]
    return windows_corr, windows

offsets = list(range(-66, 67, 4)) # reduced offsets for better visualization
window_size = 120 # one window = one quarter

windowed_correlations = []

for offset in offsets:
    windows_corr, windows = windowed_corr(volume_vs_price['volume'], volume_vs_price['mid_price'].shift(-offset))
    for window, window_corr in enumerate(windows_corr):
        windowed_correlations.append((window, window_corr, offset))
    
    
windowed_correlations_df = pd.DataFrame(windowed_correlations, columns=['window', 'correlation', 'offset'])


plot_title = alt.TitleParams('Windowed lagged correlation volume/price', subtitle=['Positive offset: looking future prices',
                                                                                   '-1:= price as master, 1:= sentiment as master'])
color = alt.Color('correlation', scale=alt.Scale(domain=[-1, 1], range=[palette['negative'], palette['positive']]), title='Correlation')
alt.Chart(windowed_correlations_df, height=800, width=800, title=plot_title).mark_rect().encode(alt.X('window:O', title=f'Window ({window_size} days)'), alt.Y('offset:O', title='Offset days'), color)


Through this heatmap, it can be seen that windows 0, 2, 10, 11, and 18 are the most problematic ones but the first windows are not significant due to the low data volume.