In [1]:
import pandas as pd

In [2]:
# Sentiment model whose predictions are aggregated; used as a folder name when
# reading the sentiment CSVs and as a prefix of the combined count CSV.
model = 'CRYPTOBERT'

In [3]:
# Kind of post to aggregate; it is part of both the input and output file names.
# Switch by swapping which of the two assignments is commented out.
# filetype =  'submission'
filetype =  'comment'

In [4]:
# Dataset folder for the coin being processed; it appears in the input
# (Data/Sentiment/...) and output (Data/Time_Series/...) paths.
# NOTE(review): the commented alternatives look paired with those of list_subs —
# change both together.
# crypto = 'crypto_bitcoin'
crypto = 'crypto_ethereum'
# crypto = 'crypto_xrp'

In [5]:
# Subreddits whose sentiment files are loaded; each name is used verbatim as the
# file-name prefix of one CSV.
# NOTE(review): the commented alternatives look paired with those of crypto —
# change both together.
# list_subs = ['bitcoin','bitcoinbeginners','bitcoinmarkets','bitcoinmining','btc']
list_subs = ['ethereum','ethermining','ethfinance','eth','ethtrader']
# list_subs = ['xrp','ripple']

In [6]:
# Read one sentiment CSV per subreddit for the selected crypto / model / file type.
dfs = [
    pd.read_csv(f'Data/Sentiment/{crypto}/{model}/{subreddit}_{filetype}_sentiment.csv')
    for subreddit in list_subs
]
    

In [7]:
# Stack the per-subreddit frames row-wise into a single frame.
final_df = pd.concat(objs=dfs, axis=0)

In [8]:
# Sanity check: list the distinct subreddit values present after concatenation.
final_df['subreddit'].unique()

array(['ethereum', 'EtherMining', 'ethfinance', 'eth', 'ethtrader'],
      dtype=object)

In [9]:
# Inspect the available columns; 'posted_on' and 'label' are the ones used below.
final_df.columns

Index(['author', 'body', 'id', 'parent_id', 'permalink', 'score', 'subreddit',
       'posted_on', 'label', 'score.1'],
      dtype='object')

In [10]:
# Parse 'posted_on' into datetimes and make it the index.
# The daily resample that follows groups on the index, so it has to be datetime-like.
final_df = (
    final_df
    .assign(posted_on=pd.to_datetime(final_df['posted_on']))
    .set_index('posted_on')
)

In [11]:
# Count how many posts carry each sentiment label per calendar day.
# 'D' is the daily frequency; use 'H' for hourly, 'M' for monthly, etc.
labels_by_day = final_df.resample('D')['label']
daily_label_counts = labels_by_day.value_counts()
# Pivot the label level into columns: one row per day, one column per label.
time_series = daily_label_counts.unstack()
# Drop the name carried over onto the column axis so it does not show up in output.
time_series.columns.name = None

In [12]:
# Preview the per-day label counts (indexed by 'posted_on' at this point).
time_series

Unnamed: 0_level_0,Negative,Neutral,Positive
posted_on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,16,245,85
2022-01-02,41,365,164
2022-01-03,36,360,136
2022-01-04,24,278,133
2022-01-05,40,322,144
...,...,...,...
2022-12-27,6,74,36
2022-12-28,12,97,28
2022-12-29,11,72,44
2022-12-30,4,40,10


In [13]:
# Move 'posted_on' out of the index and back into a regular column. Later cells
# select it by column name and write CSVs with index=False, so it must be a column.
time_series = time_series.reset_index()

# Show the resulting frame.
display(time_series)

Unnamed: 0,posted_on,Negative,Neutral,Positive
0,2022-01-01,16,245,85
1,2022-01-02,41,365,164
2,2022-01-03,36,360,136
3,2022-01-04,24,278,133
4,2022-01-05,40,322,144
...,...,...,...,...
360,2022-12-27,6,74,36
361,2022-12-28,12,97,28
362,2022-12-29,11,72,44
363,2022-12-30,4,40,10


In [14]:
# Count missing values per column before filling (a label that never occurs on a
# given day would be NaN after unstack()).
time_series.isna().sum()

posted_on    0
Negative     0
Neutral      0
Positive     0
dtype: int64

In [15]:
# Treat a label with no posts on a given day as a count of zero.
time_series = time_series.fillna(value=0)

In [16]:
# Re-check missing values per column after the fill; every count should be 0.
time_series.isna().sum()

posted_on    0
Negative     0
Neutral      0
Positive     0
dtype: int64

In [17]:
# Daily total = sum of the three per-label counts.
time_series['total_posts'] = (
    time_series['Positive']
    .add(time_series['Negative'])
    .add(time_series['Neutral'])
)
time_series

Unnamed: 0,posted_on,Negative,Neutral,Positive,total_posts
0,2022-01-01,16,245,85,346
1,2022-01-02,41,365,164,570
2,2022-01-03,36,360,136,532
3,2022-01-04,24,278,133,435
4,2022-01-05,40,322,144,506
...,...,...,...,...,...
360,2022-12-27,6,74,36,116
361,2022-12-28,12,97,28,137
362,2022-12-29,11,72,44,127
363,2022-12-30,4,40,10,54


### Saving the Count Time Series

In [18]:
# Write the full daily count table (all label columns plus the total) for the
# selected crypto, model and file type.
count_csv_path = f"Data/Time_Series/{crypto}/{model}_{filetype}_sentiment.csv"
time_series.to_csv(count_csv_path, index=False)

In [17]:
# Export one CSV per sentiment class: that class's daily count next to the daily total.
# This must run while the label columns still hold raw counts, i.e. before the
# proportion cell overwrites them with percentages.
label = ['positive','negative','neutral']
for name, col in zip(label, map(str.title, label)):
    # `col` is the capitalised column name ('Positive', ...); `name` stays lowercase for the file name.
    time_series_df = time_series.loc[:, ['posted_on', col, 'total_posts']]
    display(time_series_df)
    count_path = f"Data/Time_Series/{crypto}/{name}_{filetype}_count.csv"
    time_series_df.to_csv(count_path, index=False)

Unnamed: 0,posted_on,Positive,total_posts
0,2022-01-01,85,346
1,2022-01-02,164,570
2,2022-01-03,136,532
3,2022-01-04,133,435
4,2022-01-05,144,506
...,...,...,...
360,2022-12-27,36,116
361,2022-12-28,28,137
362,2022-12-29,44,127
363,2022-12-30,10,54


Unnamed: 0,posted_on,Negative,total_posts
0,2022-01-01,16,346
1,2022-01-02,41,570
2,2022-01-03,36,532
3,2022-01-04,24,435
4,2022-01-05,40,506
...,...,...,...
360,2022-12-27,6,116
361,2022-12-28,12,137
362,2022-12-29,11,127
363,2022-12-30,4,54


Unnamed: 0,posted_on,Neutral,total_posts
0,2022-01-01,245,346
1,2022-01-02,365,570
2,2022-01-03,360,532
3,2022-01-04,278,435
4,2022-01-05,322,506
...,...,...,...
360,2022-12-27,74,116
361,2022-12-28,97,137
362,2022-12-29,72,127
363,2022-12-30,40,54


### Saving the Proportion Time Series

In [21]:
# Convert each label's daily count into a percentage of that day's total posts,
# rounded to two decimals.
# The count columns are overwritten in place, so run this cell only once per
# kernel session: a second run would divide the percentages by the totals again.
for sentiment_col in ('Positive', 'Negative', 'Neutral'):
    share = time_series[sentiment_col].div(time_series['total_posts'])
    time_series[sentiment_col] = share.mul(100).round(2)
time_series

Unnamed: 0,posted_on,Negative,Neutral,Positive,total_posts
0,2022-01-01,7.23,51.81,40.96,83.0
1,2022-01-02,5.88,58.82,35.29,85.0
2,2022-01-03,8.85,53.10,38.05,113.0
3,2022-01-04,9.64,57.83,32.53,83.0
4,2022-01-05,4.65,65.89,29.46,129.0
...,...,...,...,...,...
360,2022-12-27,2.94,73.53,23.53,34.0
361,2022-12-28,11.76,55.88,32.35,68.0
362,2022-12-29,6.00,54.00,40.00,50.0
363,2022-12-30,4.69,62.50,32.81,64.0


In [22]:
# Export the 'Positive' column by day (a percentage once the proportion cell has run).
# The output folder is derived from `crypto`, as the count exports do. It was
# previously hard-coded to crypto_bitcoin, which wrote other coins' results into
# (and over) Bitcoin's file.
# NOTE(review): the file name has no `filetype` part, so submission and comment
# runs share one file — confirm that is intended.
time_series_positive = time_series[['posted_on','Positive']]
display(time_series_positive)
time_series_positive.to_csv(f"Data/Time_Series/{crypto}/positive_proportion.csv",index=False)

Unnamed: 0,posted_on,Positive
0,2022-01-01,40.96
1,2022-01-02,35.29
2,2022-01-03,38.05
3,2022-01-04,32.53
4,2022-01-05,29.46
...,...,...
360,2022-12-27,23.53
361,2022-12-28,32.35
362,2022-12-29,40.00
363,2022-12-30,32.81


In [23]:
# Export the 'Negative' column by day (a percentage once the proportion cell has run).
# The output folder is derived from `crypto`, as the count exports do. It was
# previously hard-coded to crypto_bitcoin, which wrote other coins' results into
# (and over) Bitcoin's file.
# NOTE(review): the file name has no `filetype` part, so submission and comment
# runs share one file — confirm that is intended.
time_series_negative = time_series[['posted_on','Negative']]
display(time_series_negative)
time_series_negative.to_csv(f"Data/Time_Series/{crypto}/negative_proportion.csv",index=False)

Unnamed: 0,posted_on,Negative
0,2022-01-01,7.23
1,2022-01-02,5.88
2,2022-01-03,8.85
3,2022-01-04,9.64
4,2022-01-05,4.65
...,...,...
360,2022-12-27,2.94
361,2022-12-28,11.76
362,2022-12-29,6.00
363,2022-12-30,4.69


In [24]:
# Export the 'Neutral' column by day (a percentage once the proportion cell has run).
# The output folder is derived from `crypto`, as the count exports do. It was
# previously hard-coded to crypto_bitcoin, which wrote other coins' results into
# (and over) Bitcoin's file.
# NOTE(review): the file name has no `filetype` part, so submission and comment
# runs share one file — confirm that is intended.
time_series_neutral = time_series[['posted_on','Neutral']]
display(time_series_neutral)
time_series_neutral.to_csv(f"Data/Time_Series/{crypto}/neutral_proportion.csv",index=False)

Unnamed: 0,posted_on,Neutral
0,2022-01-01,51.81
1,2022-01-02,58.82
2,2022-01-03,53.10
3,2022-01-04,57.83
4,2022-01-05,65.89
...,...,...
360,2022-12-27,73.53
361,2022-12-28,55.88
362,2022-12-29,54.00
363,2022-12-30,62.50


In [25]:
# Export the daily total number of labelled posts.
# The output folder is derived from `crypto`, as the count exports do. It was
# previously hard-coded to crypto_bitcoin, which wrote other coins' results into
# (and over) Bitcoin's file.
# NOTE(review): the file name has no `filetype` part, so submission and comment
# runs share one file — confirm that is intended.
time_series_size = time_series[['posted_on','total_posts']]
display(time_series_size)
time_series_size.to_csv(f"Data/Time_Series/{crypto}/total_count.csv",index=False)

Unnamed: 0,posted_on,total_posts
0,2022-01-01,83.0
1,2022-01-02,85.0
2,2022-01-03,113.0
3,2022-01-04,83.0
4,2022-01-05,129.0
...,...,...
360,2022-12-27,34.0
361,2022-12-28,68.0
362,2022-12-29,50.0
363,2022-12-30,64.0
