In [1]:
import pandas as pd

In [2]:
# Subreddits whose sentiment-scored submission CSVs are loaded and combined below.
list_subs = [
    'bitcoin',
    'bitcoinbeginners',
    'bitcoinmarkets',
    'bitcoinmining',
    'btc',
]

In [3]:
# One DataFrame per subreddit, read from that subreddit's submission-sentiment CSV.
dfs = [
    pd.read_csv(f'Data/Sentiment/crypto_bitcoin/{name}_submission_sentiment.csv')
    for name in list_subs
]
    

In [4]:
# Stack the per-subreddit frames into one table. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it every source file's own
# 0-based row labels are kept, so the same label refers to several rows.
final_df = pd.concat(dfs, ignore_index=True)

In [5]:
# Sanity check: list the distinct subreddits present in the combined frame.
final_df['subreddit'].unique()

array(['bitcoin', 'bitcoinbeginners', 'bitcoinmarkets', 'bitcoinmining',
       'btc'], dtype=object)

In [6]:
# Inspect which columns the combined frame carries.
final_df.columns

Index(['submission', 'author', 'num_comments', 'shortlink', 'link_flair_text',
       'title', 'selftext', 'score', 'subreddit', 'posted_on', 'label',
       'score.1'],
      dtype='object')

In [7]:
# Parse 'posted_on' into datetimes and promote it to the index; the daily
# resample() that follows buckets rows by this datetime index.
#
# Guarded on the column still being present: set_index() removes 'posted_on'
# from the columns, so an unguarded re-run of this cell raises KeyError.
if 'posted_on' in final_df.columns:
    final_df = final_df.assign(
        posted_on=pd.to_datetime(final_df['posted_on'])
    ).set_index('posted_on')

In [8]:
# Bucket submissions by calendar day ('D') and count how many carry each
# sentiment label. Swap 'D' for another pandas offset alias to change the
# bucket size.
daily_label_counts = final_df.resample('D')['label'].value_counts()

# Pivot the label level into columns: one row per day, one column per label.
# A label that did not occur on a given day comes out as NaN.
time_series = daily_label_counts.unstack()

# Drop the leftover 'label' caption on the column axis.
time_series.columns.name = None

In [9]:
# Preview the daily per-label counts (indexed by 'posted_on').
time_series

Unnamed: 0_level_0,Negative,Neutral,Positive
posted_on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,6.0,43.0,34.0
2022-01-02,5.0,50.0,30.0
2022-01-03,10.0,60.0,43.0
2022-01-04,8.0,48.0,27.0
2022-01-05,6.0,85.0,38.0
...,...,...,...
2022-12-27,1.0,25.0,8.0
2022-12-28,8.0,38.0,22.0
2022-12-29,3.0,27.0,20.0
2022-12-30,3.0,40.0,21.0


In [10]:
# Move 'posted_on' out of the index and back into a regular column; the
# export cells further down select it by name (time_series[['posted_on', ...]]).
#
# Only reset while 'posted_on' is still the index: resetting again would insert
# the positional index as a stray 'index' column (and raise on the run after that).
if time_series.index.name == 'posted_on':
    time_series = time_series.reset_index()

display(time_series)

Unnamed: 0,posted_on,Negative,Neutral,Positive
0,2022-01-01,6.0,43.0,34.0
1,2022-01-02,5.0,50.0,30.0
2,2022-01-03,10.0,60.0,43.0
3,2022-01-04,8.0,48.0,27.0
4,2022-01-05,6.0,85.0,38.0
...,...,...,...,...
360,2022-12-27,1.0,25.0,8.0
361,2022-12-28,8.0,38.0,22.0
362,2022-12-29,3.0,27.0,20.0
363,2022-12-30,3.0,40.0,21.0


In [11]:
# Count missing values per column; unstack() leaves NaN where a label did not occur on a day.
time_series.isna().sum()

posted_on    0
Negative     8
Neutral      0
Positive     0
dtype: int64

In [12]:
# A missing count means no post with that label on that day, so record it as 0.
time_series = time_series.fillna(0)

In [13]:
# Re-check after filling: every column should now report zero missing values.
time_series.isna().sum()

posted_on    0
Negative     0
Neutral      0
Positive     0
dtype: int64

In [14]:
# Daily post volume: the three per-label counts added together.
time_series['total_posts'] = (
    time_series['Positive']
    .add(time_series['Negative'])
    .add(time_series['Neutral'])
)
time_series

Unnamed: 0,posted_on,Negative,Neutral,Positive,total_posts
0,2022-01-01,6.0,43.0,34.0,83.0
1,2022-01-02,5.0,50.0,30.0,85.0
2,2022-01-03,10.0,60.0,43.0,113.0
3,2022-01-04,8.0,48.0,27.0,83.0
4,2022-01-05,6.0,85.0,38.0,129.0
...,...,...,...,...,...
360,2022-12-27,1.0,25.0,8.0,34.0
361,2022-12-28,8.0,38.0,22.0,68.0
362,2022-12-29,3.0,27.0,20.0,50.0
363,2022-12-30,3.0,40.0,21.0,64.0


### Saving the Count Time Series

In [16]:
# Date + Positive column only, shown and then written to its own CSV.
# Must run before the proportion cell, which overwrites 'Positive' with percentages.
time_series_positive = time_series.loc[:, ['posted_on', 'Positive']]
display(time_series_positive)
time_series_positive.to_csv(
    "Data/Time_Series/crypto_bitcoin/positive_count.csv",
    index=False,
)

Unnamed: 0,posted_on,Positive
0,2022-01-01,34.0
1,2022-01-02,30.0
2,2022-01-03,43.0
3,2022-01-04,27.0
4,2022-01-05,38.0
...,...,...
360,2022-12-27,8.0
361,2022-12-28,22.0
362,2022-12-29,20.0
363,2022-12-30,21.0


In [17]:
# Date + Negative column only, shown and then written to its own CSV.
# Must run before the proportion cell, which overwrites 'Negative' with percentages.
time_series_negative = time_series.loc[:, ['posted_on', 'Negative']]
display(time_series_negative)
time_series_negative.to_csv(
    "Data/Time_Series/crypto_bitcoin/negative_count.csv",
    index=False,
)

Unnamed: 0,posted_on,Negative
0,2022-01-01,6.0
1,2022-01-02,5.0
2,2022-01-03,10.0
3,2022-01-04,8.0
4,2022-01-05,6.0
...,...,...
360,2022-12-27,1.0
361,2022-12-28,8.0
362,2022-12-29,3.0
363,2022-12-30,3.0


In [18]:
# Date + Neutral column only, shown and then written to its own CSV.
# Must run before the proportion cell, which overwrites 'Neutral' with percentages.
time_series_neutral = time_series.loc[:, ['posted_on', 'Neutral']]
display(time_series_neutral)
time_series_neutral.to_csv(
    "Data/Time_Series/crypto_bitcoin/neutral_count.csv",
    index=False,
)

Unnamed: 0,posted_on,Neutral
0,2022-01-01,43.0
1,2022-01-02,50.0
2,2022-01-03,60.0
3,2022-01-04,48.0
4,2022-01-05,85.0
...,...,...
360,2022-12-27,25.0
361,2022-12-28,38.0
362,2022-12-29,27.0
363,2022-12-30,40.0


### Saving the Proportion Time Series

In [21]:
# Convert each sentiment column from a raw daily count into its percentage of
# that day's total_posts, rounded to two decimals.
#
# NOTE: the count columns are overwritten in place. Run this cell exactly once
# per fresh `time_series`: a second pass would divide the percentages by
# total_posts again, and the count exports must already have been written.
for sentiment in ('Positive', 'Negative', 'Neutral'):
    share = time_series[sentiment].div(time_series['total_posts']).mul(100)
    time_series[sentiment] = share.round(2)
time_series

Unnamed: 0,posted_on,Negative,Neutral,Positive,total_posts
0,2022-01-01,7.23,51.81,40.96,83.0
1,2022-01-02,5.88,58.82,35.29,85.0
2,2022-01-03,8.85,53.10,38.05,113.0
3,2022-01-04,9.64,57.83,32.53,83.0
4,2022-01-05,4.65,65.89,29.46,129.0
...,...,...,...,...,...
360,2022-12-27,2.94,73.53,23.53,34.0
361,2022-12-28,11.76,55.88,32.35,68.0
362,2022-12-29,6.00,54.00,40.00,50.0
363,2022-12-30,4.69,62.50,32.81,64.0


In [22]:
# Date + Positive column after the proportion step, i.e. the percentage of each
# day's posts labelled Positive; shown and then written to its own CSV.
time_series_positive = time_series.loc[:, ['posted_on', 'Positive']]
display(time_series_positive)
time_series_positive.to_csv(
    "Data/Time_Series/crypto_bitcoin/positive_proportion.csv",
    index=False,
)

Unnamed: 0,posted_on,Positive
0,2022-01-01,40.96
1,2022-01-02,35.29
2,2022-01-03,38.05
3,2022-01-04,32.53
4,2022-01-05,29.46
...,...,...
360,2022-12-27,23.53
361,2022-12-28,32.35
362,2022-12-29,40.00
363,2022-12-30,32.81


In [23]:
# Date + Negative column after the proportion step, i.e. the percentage of each
# day's posts labelled Negative; shown and then written to its own CSV.
time_series_negative = time_series.loc[:, ['posted_on', 'Negative']]
display(time_series_negative)
time_series_negative.to_csv(
    "Data/Time_Series/crypto_bitcoin/negative_proportion.csv",
    index=False,
)

Unnamed: 0,posted_on,Negative
0,2022-01-01,7.23
1,2022-01-02,5.88
2,2022-01-03,8.85
3,2022-01-04,9.64
4,2022-01-05,4.65
...,...,...
360,2022-12-27,2.94
361,2022-12-28,11.76
362,2022-12-29,6.00
363,2022-12-30,4.69


In [24]:
# Date + Neutral column after the proportion step, i.e. the percentage of each
# day's posts labelled Neutral; shown and then written to its own CSV.
time_series_neutral = time_series.loc[:, ['posted_on', 'Neutral']]
display(time_series_neutral)
time_series_neutral.to_csv(
    "Data/Time_Series/crypto_bitcoin/neutral_proportion.csv",
    index=False,
)

Unnamed: 0,posted_on,Neutral
0,2022-01-01,51.81
1,2022-01-02,58.82
2,2022-01-03,53.10
3,2022-01-04,57.83
4,2022-01-05,65.89
...,...,...
360,2022-12-27,73.53
361,2022-12-28,55.88
362,2022-12-29,54.00
363,2022-12-30,62.50


In [25]:
# Date + total_posts only. The proportion step does not write to total_posts,
# so this column still holds the raw number of posts per day.
time_series_size = time_series.loc[:, ['posted_on', 'total_posts']]
display(time_series_size)
time_series_size.to_csv(
    "Data/Time_Series/crypto_bitcoin/total_count.csv",
    index=False,
)

Unnamed: 0,posted_on,total_posts
0,2022-01-01,83.0
1,2022-01-02,85.0
2,2022-01-03,113.0
3,2022-01-04,83.0
4,2022-01-05,129.0
...,...,...
360,2022-12-27,34.0
361,2022-12-28,68.0
362,2022-12-29,50.0
363,2022-12-30,64.0
