In [25]:
import os
import json
import requests

import pandas as pd
import mplfinance as mpf
import matplotlib.pyplot as plt


Loading pickled GME stock price dataframe

In [28]:
# Replace this with the path to your pickled file
pickle_file_path = '../scraping/gme_daily_transformed_df.pkl'

# Load the DataFrame from the pickle file
gme_daily_transformed_df = pd.read_pickle(pickle_file_path)

Filtering dataframe to only include dates in the relevant period

In [29]:
# Filtering the DataFrame to include only data from December 2020 to April 2021 and creating a copy
gme_jan_apr2021_df = gme_daily_transformed_df[(gme_daily_transformed_df['Date'] >= '2021-01-01') & (gme_daily_transformed_df['Date'] <= '2021-04-01')].copy()

Reading Reddit data from csv files into dataframe

In [30]:
data1_df = pd.read_csv('./data/reddit_data/Submissions_2021-01_FilteredBySubreddit_GME.csv')
data2_df = pd.read_csv('./data/reddit_data/Submissions_2021-02_FilteredBySubreddit_GME.csv')
data3_df = pd.read_csv('./data/reddit_data/Submissions_2021-03_FilteredBySubreddit_GME.csv')

df_all_reddit_data = pd.concat([data1_df, data2_df, data3_df], axis = 0,
                        ignore_index=True)


FileNotFoundError: [Errno 2] No such file or directory: './data/reddit_data/Submissions_2021-01_FilteredBySubreddit_GME.csv'

In [None]:
df_all_reddit_data.head()

Unnamed: 0,submission_id,redditor_name,created_at,title,text,subreddit,permalink,attachment,flair,score,num_comments,edited
0,ko124i,[deleted],2021-01-01T00:02:06,"3k - 170k since March (Also, buy LIT!!)",[deleted],wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,,"{'link': 'Gain', 'author': None}",{'2021-06-10T13:50:58': 34},{'2021-06-10T13:50:58': 14},False
1,ko12uq,[deleted],2021-01-01T00:03:20,Got out of PLTR calls after learning about IV ...,[deleted],wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,{'url': 'https://www.reddit.com/gallery/ko12uq'},"{'link': 'Gain', 'author': None}",{'2021-06-10T13:50:59': 2},{'2021-06-10T13:50:59': 0},False
2,ko13df,[deleted],2021-01-01T00:04:11,Hell of a headline,[deleted],wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,{'png': 'https://i.redd.it/620igsuk3m861.png'},"{'link': 'Meme', 'author': None}",{'2021-06-10T13:50:59': 14},{'2021-06-10T13:50:59': 7},False
3,ko13q2,DarkCookie243,2021-01-01T00:04:47,A message from JPow for New Years Eve to all o...,,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,{'jpg': 'https://i.redd.it/rkb331xu3m861.jpg'},"{'link': 'Meme', 'author': None}",{'2021-06-10T13:51:00': 4},{'2021-06-10T13:51:00': 0},False
4,ko145e,stevenconrad,2021-01-01T00:05:29,"GME to 420.69, but only if we make it happen. ...",[removed],wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,,"{'link': 'DD', 'author': None}",{'2021-06-10T13:51:00': 9},{'2021-06-10T13:51:00': 4},False


Processing dataframe

In [None]:
# Remove curly brackets from num_comments and score columns
df_all_reddit_data['num_comments'] = df_all_reddit_data['num_comments'].str.replace('[{}]'.format(''.join(['{}'])), '', regex=True)
df_all_reddit_data['score'] = df_all_reddit_data['score'].str.replace('[{}]'.format(''.join(['{}'])), '', regex=True)


# Remove date and time from num_comments and score columns
df_all_reddit_data['num_comments'] = df_all_reddit_data['num_comments'].str.split(':').str[-1].str.strip()
df_all_reddit_data['score'] = df_all_reddit_data['score'].str.split(':').str[-1].str.strip()


# Convert num_comments and score columns to integers
df_all_reddit_data['num_comments'] = df_all_reddit_data['num_comments'].astype(int)
df_all_reddit_data['score'] = df_all_reddit_data['score'].astype(int)


df_all_reddit_data['created_at'] = pd.to_datetime(df_all_reddit_data['created_at'])
# Extract only the date part
df_all_reddit_data['created_at'] = df_all_reddit_data['created_at'].dt.normalize()


In [None]:
df_all_reddit_data.head()


Unnamed: 0,submission_id,redditor_name,created_at,title,text,subreddit,permalink,attachment,flair,score,num_comments,edited
0,ko124i,[deleted],2021-01-01,"3k - 170k since March (Also, buy LIT!!)",[deleted],wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,,"{'link': 'Gain', 'author': None}",34,14,False
1,ko12uq,[deleted],2021-01-01,Got out of PLTR calls after learning about IV ...,[deleted],wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,{'url': 'https://www.reddit.com/gallery/ko12uq'},"{'link': 'Gain', 'author': None}",2,0,False
2,ko13df,[deleted],2021-01-01,Hell of a headline,[deleted],wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,{'png': 'https://i.redd.it/620igsuk3m861.png'},"{'link': 'Meme', 'author': None}",14,7,False
3,ko13q2,DarkCookie243,2021-01-01,A message from JPow for New Years Eve to all o...,,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,{'jpg': 'https://i.redd.it/rkb331xu3m861.jpg'},"{'link': 'Meme', 'author': None}",4,0,False
4,ko145e,stevenconrad,2021-01-01,"GME to 420.69, but only if we make it happen. ...",[removed],wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,,"{'link': 'DD', 'author': None}",9,4,False


Groupby - keep looking into

In [None]:
df_all_reddit_data.rename(columns={'created_at': 'Date'}, inplace=True)
df_grouped = df_all_reddit_data.groupby('Date')

df_grouped = df_all_reddit_data.groupby('Date').size().reset_index(name='Post_Count')

df_comments_count = df_all_reddit_data.groupby('Date')['num_comments'].sum().reset_index(name='Total_Comments')

# Merge 'df_grouped' with 'df_comments_count' on 'Date'
df_grouped = df_grouped.merge(df_comments_count, on='Date', how='left')

df_grouped


# Now, you can perform operations on each group
# For example, you can calculate the sum of 'score' and 'num_comments' for each date
#result = df_grouped[['score', 'num_comments']].sum()

# The result DataFrame will have 'created_at' date as the index and sum of 'score' and 'num_comments' for each date
#print(result)

Unnamed: 0,Date,Post_Count,Total_Comments
0,2021-01-01,589,45186
1,2021-01-02,625,11609
2,2021-01-03,613,25134
3,2021-01-04,1029,55169
4,2021-01-05,965,65804
...,...,...,...
86,2021-03-28,1178,30131
87,2021-03-29,2065,85797
88,2021-03-30,2105,91976
89,2021-03-31,2147,82231


In [None]:
merged_df = df_grouped.merge(gme_jan_apr2021_df, on='Date', how='outer')
merged_df

Unnamed: 0,Date,Post_Count,Total_Comments,Open,High,Low,Close,Volume
0,2021-01-01,589,45186,,,,,
1,2021-01-02,625,11609,,,,,
2,2021-01-03,613,25134,,,,,
3,2021-01-04,1029,55169,19.00,19.1000,17.1500,17.25,10022474.0
4,2021-01-05,965,65804,17.35,18.0765,17.2300,17.37,4961457.0
...,...,...,...,...,...,...,...,...
86,2021-03-28,1178,30131,,,,,
87,2021-03-29,2065,85797,180.75,193.9200,173.5100,181.30,10042175.0
88,2021-03-30,2105,91976,187.50,204.3000,182.0000,194.46,17094924.0
89,2021-03-31,2147,82231,197.50,199.4600,187.1102,189.82,8393834.0


In [None]:
cleaned_df = merged_df.dropna()
cleaned_df['Volume'] = cleaned_df['Volume'].astype(int)
# Remove the last row using .iloc
cleaned_df = cleaned_df.iloc[:-1]
cleaned_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Volume'] = cleaned_df['Volume'].astype(int)


Unnamed: 0,Date,Post_Count,Total_Comments,Open,High,Low,Close,Volume
3,2021-01-04,1029,55169,19.00,19.1000,17.1500,17.25,10022474
4,2021-01-05,965,65804,17.35,18.0765,17.2300,17.37,4961457
5,2021-01-06,1225,77419,17.34,18.9800,17.3300,18.36,6056248
6,2021-01-07,1399,68127,18.47,19.4500,18.0200,18.08,6129276
7,2021-01-08,1551,95624,18.18,18.3000,17.0800,17.69,6481960
...,...,...,...,...,...,...,...,...
83,2021-03-25,3493,146707,123.49,187.5000,116.9000,183.75,49926442
84,2021-03-26,2986,148208,197.68,218.9344,163.2600,181.00,37430672
87,2021-03-29,2065,85797,180.75,193.9200,173.5100,181.30,10042175
88,2021-03-30,2105,91976,187.50,204.3000,182.0000,194.46,17094924


In [None]:
cleaned_df['Post_Count'].max()

166996

# This is the plot
### Maybe animate it

In [None]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import ColumnDataSource, LinearColorMapper, ColorBar, NumeralTickFormatter, Title
from bokeh.transform import transform
from bokeh.models.tools import HoverTool
from bokeh.models import Range1d  # Import Range1d for secondary y-axis
import pandas as pd

# Ensure your dataframe is sorted by date if it's not already
cleaned_df['Date'] = pd.to_datetime(cleaned_df['Date'])
cleaned_df.sort_values('Date', inplace=True)

# Create a ColumnDataSource from the dataframe
source = ColumnDataSource(cleaned_df)

# Create a color mapper for total comments with a color bar
color_mapper = LinearColorMapper(palette="Viridis256", low=cleaned_df['Total_Comments'].min(), high=cleaned_df['Total_Comments'].max())

# Define a size mapping based on 'Post_Count', here we scale the 'Post_Count' values to a range of 6 to 30
# Adjust this scaling factor to suit the size of your plot or preference
scale_factor = 0.0015  # This is a scaling factor for the size
max_size = 10000
min_size = 5
cleaned_df['size'] = cleaned_df['Post_Count'] * scale_factor
cleaned_df['size'] = cleaned_df['size'].clip(lower=min_size, upper=max_size)

# Update the source with the new size data
source.data['size'] = cleaned_df['size']

# Create a figure object
p = figure(width=1450, height=700, x_axis_type="datetime")

# Create a title and customize its properties
title = Title(text="GameStop Trading Frenzy: A Synchronized Surge in Stock Price, Volume, and Online Buzz", text_font_size="16pt", align="center")

# Add circle glyphs to the figure
p.circle(x='Date', y='Volume', size='size', source=source, color=transform('Total_Comments', color_mapper), alpha=0.7)

# Add a color bar to the right of the plot
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, location=(0,0), title='Total Comments', formatter=NumeralTickFormatter(format="0,0"))
p.add_layout(color_bar, 'below')

# Customize the plot
p.yaxis.formatter = NumeralTickFormatter(format="0a")
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Trading Volume (in millions)'

# Specify the range for the secondary y-axis (right side)
p.extra_y_ranges = {'Close Price': Range1d(start=cleaned_df['Close'].min(), end=400)}

# Add the secondary y-axis (right side) for Close Price
p.line('Date', 'Close', source=source, color='red', y_range_name='Close Price', legend_label='Close Price')
p.add_layout(LinearAxis(y_range_name='Close Price', axis_label='Close Price'), 'right')

# Add hover tool
hover = HoverTool(tooltips=[("Date", "@Date{%F}"),
                            ("Volume", "@Volume"),
                            ("Post Count", "@Post_Count"),
                            ("Total Comments", "@Total_Comments"),
                            ("Close Price", "@Close")],
                  formatters={'@Date': 'datetime'})
p.add_tools(hover)

# Set the title
p.title = title

# Show the plot
output_file("interactive_plot.html")
save(p)


'c:\\Users\\annaj\\Desktop\\DS105\\ds105a-project-data-dabblers\\interactive_plot.html'

In [None]:
from bokeh.layouts import column
from bokeh.plotting import figure, show, output_file, save, curdoc
from bokeh.models import ColumnDataSource, LinearColorMapper, ColorBar, NumeralTickFormatter, Title
from bokeh.transform import transform
from bokeh.models import HoverTool
from bokeh.models.widgets import Slider
from bokeh.models.callbacks import CustomJS
from bokeh.models import Range1d
import pandas as pd
from bokeh.plotting import output_file, save


# Ensure your dataframe is sorted by date if it's not already
cleaned_df['Date'] = pd.to_datetime(cleaned_df['Date'])
cleaned_df.sort_values('Date', inplace=True)

# Create a ColumnDataSource from the dataframe
source = ColumnDataSource(cleaned_df)

# Create a color mapper for total comments with a color bar
color_mapper = LinearColorMapper(palette="Viridis256", low=cleaned_df['Total_Comments'].min(), high=cleaned_df['Total_Comments'].max())

# Define a size mapping based on 'Post_Count', here we scale the 'Post_Count' values to a range of 6 to 30
# Adjust this scaling factor to suit the size of your plot or preference
scale_factor = 0.0015  # This is a scaling factor for the size
max_size = 10000
min_size = 5
cleaned_df['size'] = cleaned_df['Post_Count'] * scale_factor
cleaned_df['size'] = cleaned_df['size'].clip(lower=min_size, upper=max_size)

# Update the source with the new size data
source.data['size'] = cleaned_df['size']

# Create a figure object
p = figure(width=1450, height=700, x_axis_type="datetime")

# Create a title and customize its properties
title = Title(text="WSB Activity vs GME Trading Volume", text_font_size="20pt", align="center")

# Add circle glyphs to the figure with initial visible count
initial_visible_count = 10
p.circle(x='Date', y='Volume', size='size', source=source, color=transform('Total_Comments', color_mapper), alpha=0.7, legend_label='Total Comments', muted_alpha=0.2, name='circles', visible=False)
p.legend.click_policy = 'mute'

# Add a color bar to the right of the plot
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, location=(0, 0), title='Total Comments', formatter=NumeralTickFormatter(format="0,0"))
p.add_layout(color_bar, 'below')

legend_sizes = [6, 18, 30]  # Example sizes
legend_labels = ['Low', 'Medium', 'High']  # Example labels

# Create a legend by adding invisible circles (visible in the legend only)
for size, label in zip(legend_sizes, legend_labels):
    p.circle(x=[], y=[], size=size, color='grey', alpha=0.7, legend_label=label)

# Customize the plot
p.yaxis.formatter = NumeralTickFormatter(format="0a")
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Trading Volume (in millions)'

# Specify the range for the secondary y-axis (right side)
p.extra_y_ranges = {'Close Price': Range1d(start=cleaned_df['Close'].min(), end=cleaned_df['Close'].max())}

# Add the secondary y-axis (right side) for Close Price
p.line('Date', 'Close', source=source, color='red', y_range_name='Close Price', legend_label='Close Price')
p.add_layout(LinearAxis(y_range_name='Close Price', axis_label='Close Price'), 'right')

# Add hover tool
hover = HoverTool(tooltips=[("Date", "@Date{%F}"),
                            ("Volume", "@Volume"),
                            ("Post Count", "@Post_Count"),
                            ("Total Comments", "@Total_Comments"),
                            ("Close Price", "@Close")],
                  formatters={'@Date': 'datetime'})
p.add_tools(hover)

# Set the title
p.title = title

# Add a slider widget for animation
slider = Slider(start=0, end=len(source.data['Date']) - initial_visible_count, step=1, value=0, title="Animate Circles")
slider.js_on_change("value", CustomJS(args=dict(source=source, slider=slider), code="""
    var start = slider.value;
    var end = start + %d;
    var circles = source.data['Date'].map(function(date, index) {
        return index >= start && index < end;
    });
    source.data['circles'] = circles;
""" % initial_visible_count))

# Create a layout for the plot and slider
layout = column(p, slider)

# To save to an HTML file
output_file("animated_plot.html")
save(layout)

'c:\\Users\\annaj\\Desktop\\DS105\\ds105a-project-data-dabblers\\animated_plot.html'

In [None]:
# If 'Date' is not a column, this will reset the index and make 'Date' a column
if 'Date' not in gme_jan_apr2021_df.columns:
    gme_jan_apr2021_df.reset_index(inplace=True)

# Ensure 'Date' is in datetime format
gme_jan_apr2021_df['Date'] = pd.to_datetime(gme_jan_apr2021_df['Date'])

# Create the figure
p = figure(title="GME Daily Closing Prices (Jan 2021 - Apr 2021)",
           x_axis_label='Date', y_axis_label='Close Price ($)',
           x_axis_type='datetime', width=800, height=400)

# Add a line renderer with legend and line thickness
p.line('Date', 'Close', source=source, legend_label="Close Price", line_width=2)

# Dates for the red circles (buttons)
button_dates = [pd.Timestamp('2021-01-11'), 
                pd.Timestamp('2021-01-13'), 
                pd.Timestamp('2021-01-26'),
                pd.Timestamp('2021-01-27'), 
                pd.Timestamp('2021-01-28'), 
                pd.Timestamp('2021-01-29'),
                pd.Timestamp('2021-02-01'),
                pd.Timestamp('2021-02-25')]


# Extract close prices for the specified button dates
button_close_prices = [
    gme_jan_apr2021_df.loc[gme_jan_apr2021_df['Date'] == date, 'Close'].iloc[0]
    for date in button_dates
]

# Special information for each button date
special_infos = ["New leaders appointed", 
                 "Stock surges more than 50%",
                 "Elon Musk tweet",
                 "Citron Capital and Melvin Capital close their short positions",
                 "Robinhood restricts trading of GME",
                 "SEC intervenes and Trading platforms lift restrictions on GME",
                 "Decline in value",
                 "Temporary resurgence"]

# Create a separate ColumnDataSource for the red circles
button_source = ColumnDataSource({
    'x': button_dates,
    'y': button_close_prices,
    'info': special_infos
})

# Add the red circles to the plot
circle_renderer = p.circle('x', 'y', size=15, color='red', source=button_source)

# Create a hover tool for the main plot line
line_hover = HoverTool(
    tooltips=[
        ('Date', '@Date{%F}'),
        ('Close', '$@Close{0,0.00}')
    ],
    formatters={'@Date': 'datetime'},
    mode='vline'
)

# Create a hover tool for the red circles that displays special information
circle_hover = HoverTool(
    tooltips=[
        ('Info', '@info')
    ],
    # Only apply this hover tool to the red circles
    renderers=[circle_renderer]
)


p.add_tools(line_hover)

# Add the hover tool to the plot for the red circles
p.add_tools(circle_hover)

# Updated format for the datetime axis
p.xaxis.formatter = DatetimeTickFormatter(
    days="%d %b %Y",
    months="%b %Y",
    years="%Y"
)

output_file("interactive_plot2.html")
save(p)

RuntimeError: Models must be owned by only a single document, Selection(id='p1305', ...) is already in a doc