In [None]:
# Imports and Setup
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sys
import os
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from scipy.stats import spearmanr, pearsonr
from statsmodels.tsa.stattools import adfuller
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import plotly as plt

# Environment Setup
# The project root is read from the GAME_PRICE_PREDICTION_PATH environment
# variable (empty string when it is unset). Its absolute form goes to the
# front of sys.path ahead of the project-local imports.
GAME_PRICE_PREDICTION_PATH = os.getenv('GAME_PRICE_PREDICTION_PATH', '')
project_root_abs = os.path.abspath(GAME_PRICE_PREDICTION_PATH)
sys.path.insert(0, project_root_abs)

# Custom Imports
from python_scripts.utilities.api_calls import get_cookie_from_blob, fetch_item_to_df, fetch_items
from python_scripts.sentiment_analysis.config import ALL_MENTIONS_FILENAME, ITEM, POLARITY_FOLDER_NAME, ITEM_SANITIZED
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" 
- Takes data from mention_data 
- and a range of others; polarity_data, or fetches price history
- Plots it sexily
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" 

## Prerequisites


In [None]:
# Sanity check: display the item currently configured in config.py before
# any data is fetched or plotted for it.
print("ITEM = {}".format(ITEM))

In [3]:
# Run the sentiment-analysis preprocessing scripts, one shell command each,
# in this order: filter_file.py -> mention_counter.py -> mention_data_combiner.py.
# Every script path is built under
# <GAME_PRICE_PREDICTION_PATH>/python_scripts/sentiment_analysis/.
# NOTE(review): presumably each step consumes the previous step's output, so the
# order matters — confirm against the scripts themselves.
# NOTE(review): `!python` runs whichever `python` the shell resolves, which may not
# be the kernel's interpreter, and a failing script does not stop the notebook —
# check each step's printed output before continuing.
filter_file_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'filter_file.py')
!python "{filter_file_path}"

mention_counter_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'mention_counter.py')
!python "{mention_counter_path}"

mention_data_combiner_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'mention_data_combiner.py')
!python "{mention_data_combiner_path}"

# Disabled step: VADER polarity scoring (left commented out; not part of this run).
# vader_polarity_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'vader_polarity.py')
# !python "{vader_polarity_path}"


### Fetch item history and import mentions data for item

In [4]:
# Fetch the price/volume history for ITEM, passing along the cookie returned
# by get_cookie_from_blob().
daily_cookie = get_cookie_from_blob()
price_volume_df = fetch_item_to_df(ITEM, daily_cookie)

# The combined mentions CSV lives at
# <GAME_PRICE_PREDICTION_PATH>/data/reddit_data/mention_all/<ALL_MENTIONS_FILENAME>.
mentions_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'reddit_data', 'mention_all')
mentions_path = os.path.join(mentions_dir, ALL_MENTIONS_FILENAME)

# Read the CSV and convert its 'date' column to datetimes in one expression.
mentions_df = pd.read_csv(mentions_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

## Merge dataframes, handle missing values and outliers

In [5]:
# The price/volume frame must expose 'date' as a column; if it does not,
# reset_index() is used to turn the index into columns.
has_date_column = 'date' in price_volume_df.columns
if not has_date_column:
    price_volume_df = price_volume_df.reset_index()

# Outer-join mentions with the price/volume columns on 'date', ordered by date.
price_volume_subset = price_volume_df[['date', 'price_usd', 'volume']]
merged_df = mentions_df.merge(price_volume_subset, on='date', how='outer').sort_values('date')

# Conservative gap handling.
# 1. Price: carry the last known price forward across at most 2 consecutive
#    missing rows. Volume: a missing value is treated as "no trades" (0).
# NOTE(review): `limit` counts rows, which equals days only when the merged
# frame has exactly one row per calendar day — confirm.
merged_df = merged_df.assign(
    price_usd=merged_df['price_usd'].ffill(limit=2),
    volume=merged_df['volume'].fillna(0),
)

# 2. Mentions: linearly interpolate, filling at most 1 consecutive missing row.
mentions_cols = ['num_mentions']  # Assuming 'num_mentions' is the column for mentions
merged_df = merged_df.assign(
    **{name: merged_df[name].interpolate(method='linear', limit=1) for name in mentions_cols}
)

# 3. Whatever is still missing (in any column) removes the whole row.
merged_df = merged_df.dropna()

# Remove outliers (optional)
# z_scores = np.abs(stats.zscore(merged_df[['num_mentions', 'price_usd', 'volume']]))
# merged_df = merged_df[(z_scores < 3).all(axis=1)]

## Timeseries - mentions against volume plot

In [None]:
# Tunables for this plot
prediction_days = 10  # Adjust this value as needed
window_size = 20      # rolling-mean window

# Build the analysis frame from merged_df without modifying it:
#   future_volume        - volume taken `prediction_days` rows ahead
#   smooth_mentions      - rolling mean of num_mentions
#   smooth_future_volume - rolling mean of future_volume
# NOTE(review): shift() and rolling(window=int) operate on rows, so "days" in the
# labels is exact only if merged_df has one row per calendar day — confirm.
mentions_vol_df = merged_df.assign(
    future_volume=lambda frame: frame['volume'].shift(-prediction_days),
    smooth_mentions=lambda frame: frame['num_mentions'].rolling(window=window_size).mean(),
    smooth_future_volume=lambda frame: frame['future_volume'].rolling(window=window_size).mean(),
)

# The shift leaves NaNs at the tail and the rolling mean at the head; drop them.
mentions_vol_df = mentions_vol_df.dropna(subset=['smooth_mentions', 'smooth_future_volume'])

# One trace per y-axis: mentions on the primary axis, future volume on the secondary.
mentions_trace = go.Scatter(
    x=mentions_vol_df['date'],
    y=mentions_vol_df['smooth_mentions'],
    name="Current Mentions",
    line={'color': 'blue', 'width': 2.5},
)
future_volume_trace = go.Scatter(
    x=mentions_vol_df['date'],
    y=mentions_vol_df['smooth_future_volume'],
    name=f"Volume in {prediction_days} days",
    line={'color': 'red', 'width': 2.5},
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(mentions_trace, secondary_y=False)
fig.add_trace(future_volume_trace, secondary_y=True)

# Title, horizontal legend above the plot area, and margins
fig.update_layout(
    title_text=f"Current Mentions vs {prediction_days}-day Future Volume for {ITEM}<br>({window_size}-day moving average)",
    legend={'orientation': "h", 'yanchor': "bottom", 'y': 1.02, 'xanchor': "right", 'x': 1},
    margin={'l': 50, 'r': 50, 't': 80, 'b': 50},
)

# Axis titles and grid colour
y_axis_titles = (
    ("Number of Mentions", False),
    (f"Volume Traded in {prediction_days} days", True),
)
for axis_title, on_secondary in y_axis_titles:
    fig.update_yaxes(title_text=axis_title, secondary_y=on_secondary, gridcolor='lightgrey')
fig.update_xaxes(title_text="Date", gridcolor='lightgrey')

fig.show()

# Persist the figure as PNG and interactive HTML under data/figures/<ITEM_SANITIZED>/
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

output_stem = os.path.join(save_dir, f'timeseries_mentions_vol_prediction_{prediction_days}days')
fig.write_image(output_stem + '.png', width=1920, height=1080, scale=2)
fig.write_html(output_stem + '.html')
print(f"Saved to ./data/figures/{ITEM_SANITIZED}")

## Timeseries - mentions against price plot

In [None]:
# Tunables for this plot
prediction_days = 10  # Adjust this value as needed
window_size = 20      # rolling-mean window

# Build the analysis frame from merged_df without modifying it:
#   future_price        - price_usd taken `prediction_days` rows ahead
#   smooth_mentions     - rolling mean of num_mentions
#   smooth_future_price - rolling mean of future_price
# NOTE(review): shift() and rolling(window=int) operate on rows, so "days" in the
# labels is exact only if merged_df has one row per calendar day — confirm.
mentions_price_df = merged_df.assign(
    future_price=lambda frame: frame['price_usd'].shift(-prediction_days),
    smooth_mentions=lambda frame: frame['num_mentions'].rolling(window=window_size).mean(),
    smooth_future_price=lambda frame: frame['future_price'].rolling(window=window_size).mean(),
)

# The shift leaves NaNs at the tail and the rolling mean at the head; drop them.
mentions_price_df = mentions_price_df.dropna(subset=['smooth_mentions', 'smooth_future_price'])

# One trace per y-axis: mentions on the primary axis, future price on the secondary.
mentions_trace = go.Scatter(
    x=mentions_price_df['date'],
    y=mentions_price_df['smooth_mentions'],
    name="Current Mentions",
    line={'color': 'blue', 'width': 2.5},
)
future_price_trace = go.Scatter(
    x=mentions_price_df['date'],
    y=mentions_price_df['smooth_future_price'],
    name=f"Price in {prediction_days} days",
    line={'color': 'green', 'width': 2.5},
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(mentions_trace, secondary_y=False)
fig.add_trace(future_price_trace, secondary_y=True)

# Title, horizontal legend above the plot area, and margins
fig.update_layout(
    title_text=f"Current Mentions vs {prediction_days}-day Future Price for {ITEM}<br>({window_size}-day moving average)",
    legend={'orientation': "h", 'yanchor': "bottom", 'y': 1.02, 'xanchor': "right", 'x': 1},
    margin={'l': 50, 'r': 50, 't': 80, 'b': 50},
)

# Axis titles and grid colour
y_axis_titles = (
    ("Number of Mentions", False),
    (f"Price in {prediction_days} days (USD)", True),
)
for axis_title, on_secondary in y_axis_titles:
    fig.update_yaxes(title_text=axis_title, secondary_y=on_secondary, gridcolor='lightgrey')
fig.update_xaxes(title_text="Date", gridcolor='lightgrey')

fig.show()

# Persist the figure as PNG and interactive HTML under data/figures/<ITEM_SANITIZED>/
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

output_stem = os.path.join(save_dir, f'timeseries_mentions_price_prediction_{prediction_days}days')
fig.write_image(output_stem + '.png', width=1920, height=1080, scale=2)
fig.write_html(output_stem + '.html')
print(f"Saved to ./data/figures/{ITEM_SANITIZED}")

## Windowed scatterplot - mentions & volume plot

In [None]:
# Scatter plot of smoothed mentions (y) against smoothed traded volume (x) for
# rows of merged_df whose date falls inside [start_date, end_date], with a
# least-squares trend line, saved as a numbered PNG.

# Define your date range (both ends inclusive) and the rolling-mean window
start_date = pd.to_datetime('2013-05-01')
end_date = pd.to_datetime('2023-06-01')
window = 10

# Filter dataframe for date range and create a copy so merged_df is untouched
scatter_mentions_vol_df = merged_df[(merged_df['date'] >= start_date) &
                                    (merged_df['date'] <= end_date)].copy()

# Calculate smoothed values (rolling mean over `window` rows)
scatter_mentions_vol_df['smoothed_mentions'] = scatter_mentions_vol_df['num_mentions'].rolling(window=window).mean()
scatter_mentions_vol_df['smoothed_volume'] = scatter_mentions_vol_df['volume'].rolling(window=window).mean()

# The first window-1 rows have no rolling mean; drop them before fitting
clean_df = scatter_mentions_vol_df.dropna(subset=['smoothed_volume', 'smoothed_mentions'])

# Create the scatter plot
fig = go.Figure()

# Add scatter points; the date is attached as hover text
fig.add_trace(
    go.Scatter(
        x=clean_df['smoothed_volume'],
        y=clean_df['smoothed_mentions'],
        mode='markers',
        marker=dict(size=8),
        name='Data Points',
        text=clean_df['date'],
        hovertemplate='Smoothed Volume: %{x}<br>Smoothed Mentions: %{y}<br>Date: %{text}<extra></extra>'
    )
)

# Add regression line if we have valid data.
# A degree-1 fit needs at least two *distinct* x values: with a single x the
# slope is undefined and np.polyfit only emits a rank warning and returns an
# arbitrary line, so that case is skipped explicitly.
x_values = clean_df['smoothed_volume']
if x_values.nunique() > 1:
    z = np.polyfit(x_values, clean_df['smoothed_mentions'], 1)
    slope, intercept = z
    # A straight line is fully determined by its two end points; this avoids
    # drawing one segment per (unsorted) data point.
    line_x = np.array([x_values.min(), x_values.max()])
    fig.add_trace(
        go.Scatter(
            x=line_x,
            y=slope * line_x + intercept,
            mode='lines',
            name='Trend line',
            line=dict(color='red')
        )
    )
elif len(clean_df) > 1:
    print("All smoothed volume values are identical; skipping the regression line.")
else:
    print("Not enough data points to fit a regression line.")

# Update layout
fig.update_layout(
    title=f"Smoothed Mentions vs. Smoothed Volume for {ITEM}<br>Date Range: {start_date.date()} to {end_date.date()}<br>Smoothing Window: {window} days",
    xaxis_title="Smoothed Volume Traded",
    yaxis_title="Smoothed Number of Mentions",
    height=600,
    width=800,
)

# Show the figure
fig.show()

# Save figures
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

# Pick the first unused index so earlier exports are never overwritten
i = 1
while os.path.exists(os.path.join(save_dir, f'scatter_mentions_vol_{i}.png')):
    i += 1

fig.write_image(
    os.path.join(save_dir, f'scatter_mentions_vol_{i}.png'),
    width=1920,
    height=1080,
    scale=2
)

# Report the written file, as the other figure-saving cells do
print(f"Saved to ./data/figures/{ITEM_SANITIZED}/scatter_mentions_vol_{i}.png")

## Windowed scatterplot - mentions & price plot

In [None]:
# Scatter plot of smoothed mentions (y) against smoothed price (x) for rows of
# merged_df whose date falls inside [start_date, end_date], with a
# least-squares trend line, saved as a numbered PNG.

# Define your date range (both ends inclusive) and the rolling-mean window
start_date = pd.to_datetime('2013-05-01')
end_date = pd.to_datetime('2023-06-01')
window = 10

# Filter dataframe for date range and create a copy so merged_df is untouched
scatter_mentions_price_df = merged_df[(merged_df['date'] >= start_date) &
                                      (merged_df['date'] <= end_date)].copy()

# Calculate smoothed values (rolling mean over `window` rows)
scatter_mentions_price_df['smoothed_mentions'] = scatter_mentions_price_df['num_mentions'].rolling(window=window).mean()
scatter_mentions_price_df['smoothed_price'] = scatter_mentions_price_df['price_usd'].rolling(window=window).mean()

# The first window-1 rows have no rolling mean; drop them before fitting
clean_df = scatter_mentions_price_df.dropna(subset=['smoothed_price', 'smoothed_mentions'])

# Create the scatter plot
fig = go.Figure()

# Add scatter points; the date is attached as hover text
fig.add_trace(
    go.Scatter(
        x=clean_df['smoothed_price'],
        y=clean_df['smoothed_mentions'],
        mode='markers',
        marker=dict(size=8),
        name='Data Points',
        text=clean_df['date'],
        hovertemplate='Smoothed Price: %{x}<br>Smoothed Mentions: %{y}<br>Date: %{text}<extra></extra>'
    )
)

# Add regression line if we have valid data.
# A degree-1 fit needs at least two *distinct* x values: with a single x the
# slope is undefined and np.polyfit only emits a rank warning and returns an
# arbitrary line, so that case is skipped explicitly.
x_values = clean_df['smoothed_price']
if x_values.nunique() > 1:
    z = np.polyfit(x_values, clean_df['smoothed_mentions'], 1)
    slope, intercept = z
    # A straight line is fully determined by its two end points; this avoids
    # drawing one segment per (unsorted) data point.
    line_x = np.array([x_values.min(), x_values.max()])
    fig.add_trace(
        go.Scatter(
            x=line_x,
            y=slope * line_x + intercept,
            mode='lines',
            name='Trend line',
            line=dict(color='red')
        )
    )
elif len(clean_df) > 1:
    print("All smoothed price values are identical; skipping the regression line.")
else:
    print("Not enough data points to fit a regression line.")

# Update layout
fig.update_layout(
    title=f"Smoothed Mentions vs. Smoothed Price for {ITEM}<br>Date Range: {start_date.date()} to {end_date.date()}<br>Smoothing Window: {window} days",
    xaxis_title="Smoothed Price (USD)",
    yaxis_title="Smoothed Number of Mentions",
    height=600,
    width=800,
)

# Show the figure
fig.show()

# Save figures
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

# Pick the first unused index so earlier exports are never overwritten
i = 1
while os.path.exists(os.path.join(save_dir, f'scatter_mentions_price_{i}.png')):
    i += 1

fig.write_image(
    os.path.join(save_dir, f'scatter_mentions_price_{i}.png'),
    width=1920,
    height=1080,
    scale=2
)

# Report the written file, as the other figure-saving cells do
print(f"Saved to ./data/figures/{ITEM_SANITIZED}/scatter_mentions_price_{i}.png")

## Correlation stats

In [None]:
# Correlation statistics between mention counts and traded volume for ITEM:
# Spearman and Pearson correlation of num_mentions vs volume, plus an
# Augmented Dickey-Fuller stationarity test on volume.

# Work on a separate frame so merged_df is preserved for the other cells
correlation_df = merged_df.dropna(subset=['num_mentions', 'volume'])
mentions_series = correlation_df['num_mentions']
volume_series = correlation_df['volume']

print("Item name: " + ITEM)

if len(correlation_df) < 2:
    # pearsonr raises on fewer than two observations and no statistic is
    # meaningful on such a sample, so report it instead of stopping the
    # notebook with a traceback.
    print(f"Not enough data points for correlation statistics (rows available: {len(correlation_df)}).")
else:
    # 1. Spearman's Correlation (rank-based)
    spearman_corr, spearman_p = spearmanr(mentions_series, volume_series)
    print(f"Spearman's correlation: {spearman_corr}, p-value: {spearman_p}")

    # 2. Pearson's Correlation (linear)
    pearson_corr, pearson_p = pearsonr(mentions_series, volume_series)
    print(f"Pearson's correlation: {pearson_corr}, p-value: {pearson_p}")

    # 3. Stationarity Test (ADF)
    # adfuller raises ValueError for inputs it cannot process (e.g. a constant
    # series or too few observations for its lag selection); report the reason
    # rather than aborting the rest of the notebook.
    try:
        adf_result = adfuller(volume_series)
    except ValueError as exc:
        print(f"ADF test skipped: {exc}")
    else:
        print(f"ADF Statistic: {adf_result[0]}, p-value: {adf_result[1]}")

## Market vol against item vol

In [None]:
# Total-market history is read from
# <GAME_PRICE_PREDICTION_PATH>/data/market_history/total_market_history.csv
market_history_path = os.path.join(
    GAME_PRICE_PREDICTION_PATH, 'data', 'market_history', 'total_market_history.csv'
)
market_df = pd.read_csv(market_history_path)

# Both frames need datetime 'date' columns to be joined on.
# Note: this converts price_volume_df['date'] in place on the shared frame.
market_df['date'] = pd.to_datetime(market_df['date'])
price_volume_df['date'] = pd.to_datetime(price_volume_df['date'])

# Outer-join item volume with market volume on date; the two 'volume' columns
# become 'volume_item' and 'volume_market'. Rows are ordered by date and any
# missing value is replaced by the last value above it.
# NOTE(review): this forward fill is unbounded and also applies to item volume,
# whereas the earlier merge cell treats missing volume as 0 — confirm which
# policy is intended here.
item_volume = price_volume_df[['date', 'volume']]
market_volume = market_df[['date', 'volume']]
market_vol_df = (
    item_volume
    .merge(market_volume, on='date', how='outer', suffixes=('_item', '_market'))
    .sort_values('date')
    .ffill()
)

# Rolling-mean smoothing of both volume series
window = 5
for source in ('market', 'item'):
    market_vol_df[f'smoothed_volume_{source}'] = (
        market_vol_df[f'volume_{source}'].rolling(window=window).mean()
    )

# Market volume on the primary y-axis, item volume on the secondary one
market_trace = go.Scatter(
    x=market_vol_df['date'],
    y=market_vol_df['smoothed_volume_market'],
    name="Market Volume",
    line={'color': 'blue'},
)
item_trace = go.Scatter(
    x=market_vol_df['date'],
    y=market_vol_df['smoothed_volume_item'],
    name="Item Volume",
    line={'color': 'red'},
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(market_trace, secondary_y=False)
fig.add_trace(item_trace, secondary_y=True)

# Title, horizontal legend above the plot area, and margins
fig.update_layout(
    title_text=f"Smoothed Market Volume vs Item Volume Over Time for {ITEM}",
    legend={'orientation': "h", 'yanchor': "bottom", 'y': 1.02, 'xanchor': "right", 'x': 1},
    margin={'l': 50, 'r': 50, 't': 80, 'b': 50},
)

# Axis titles and grid colour
y_axis_titles = (
    ("Smoothed Market Volume", False),
    ("Smoothed Item Volume", True),
)
for axis_title, on_secondary in y_axis_titles:
    fig.update_yaxes(title_text=axis_title, secondary_y=on_secondary, gridcolor='lightgrey')
fig.update_xaxes(title_text="Date", gridcolor='lightgrey')

fig.show()

# Save a static PNG (no HTML export for this figure)
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

image_path = os.path.join(save_dir, 'timeseries_market_item_vol.png')
fig.write_image(image_path, width=1920, height=1080, scale=2)

print(f"Saved to ./data/figures/{ITEM_SANITIZED}/timeseries_market_item_vol.png")