In [1]:
# Standard libraries
import sys
import os
from datetime import datetime

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy.stats import spearmanr, pearsonr
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from scipy import stats

# Custom imports
GAME_PRICE_PREDICTION_PATH = os.environ.get('GAME_PRICE_PREDICTION_PATH', '')
sys.path.insert(0, os.path.abspath(GAME_PRICE_PREDICTION_PATH))
from python_scripts.utilities.api_calls import get_cookie_from_blob, fetch_item_to_df, fetch_items
from python_scripts.sentiment_analysis.config import ITEM, ALL_POLARITY_FILENAME, ITEM_SANITIZED

c:\Users\Nukul\Desktop\Code\game_price_prediction\python_scripts\sentiment_analysis


### Prerequisites


In [2]:
# Make sure the correct item is imported from config.py
print(f"ITEM = {ITEM}")

ITEM = MAC-10 | Disco Tech (Field-Tested)


In [3]:
# filter_file_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'filter_file.py')
# !python "{filter_file_path}"

# mention_counter_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'mention_counter.py')
# !python "{mention_counter_path}"

# mention_data_combiner_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'mention_data_combiner.py')
# !python "{mention_data_combiner_path}"

# vader_polarity_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'vader_polarity.py')
# !python "{vader_polarity_path}"


### Fetch item history and import polarity data for item

In [4]:
daily_cookie = get_cookie_from_blob()
price_volume_df = fetch_item_to_df(ITEM, daily_cookie)

# load polarity data
polarity_path = os.path.join(
        GAME_PRICE_PREDICTION_PATH, 
        'data', 
        'reddit_data', 
        'polarity_all', 
        ALL_POLARITY_FILENAME
    )
polarity_df = pd.read_csv(polarity_path)
polarity_df['date'] = pd.to_datetime(polarity_df['date'])

## Merge dataframes, handle missing values and outliers

In [5]:
# Ensure df has a 'date' column
if 'date' not in price_volume_df.columns:
    price_volume_df = price_volume_df.reset_index()

# Merge polarity and price/volume data
merged_df = pd.merge(
    polarity_df, 
    price_volume_df[['date', 'price_usd', 'volume']], 
    on='date', 
    how='outer'
).sort_values('date')

# print("Missing values before handling:")
# print(merged_df.isnull().sum())

# Handle missing values more conservatively:
# 1. First handle price/volume data
merged_df['price_usd'] = merged_df['price_usd'].ffill(limit=2)  # only fill 2-day gaps
merged_df['volume'] = merged_df['volume'].fillna(0)  # no trades = 0 volume

# 2. For sentiment data, only interpolate very short gaps
sentiment_cols = ['compound', 'pos', 'neu', 'neg']
for col in sentiment_cols:
    merged_df[col] = merged_df[col].interpolate(method='linear', limit=1)  # only fill 1-day gaps

# 3. Drop remaining rows with missing values
merged_df = merged_df.dropna()

# print("\nMissing values after handling:")
# print(merged_df.isnull().sum())
# print(f"\nRows remaining: {len(merged_df)}")

# Remove outliers (optional)
# z_scores = np.abs(stats.zscore(merged_df[['compound', 'price_usd', 'volume']]))
# merged_df = merged_df[(z_scores < 3).all(axis=1)]

## timeseries - compound polarity against volume plot

In [6]:
# Create a copy for this analysis
polarity_vol_df = merged_df.copy()

# Set the prediction lag in days
prediction_days = 10  # Adjust this value as needed

# Shift volume forward (future volume)
polarity_vol_df['future_volume'] = polarity_vol_df['volume'].shift(-prediction_days)

# Calculate smoothened versions
window_size = 20
polarity_vol_df['smooth_compound'] = polarity_vol_df['compound'].rolling(window=window_size).mean()
polarity_vol_df['smooth_future_volume'] = polarity_vol_df['future_volume'].rolling(window=window_size).mean()

# Remove NaN values created by the shift and smoothing
polarity_vol_df = polarity_vol_df.dropna(subset=['smooth_compound', 'smooth_future_volume'])

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add smoothened traces
fig.add_trace(
    go.Scatter(x=polarity_vol_df['date'], y=polarity_vol_df['smooth_compound'], 
               name="Current Sentiment", line=dict(color='blue', width=2.5)),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=polarity_vol_df['date'], y=polarity_vol_df['smooth_future_volume'], 
               name=f"Volume in {prediction_days} days", line=dict(color='red', width=2.5)),
    secondary_y=True,
)

# Update layout
fig.update_layout(
    title_text=f"Current Sentiment vs {prediction_days}-day Future Volume for {ITEM}<br>({window_size}-day moving average)",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=50, r=50, t=80, b=50),
)

# Update axes
fig.update_yaxes(title_text="Sentiment Score (Compound)", secondary_y=False, gridcolor='lightgrey')
fig.update_yaxes(title_text=f"Volume Traded in {prediction_days} days", secondary_y=True, gridcolor='lightgrey')
fig.update_xaxes(title_text="Date", gridcolor='lightgrey')

# Show the figure
fig.show()

# Save figures
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

fig.write_image(
    os.path.join(save_dir, f'timeseries_pol_vol_prediction_{prediction_days}days.png'),
    width=1920, height=1080, scale=2
)

fig.write_html(os.path.join(save_dir, f'timeseries_pol_vol_prediction_{prediction_days}days.html'))
print(f"Saved to ./data/figures/{ITEM_SANITIZED}")

Saved to ./data/figures/mac-10___disco_tech_(field-tested)


## timeseries - compound polarity against price plot

In [7]:
# Create a copy for this analysis
polarity_price_df = merged_df.copy()

# Set the prediction lag in days
prediction_days = 1  # Adjust this value as needed

# Shift price forward (future price)
polarity_price_df['future_price'] = polarity_price_df['price_usd'].shift(-prediction_days)

# Calculate smoothened versions
window_size = 10
polarity_price_df['smooth_compound'] = polarity_price_df['compound'].rolling(window=window_size).mean()
polarity_price_df['smooth_future_price'] = polarity_price_df['future_price'].rolling(window=window_size).mean()

# Remove NaN values created by the shift and smoothing
polarity_price_df = polarity_price_df.dropna(subset=['smooth_compound', 'smooth_future_price'])

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add smoothened traces
fig.add_trace(
    go.Scatter(x=polarity_price_df['date'], y=polarity_price_df['smooth_compound'], 
               name="Current Sentiment", line=dict(color='blue', width=2.5)),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=polarity_price_df['date'], y=polarity_price_df['smooth_future_price'], 
               name=f"Price in {prediction_days} days", line=dict(color='red', width=2.5)),
    secondary_y=True,
)

# Update layout
fig.update_layout(
    title_text=f"Current Sentiment vs {prediction_days}-day Future Price for {ITEM}<br>({window_size}-day moving average)",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=50, r=50, t=80, b=50),
)

# Update axes
fig.update_yaxes(title_text="Sentiment Score (Compound)", secondary_y=False, gridcolor='lightgrey')
fig.update_yaxes(title_text=f"Price (USD) in {prediction_days} days", secondary_y=True, gridcolor='lightgrey')
fig.update_xaxes(title_text="Date", gridcolor='lightgrey')

# Show the figure
fig.show()

# Save figures
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

fig.write_image(
    os.path.join(save_dir, f'timeseries_pol_price_prediction_{prediction_days}days.png'),
    width=1920, height=1080, scale=2
)

fig.write_html(os.path.join(save_dir, f'timeseries_pol_price_prediction_{prediction_days}days.html'))

## Windowed scatterplot - compound polarity & volume plot

In [15]:
# Define your date range
start_date = pd.to_datetime('2013-05-01')
end_date = pd.to_datetime('2023-06-01')
window = 10

# Filter dataframe for date range and create a copy
scatter_vol_df = merged_df[(merged_df['date'] >= start_date) & 
                          (merged_df['date'] <= end_date)].copy()

# Calculate smoothed values
scatter_vol_df['smoothed_compound'] = scatter_vol_df['compound'].rolling(window=window).mean()
scatter_vol_df['smoothed_volume'] = scatter_vol_df['volume'].rolling(window=window).mean()

# Remove any NaN values before fitting
clean_df = scatter_vol_df.dropna(subset=['smoothed_volume', 'smoothed_compound'])

# Create the scatter plot
fig = go.Figure()

# Add scatter points
fig.add_trace(
    go.Scatter(
        x=clean_df['smoothed_volume'],
        y=clean_df['smoothed_compound'],
        mode='markers',
        marker=dict(size=8),
        name='Data Points',
        text=clean_df['date'],
        hovertemplate='Smoothed Volume: %{x}<br>Smoothed Sentiment: %{y}<br>Date: %{text}<extra></extra>'
    )
)

# Add regression line if we have valid data
if len(clean_df) > 1:  # Need at least 2 points for a line
    z = np.polyfit(clean_df['smoothed_volume'], clean_df['smoothed_compound'], 1)
    fig.add_trace(
        go.Scatter(
            x=clean_df['smoothed_volume'],
            y=z[0] * clean_df['smoothed_volume'] + z[1],
            mode='lines',
            name=f'Trend line',
            line=dict(color='red')
        )
    )
else:
    print("Not enough data points to fit a regression line.")


# Update layout
fig.update_layout(
    title=f"Smoothed Sentiment vs. Smoothed Volume for {ITEM}<br>Date Range: {start_date.date()} to {end_date.date()}<br>Smoothing Window: {window} days",
    xaxis_title="Smoothed Volume Traded",
    yaxis_title="Smoothed Sentiment Score (Compound)",
    height=600,
    width=800,
)

# Show the figure
fig.show()

# Save figures
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

i = 1
while os.path.exists(os.path.join(save_dir, f'scatter_pol_vol_{i}.png')):
    i += 1

fig.write_image(
    os.path.join(save_dir, f'scatter_pol_vol_{i}.png'),
    width=1920, 
    height=1080,
    scale=2
)

## Windowed scatterplot - compound polarity & price plot

In [9]:
# Define your date range
start_date = pd.to_datetime('2016-01-01')
end_date = pd.to_datetime('2023-02-01')
window = 10

# Filter dataframe for date range and create a copy
scatter_price_df = merged_df[(merged_df['date'] >= start_date) & 
                            (merged_df['date'] <= end_date)].copy()

# Calculate smoothed values
scatter_price_df['smoothed_compound'] = scatter_price_df['compound'].rolling(window=window).mean()
scatter_price_df['smoothed_price'] = scatter_price_df['price_usd'].rolling(window=window).mean()

# Remove any NaN values before fitting
clean_df = scatter_price_df.dropna(subset=['smoothed_price', 'smoothed_compound'])

# Create the scatter plot
fig = go.Figure()

# Add scatter points
fig.add_trace(
    go.Scatter(
        x=clean_df['smoothed_price'],
        y=clean_df['smoothed_compound'],
        mode='markers',
        marker=dict(size=8),
        name='Data Points',
        text=clean_df['date'],
        hovertemplate='Smoothed Price: %{x}<br>Smoothed Sentiment: %{y}<br>Date: %{text}<extra></extra>'
    )
)

# Add regression line if we have valid data
if len(clean_df) > 1:  # Need at least 2 points for a line
    z = np.polyfit(clean_df['smoothed_price'], clean_df['smoothed_compound'], 1)
    fig.add_trace(
        go.Scatter(
            x=clean_df['smoothed_price'],
            y=z[0] * clean_df['smoothed_price'] + z[1],
            mode='lines',
            name=f'Trend line',
            line=dict(color='red')
        )
    )

# Update layout
fig.update_layout(
    title=f"Smoothed Sentiment vs. Smoothed Price for {ITEM}<br>Date Range: {start_date.date()} to {end_date.date()}<br>Smoothing Window: {window} days",
    xaxis_title="Smoothed Price (USD)",
    yaxis_title="Smoothed Sentiment Score (Compound)",
    height=600,
    width=800,
)

# Show the figure
fig.show()

# Save figures
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

i = 1
while os.path.exists(os.path.join(save_dir, f'scatter_pol_price_{i}.png')):
    i += 1

fig.write_image(
    os.path.join(save_dir, f'scatter_pol_price_{i}.png'),
    width=1920, 
    height=1080,
    scale=2
)

### Calculate stats for compound polarity against volume

In [10]:
# Create a copy for statistical analysis
stats_vol_df = merged_df.copy()

# Calculate smoothened versions with 30-day window
window_size = 30
stats_vol_df['smooth_compound'] = stats_vol_df['compound'].rolling(window=window_size).mean()
stats_vol_df['smooth_volume'] = stats_vol_df['volume'].rolling(window=window_size).mean()
stats_vol_df = stats_vol_df.dropna()

print("Item name: " + ITEM)

# 1. Test for normality first (this affects which correlation test to use)
from scipy.stats import shapiro

print("\nNormality Tests:")
_, p_value_compound = shapiro(stats_vol_df['smooth_compound'])
print(f"Compound Sentiment - Shapiro p-value: {p_value_compound}")
_, p_value_volume = shapiro(stats_vol_df['smooth_volume'])
print(f"Volume - Shapiro p-value: {p_value_volume}")

print("\nCorrelation Analysis:")
if p_value_compound < 0.05 or p_value_volume < 0.05:
    # If not normal, use Spearman
    spearman_corr, spearman_p = spearmanr(stats_vol_df['smooth_compound'], stats_vol_df['smooth_volume'])
    print(f"Using Spearman (non-parametric) due to non-normal distribution")
    print(f"Correlation: {spearman_corr:.3f}, p-value: {spearman_p:.3f}")
else:
    # If normal, use Pearson
    pearson_corr, pearson_p = pearsonr(stats_vol_df['smooth_compound'], stats_vol_df['smooth_volume'])
    print(f"Using Pearson (parametric) as data is normally distributed")
    print(f"Correlation: {pearson_corr:.3f}, p-value: {pearson_p:.3f}")

# 3. Stationarity Test with interpretation
adf_volume = adfuller(stats_vol_df['smooth_volume'])
adf_compound = adfuller(stats_vol_df['smooth_compound'])

print("\nStationarity Tests:")
print("Volume:")
print(f"ADF Statistic: {adf_volume[0]:.3f}")
print(f"p-value: {adf_volume[1]:.3f}")
print("Stationary" if adf_volume[1] < 0.05 else "Non-stationary")

print("\nCompound Sentiment:")
print(f"ADF Statistic: {adf_compound[0]:.3f}")
print(f"p-value: {adf_compound[1]:.3f}")
print("Stationary" if adf_compound[1] < 0.05 else "Non-stationary")

# 4. Effect Size (Cohen's d)
from scipy import stats

def cohens_d(x, y):
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    return (np.mean(x) - np.mean(y)) / np.sqrt(((nx-1)*np.std(x, ddof=1) ** 2 + (ny-1)*np.std(y, ddof=1) ** 2) / dof)

print("\nEffect Size:")
d = cohens_d(stats_vol_df['smooth_compound'], stats.zscore(stats_vol_df['smooth_volume']))
print(f"Cohen's d: {d:.3f}")
print("Effect size interpretation:")
print("Small" if abs(d) < 0.5 else "Medium" if abs(d) < 0.8 else "Large")

Item name: MAC-10 | Disco Tech (Field-Tested)

Normality Tests:
Compound Sentiment - Shapiro p-value: 3.6591577232129e-19
Volume - Shapiro p-value: 1.3487647234139403e-23

Correlation Analysis:
Using Spearman (non-parametric) due to non-normal distribution
Correlation: -0.266, p-value: 0.000

Stationarity Tests:
Volume:
ADF Statistic: -2.231
p-value: 0.195
Non-stationary

Compound Sentiment:
ADF Statistic: -2.066
p-value: 0.259
Non-stationary

Effect Size:
Cohen's d: -0.233
Effect size interpretation:
Small


### Calculate stats for compound polarity against price

In [11]:
# Create a copy for statistical analysis
stats_price_df = merged_df.copy()

# Calculate smoothened versions with 30-day window
window_size = 30
stats_price_df['smooth_compound'] = stats_price_df['compound'].rolling(window=window_size).mean()
stats_price_df['smooth_price'] = stats_price_df['price_usd'].rolling(window=window_size).mean()
stats_price_df = stats_price_df.dropna()

print("Item name: " + ITEM)

# 1. Test for normality first (this affects which correlation test to use)
from scipy.stats import shapiro

print("\nNormality Tests:")
_, p_value_compound = shapiro(stats_price_df['smooth_compound'])
print(f"Compound Sentiment - Shapiro p-value: {p_value_compound}")
_, p_value_price = shapiro(stats_price_df['smooth_price'])
print(f"Price - Shapiro p-value: {p_value_price}")

print("\nCorrelation Analysis:")
if p_value_compound < 0.05 or p_value_price < 0.05:
    # If not normal, use Spearman
    spearman_corr, spearman_p = spearmanr(stats_price_df['smooth_compound'], stats_price_df['smooth_price'])
    print(f"Using Spearman (non-parametric) due to non-normal distribution")
    print(f"Correlation: {spearman_corr:.3f}, p-value: {spearman_p:.3f}")
else:
    # If normal, use Pearson
    pearson_corr, pearson_p = pearsonr(stats_price_df['smooth_compound'], stats_price_df['smooth_price'])
    print(f"Using Pearson (parametric) as data is normally distributed")
    print(f"Correlation: {pearson_corr:.3f}, p-value: {pearson_p:.3f}")

# 3. Stationarity Test with interpretation
adf_price = adfuller(stats_price_df['smooth_price'])
adf_compound = adfuller(stats_price_df['smooth_compound'])

print("\nStationarity Tests:")
print("Price:")
print(f"ADF Statistic: {adf_price[0]:.3f}")
print(f"p-value: {adf_price[1]:.3f}")
print("Stationary" if adf_price[1] < 0.05 else "Non-stationary")

print("\nCompound Sentiment:")
print(f"ADF Statistic: {adf_compound[0]:.3f}")
print(f"p-value: {adf_compound[1]:.3f}")
print("Stationary" if adf_compound[1] < 0.05 else "Non-stationary")

# 4. Effect Size (Cohen's d)
from scipy import stats

def cohens_d(x, y):
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    return (np.mean(x) - np.mean(y)) / np.sqrt(((nx-1)*np.std(x, ddof=1) ** 2 + (ny-1)*np.std(y, ddof=1) ** 2) / dof)

print("\nEffect Size:")
d = cohens_d(stats_price_df['smooth_compound'], stats.zscore(stats_price_df['smooth_price']))
print(f"Cohen's d: {d:.3f}")
print("Effect size interpretation:")
print("Small" if abs(d) < 0.5 else "Medium" if abs(d) < 0.8 else "Large")

Item name: MAC-10 | Disco Tech (Field-Tested)

Normality Tests:
Compound Sentiment - Shapiro p-value: 3.6591577232129e-19
Price - Shapiro p-value: 5.186577449581044e-29

Correlation Analysis:
Using Spearman (non-parametric) due to non-normal distribution
Correlation: 0.522, p-value: 0.000

Stationarity Tests:
Price:
ADF Statistic: -10.049
p-value: 0.000
Stationary

Compound Sentiment:
ADF Statistic: -2.066
p-value: 0.259
Non-stationary

Effect Size:
Cohen's d: -0.233
Effect size interpretation:
Small
