In [1]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sys
import os
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from scipy.stats import spearmanr, pearsonr
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA

# Project root is taken from the environment. When the variable is unset the
# empty string is used, which os.path.abspath resolves to the current directory.
GAME_PRICE_PREDICTION_PATH = cwd = os.environ.get('GAME_PRICE_PREDICTION_PATH', '')

# Put the resolved root at the front of the import search path
# (presumably so the `python_scripts` package imported below can be found).
sys.path[:0] = [os.path.abspath(GAME_PRICE_PREDICTION_PATH)]

from python_scripts.utilities.api_calls import get_cookie_from_blob, fetch_item_to_df, fetch_items
from python_scripts.sentiment_analysis.config import ITEM, ALL_POLARITY_FILENAME, ITEM_SANITIZED

# ---------------------------------------------------------------------------
# Notebook overview
# - Takes data from mention_data
# - and a range of others; polarity_data, or fetches price history
# - Plots sentiment against price/volume and saves the figures
#
# (Kept as comments rather than a bare string literal: a string expression at
# the end of a cell is echoed as that cell's output.)
# ---------------------------------------------------------------------------

c:\Users\Nukul\Desktop\Code\game_price_prediction\python_scripts\sentiment_analysis


'"" \n- Takes data from mention_data \n- and a range of others; polarity_data, or fetches price history\n- Plots it sexily\n'

### Prerequisites


In [2]:
# Make sure the correct item is imported from config.py
# (ITEM is what the cells below fetch market data for and use in figure titles.)

print(f"ITEM = {ITEM}")

ITEM = M4A1-S | Golden Coil (Factory New)


In [3]:
# filter_file_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'filter_file.py')
# !python "{filter_file_path}"

# mention_counter_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'mention_counter.py')
# !python "{mention_counter_path}"

# mention_data_combiner_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'mention_data_combiner.py')
# !python "{mention_data_combiner_path}"

# vader_polarity_path = os.path.join(GAME_PRICE_PREDICTION_PATH, 'python_scripts', 'sentiment_analysis', 'vader_polarity.py')
# !python "{vader_polarity_path}"


### Fetch price and volume history

In [4]:
### FETCHING ITEM

# Fetch price and volume data FOR ITEM CHOSEN
# get_cookie_from_blob, fetch_items and fetch_item_to_df are project helpers from
# python_scripts.utilities.api_calls; what they do internally is not visible here.
dailyCookie = get_cookie_from_blob()
# NOTE(review): `items` is not referenced by any cell visible in this notebook —
# confirm whether fetch_items() is still needed (e.g. for a side effect) or can go.
items = fetch_items()
# `df` is the shared frame from which later cells read 'date', 'volume' and 'price_usd'.
df = fetch_item_to_df(ITEM, dailyCookie)
print(f"Data for {ITEM} fetched")


Data for M4A1-S | Golden Coil (Factory New) fetched


## timeseries - compound polarity against volume plot

In [12]:
# Plot smoothed compound sentiment against the smoothed volume observed
# `prediction_days` rows later, then save the figure as PNG and HTML.

# --- Inputs -----------------------------------------------------------------
# Polarity scores from CSV; 'date' is converted to datetime for the merge
polarity_csv_path = os.path.join(
    GAME_PRICE_PREDICTION_PATH, 'data', 'reddit_data', 'polarity_all', ALL_POLARITY_FILENAME
)
polarity_df = pd.read_csv(polarity_csv_path)
polarity_df['date'] = pd.to_datetime(polarity_df['date'])

# Expose the index as columns when the market frame has no 'date' column yet
if 'date' not in df.columns:
    df = df.reset_index()

# --- Parameters -------------------------------------------------------------
prediction_days = 18  # Adjust this value as needed
window_size = 10      # width of the moving average, in rows

# --- Transform --------------------------------------------------------------
# Outer-join on date, order chronologically, carry the last known value forward
merged_df = pd.merge(polarity_df, df[['date', 'volume']], on='date', how='outer')
merged_df = merged_df.sort_values('date').ffill()

# Lead the volume by `prediction_days` rows, smooth both series, and drop the
# rows left without a value by the shift or by the rolling window.
# NOTE(review): the shift counts rows, so it equals calendar days only if the
# merged frame has exactly one row per day — confirm against the data.
merged_df = merged_df.assign(
    future_volume=lambda frame: frame['volume'].shift(-prediction_days),
    smooth_compound=lambda frame: frame['compound'].rolling(window=window_size).mean(),
    smooth_future_volume=lambda frame: frame['future_volume'].rolling(window=window_size).mean(),
).dropna(subset=['smooth_compound', 'smooth_future_volume'])

# --- Figure -----------------------------------------------------------------
# One panel with two y-axes: sentiment on the primary, volume on the secondary
fig = make_subplots(specs=[[{"secondary_y": True}]])

trace_specs = [
    ('smooth_compound', "Current Sentiment", 'blue', False),
    ('smooth_future_volume', f"Volume in {prediction_days} days", 'red', True),
]
for column_name, trace_label, trace_color, on_secondary_axis in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=merged_df['date'],
            y=merged_df[column_name],
            name=trace_label,
            line=dict(color=trace_color, width=2.5),
        ),
        secondary_y=on_secondary_axis,
    )

fig.update_layout(
    title_text=f"Current Sentiment vs {prediction_days}-day Future Volume for {ITEM}<br>({window_size}-day moving average)",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=50, r=50, t=80, b=50),
)
fig.update_yaxes(title_text="Sentiment Score (Compound)", secondary_y=False, gridcolor='lightgrey')
fig.update_yaxes(title_text=f"Volume Traded in {prediction_days} days", secondary_y=True, gridcolor='lightgrey')
fig.update_xaxes(title_text="Date", gridcolor='lightgrey')

fig.show()

# --- Persist ----------------------------------------------------------------
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

# Static PNG
png_path = os.path.join(save_dir, f'timeseries_pol_vol_prediction_{prediction_days}days.png')
fig.write_image(png_path, width=1920, height=1080, scale=2)

# Interactive HTML
html_path = os.path.join(save_dir, f'timeseries_pol_vol_prediction_{prediction_days}days.html')
fig.write_html(html_path)

print(f"Saved to ./data/figures/{ITEM_SANITIZED}")

Saved to ./data/figures/m4a1-s___golden_coil_(factory_new)


## timeseries - compound polarity against price plot

In [6]:
# Plot smoothed compound sentiment against the smoothed price observed
# `prediction_days` rows later, then save the figure as PNG and HTML.

# Polarity scores from CSV; 'date' is converted to datetime for the merge
polarity_csv_path = os.path.join(
    GAME_PRICE_PREDICTION_PATH, 'data', 'reddit_data', 'polarity_all', ALL_POLARITY_FILENAME
)
polarity_df = pd.read_csv(polarity_csv_path)
polarity_df['date'] = pd.to_datetime(polarity_df['date'])

# Expose the index as columns when the market frame has no 'date' column yet
if 'date' not in df.columns:
    df = df.reset_index()

# Look-ahead (in rows) and moving-average width
prediction_days = 14  # Adjust this value as needed
window_size = 10

# Outer-join on date, order chronologically, carry the last known value forward
merged_df = pd.merge(polarity_df, df[['date', 'price_usd']], on='date', how='outer')
merged_df = merged_df.sort_values('date').ffill()

# Lead the price by `prediction_days` rows, smooth both series, and drop the
# rows left without a value by the shift or by the rolling window.
# NOTE(review): the shift counts rows, so it equals calendar days only if the
# merged frame has exactly one row per day — confirm against the data.
merged_df = merged_df.assign(
    future_price=lambda frame: frame['price_usd'].shift(-prediction_days),
    smooth_compound=lambda frame: frame['compound'].rolling(window=window_size).mean(),
    smooth_future_price=lambda frame: frame['future_price'].rolling(window=window_size).mean(),
).dropna(subset=['smooth_compound', 'smooth_future_price'])

# One panel with two y-axes: sentiment on the primary, price on the secondary
fig = make_subplots(specs=[[{"secondary_y": True}]])

sentiment_trace = go.Scatter(
    x=merged_df['date'],
    y=merged_df['smooth_compound'],
    name="Current Sentiment",
    line={'color': 'blue', 'width': 2.5},
)
price_trace = go.Scatter(
    x=merged_df['date'],
    y=merged_df['smooth_future_price'],
    name=f"Price in {prediction_days} days",
    line={'color': 'red', 'width': 2.5},
)
fig.add_trace(sentiment_trace, secondary_y=False)
fig.add_trace(price_trace, secondary_y=True)

# Title, horizontal legend above the plot area, and margins
fig.update_layout(
    title_text=f"Current Sentiment vs {prediction_days}-day Future Price for {ITEM}<br>({window_size}-day moving average)",
    legend={'orientation': "h", 'yanchor': "bottom", 'y': 1.02, 'xanchor': "right", 'x': 1},
    margin={'l': 50, 'r': 50, 't': 80, 'b': 50},
)

# Axis titles and grid colour
fig.update_yaxes(title_text="Sentiment Score (Compound)", secondary_y=False, gridcolor='lightgrey')
fig.update_yaxes(title_text=f"Price (USD) in {prediction_days} days", secondary_y=True, gridcolor='lightgrey')
fig.update_xaxes(title_text="Date", gridcolor='lightgrey')

fig.show()

# Output folder for this item's figures
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

# Static PNG
png_path = os.path.join(save_dir, f'timeseries_pol_price_prediction_{prediction_days}days.png')
fig.write_image(png_path, width=1920, height=1080, scale=2)

# Interactive HTML
html_path = os.path.join(save_dir, f'timeseries_pol_price_prediction_{prediction_days}days.html')
fig.write_html(html_path)

print(f"Saved to ./data/figures/{ITEM_SANITIZED}")

Saved to ./data/figures/m4a1-s___golden_coil_(factory_new)


## timeseries - weekly median compound polarity against price plot

In [7]:
# Plot the weekly median compound sentiment against the weekly median of the
# price observed `prediction_weeks * 7` rows later, then save PNG and HTML.

# Polarity scores from CSV; 'date' is converted to datetime for the merge
polarity_csv_path = os.path.join(
    GAME_PRICE_PREDICTION_PATH, 'data', 'reddit_data', 'polarity_all', ALL_POLARITY_FILENAME
)
polarity_df = pd.read_csv(polarity_csv_path)
polarity_df['date'] = pd.to_datetime(polarity_df['date'])

# Expose the index as columns when the market frame has no 'date' column yet
if 'date' not in df.columns:
    df = df.reset_index()

# Look-ahead expressed in weeks
prediction_weeks = 3  # Adjust this value as needed

# Outer-join on date, order chronologically, carry the last known value forward
merged_df = pd.merge(polarity_df, df[['date', 'price_usd']], on='date', how='outer')
merged_df = merged_df.sort_values('date').ffill()

# Weekly period label (as a string) used as the grouping key
merged_df['week'] = merged_df['date'].dt.to_period('W').astype(str)

# Lead the price before aggregating; the shift is counted in rows (7 per week)
lag_rows = prediction_weeks * 7
merged_df['future_price'] = merged_df['price_usd'].shift(-lag_rows)

# Per week: first date seen, median sentiment, median future price.
# Then order by date and drop weeks that lack either value.
weekly_aggregations = {
    'date': 'first',
    'compound': 'median',
    'future_price': 'median',
}
weekly_df = (
    merged_df.groupby('week')
    .agg(weekly_aggregations)
    .reset_index()
    .sort_values('date')
    .dropna(subset=['compound', 'future_price'])
)

# One panel with two y-axes: sentiment on the primary, price on the secondary
fig = make_subplots(specs=[[{"secondary_y": True}]])

trace_specs = [
    ('compound', "Weekly Sentiment", 'blue', False),
    ('future_price', f"Price in {prediction_weeks} weeks", 'red', True),
]
for column_name, trace_label, trace_color, on_secondary_axis in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=weekly_df['date'],
            y=weekly_df[column_name],
            name=trace_label,
            line=dict(color=trace_color, width=2.5),
        ),
        secondary_y=on_secondary_axis,
    )

# Title, horizontal legend above the plot area, and margins
fig.update_layout(
    title_text=f"Weekly Sentiment vs {prediction_weeks}-week Future Price for {ITEM}<br>(Weekly Median Values)",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=50, r=50, t=80, b=50),
)

# Axis titles and grid colour
fig.update_yaxes(title_text="Weekly Median Sentiment Score", secondary_y=False, gridcolor='lightgrey')
fig.update_yaxes(title_text=f"Median Price (USD) in {prediction_weeks} weeks", secondary_y=True, gridcolor='lightgrey')
fig.update_xaxes(title_text="Date", gridcolor='lightgrey')

fig.show()

# Output folder for this item's figures
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

# Static PNG
png_path = os.path.join(save_dir, f'timeseries_pol_price_prediction_{prediction_weeks}weeks_weekly_median.png')
fig.write_image(png_path, width=1920, height=1080, scale=2)

# Interactive HTML
html_path = os.path.join(save_dir, f'timeseries_pol_price_prediction_{prediction_weeks}weeks_weekly_median.html')
fig.write_html(html_path)

print(f"Saved to ./data/figures/{ITEM_SANITIZED}")

Saved to ./data/figures/m4a1-s___golden_coil_(factory_new)


## Windowed scatterplot - compound polarity & volume plot

In [8]:
# Scatter plot of smoothed compound sentiment against smoothed traded volume
# inside a fixed date window, with a least-squares trend line, saved as a PNG.
# `go` (plotly.graph_objects) and `np` (numpy) come from the notebook's first
# cell, so they are not re-imported here.

# Define your date range
# YYYY-MM-DD
start_date = pd.to_datetime('2019-01-01')
end_date = pd.to_datetime('2019-02-01')

# Rolling-mean width, in rows of the date-filtered frame
window = 20

# Load polarity data
polarity_df = pd.read_csv(os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'reddit_data', 'polarity_all', ALL_POLARITY_FILENAME))
polarity_df['date'] = pd.to_datetime(polarity_df['date'])

# Ensure df has a 'date' column
if 'date' not in df.columns:
    df = df.reset_index()

# Merge polarity and volume data, ordered by date, gaps forward-filled
merged_df = pd.merge(polarity_df, df[['date', 'volume']], on='date', how='outer').sort_values('date')
merged_df = merged_df.ffill()

# Filter dataframe for date range (both ends inclusive) and create a copy
date_filtered_df = merged_df[(merged_df['date'] >= start_date) &
                            (merged_df['date'] <= end_date)].copy()

# Calculate smoothed values on the filtered data.
# NOTE(review): because the rolling mean is taken after filtering, the first
# `window - 1` rows of the range have no smoothed value and are dropped below;
# with a window close to the range length only a few points remain.
date_filtered_df['smoothed_compound'] = date_filtered_df['compound'].rolling(window=window).mean()
date_filtered_df['smoothed_volume'] = date_filtered_df['volume'].rolling(window=window).mean()

# Remove any NaN values before fitting
clean_df = date_filtered_df.dropna(subset=['smoothed_volume', 'smoothed_compound'])

# Create the scatter plot
fig = go.Figure()

# Add scatter points (the date is shown on hover)
fig.add_trace(
    go.Scatter(
        x=clean_df['smoothed_volume'],
        y=clean_df['smoothed_compound'],
        mode='markers',
        marker=dict(size=8),
        name='Data Points',
        text=clean_df['date'],
        hovertemplate='Smoothed Volume: %{x}<br>Smoothed Sentiment: %{y}<br>Date: %{text}<extra></extra>'
    )
)

# Add the regression line only when a line is actually determined: at least two
# points AND at least two distinct x values. With identical x values the
# degree-1 fit is rank-deficient and any line drawn would be meaningless.
x_values = clean_df['smoothed_volume']
if len(clean_df) > 1 and x_values.min() < x_values.max():
    try:
        z = np.polyfit(x_values, clean_df['smoothed_compound'], 1)  # z[0]=slope, z[1]=intercept
        fig.add_trace(
            go.Scatter(
                x=x_values,
                y=z[0] * x_values + z[1],
                mode='lines',
                name='Trend line',
                line=dict(color='red')
            )
        )
    except np.linalg.LinAlgError:
        print("Could not calculate regression line due to data issues")
else:
    print("Could not calculate regression line due to data issues")

# Update layout for better readability
fig.update_layout(
    title=f"Smoothed Sentiment vs. Smoothed Volume for {ITEM}<br>Date Range: {start_date.date()} to {end_date.date()}<br>Smoothing Window: {window} days",
    xaxis_title="Smoothed Volume Traded",
    yaxis_title="Smoothed Sentiment Score (Compound)",
    height=600,
    width=800,
)

# Show the figure
fig.show()

# saving
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

# Pick the first unused numeric suffix so earlier scatter plots are not overwritten
i = 1
while os.path.exists(os.path.join(save_dir, f'scatter_pol_vol_{i}.png')):
    i += 1

# Save static PNG
fig.write_image(
    os.path.join(save_dir, f'scatter_pol_vol_{i}.png'),
    width=1920,
    height=1080,
    scale=2
)

print(f"Saved to ./data/figures/{ITEM_SANITIZED}/scatter_pol_vol_{i}.png")

Saved to ./data/figures/m4a1-s___golden_coil_(factory_new)/scatter_pol_vol_1.png


## Windowed scatterplot - compound polarity & price plot

In [9]:
# Scatter plot of smoothed compound sentiment against smoothed price inside a
# fixed date window, with a least-squares trend line, saved as a PNG.
# `go` (plotly.graph_objects) and `np` (numpy) come from the notebook's first
# cell, so they are not re-imported here.

# Define your date range
# YYYY-MM-DD
start_date = pd.to_datetime('2020-05-01')
end_date = pd.to_datetime('2020-06-01')

# Rolling-mean width, in rows of the date-filtered frame
window = 3

# Load polarity data
polarity_df = pd.read_csv(os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'reddit_data', 'polarity_all', ALL_POLARITY_FILENAME))
polarity_df['date'] = pd.to_datetime(polarity_df['date'])

# Ensure df has a 'date' column
if 'date' not in df.columns:
    df = df.reset_index()

# Merge polarity and price data, ordered by date, gaps forward-filled
merged_df = pd.merge(polarity_df, df[['date', 'price_usd']], on='date', how='outer').sort_values('date')
merged_df = merged_df.ffill()

# Filter dataframe for date range (both ends inclusive) and create a copy
date_filtered_df = merged_df[(merged_df['date'] >= start_date) &
                            (merged_df['date'] <= end_date)].copy()

# Calculate smoothed values on the filtered data.
# NOTE(review): because the rolling mean is taken after filtering, the first
# `window - 1` rows of the range have no smoothed value and are dropped below.
date_filtered_df['smoothed_compound'] = date_filtered_df['compound'].rolling(window=window).mean()
date_filtered_df['smoothed_price'] = date_filtered_df['price_usd'].rolling(window=window).mean()

# Remove any NaN values before fitting
clean_df = date_filtered_df.dropna(subset=['smoothed_price', 'smoothed_compound'])

# Create the scatter plot
fig = go.Figure()

# Add scatter points (the date is shown on hover)
fig.add_trace(
    go.Scatter(
        x=clean_df['smoothed_price'],
        y=clean_df['smoothed_compound'],
        mode='markers',
        marker=dict(size=8),
        name='Data Points',
        text=clean_df['date'],
        hovertemplate='Smoothed Price: %{x}<br>Smoothed Sentiment: %{y}<br>Date: %{text}<extra></extra>'
    )
)

# Add the regression line only when a line is actually determined: at least two
# points AND at least two distinct x values. With identical x values the
# degree-1 fit is rank-deficient and any line drawn would be meaningless.
x_values = clean_df['smoothed_price']
if len(clean_df) > 1 and x_values.min() < x_values.max():
    try:
        z = np.polyfit(x_values, clean_df['smoothed_compound'], 1)  # z[0]=slope, z[1]=intercept
        fig.add_trace(
            go.Scatter(
                x=x_values,
                y=z[0] * x_values + z[1],
                mode='lines',
                name='Trend line',
                line=dict(color='red')
            )
        )
    except np.linalg.LinAlgError:
        print("Could not calculate regression line due to data issues")
else:
    print("Could not calculate regression line due to data issues")

# Update layout for better readability
fig.update_layout(
    title=f"Smoothed Sentiment vs. Smoothed Price for {ITEM}<br>Date Range: {start_date.date()} to {end_date.date()}<br>Smoothing Window: {window} days",
    xaxis_title="Smoothed Price (USD)",
    yaxis_title="Smoothed Sentiment Score (Compound)",
    height=600,
    width=800,
)

# Show the figure
fig.show()

# saving
save_dir = os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'figures', ITEM_SANITIZED)
os.makedirs(save_dir, exist_ok=True)

# Pick the first unused numeric suffix so earlier scatter plots are not overwritten
i = 1
while os.path.exists(os.path.join(save_dir, f'scatter_pol_price_{i}.png')):
    i += 1

# Save static PNG
fig.write_image(
    os.path.join(save_dir, f'scatter_pol_price_{i}.png'),
    width=1920,
    height=1080,
    scale=2
)

print(f"Saved to ./data/figures/{ITEM_SANITIZED}/scatter_pol_price_{i}.png")

Saved to ./data/figures/m4a1-s___golden_coil_(factory_new)/scatter_pol_price_1.png


### Calculate stats for compound polarity against volume

In [10]:
# Correlation (Spearman, Pearson) and stationarity (ADF) statistics for the
# 30-row rolling means of compound sentiment and traded volume.

# Load polarity data and parse its 'date' column
polarity_df = pd.read_csv(os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'reddit_data', 'polarity_all', ALL_POLARITY_FILENAME))
polarity_df['date'] = pd.to_datetime(polarity_df['date'])

# Ensure df has a 'date' column
if 'date' not in df.columns:
    df = df.reset_index()

# Outer-join on date, order chronologically, carry the last known value forward
merged_df = pd.merge(polarity_df, df[['date', 'volume']], on='date', how='outer').sort_values('date')
merged_df = merged_df.ffill()

# Calculate smoothened versions with 30-day window
window_size = 30
merged_df['smooth_compound'] = merged_df['compound'].rolling(window=window_size).mean()
merged_df['smooth_volume'] = merged_df['volume'].rolling(window=window_size).mean()

# Drop only rows lacking one of the two analysed series. A bare dropna() would
# also discard rows (possibly all of them) because of NaNs in unrelated columns
# of the polarity file.
merged_df = merged_df.dropna(subset=['smooth_compound', 'smooth_volume'])

print("Item name: " + ITEM)

# NOTE(review): both inputs are rolling means, so neighbouring observations
# share most of their window. The correlation p-values below assume independent
# samples and are therefore likely to overstate significance.

# 1. Spearman's Correlation with smoothened data
spearman_corr, spearman_p = spearmanr(merged_df['smooth_compound'], merged_df['smooth_volume'])
print(f"Spearman's correlation (smoothened): {spearman_corr}, p-value: {spearman_p}")

# 2. Pearson's Correlation with smoothened data
pearson_corr, pearson_p = pearsonr(merged_df['smooth_compound'], merged_df['smooth_volume'])
print(f"Pearson's correlation (smoothened): {pearson_corr}, p-value: {pearson_p}")

# 3. Stationarity Test (ADF) for both smoothened series
# adfuller's null hypothesis is a unit root (non-stationary series); index 0 of
# the result is the test statistic and index 1 the p-value.
adf_volume = adfuller(merged_df['smooth_volume'])
adf_compound = adfuller(merged_df['smooth_compound'])

print("\nStationarity Tests (smoothened data):")
print(f"Smoothed Volume - ADF Statistic: {adf_volume[0]}, p-value: {adf_volume[1]}")
print(f"Smoothed Compound - ADF Statistic: {adf_compound[0]}, p-value: {adf_compound[1]}")

Item name: M4A1-S | Golden Coil (Factory New)
Spearman's correlation (smoothened): -0.42433310020687054, p-value: 1.3380931835888496e-144
Pearson's correlation (smoothened): -0.44992545979893733, p-value: 1.808674586249922e-164

Stationarity Tests (smoothened data):
Smoothed Volume - ADF Statistic: -1.9974396942718329, p-value: 0.28769683116240313
Smoothed Compound - ADF Statistic: -2.9423862191107277, p-value: 0.040636429648983015


### Calculate stats for compound polarity against price

In [11]:
# Correlation (Spearman, Pearson) and stationarity (ADF) statistics for the
# 30-row rolling means of compound sentiment and price.

# Load polarity data and parse its 'date' column
polarity_df = pd.read_csv(os.path.join(GAME_PRICE_PREDICTION_PATH, 'data', 'reddit_data', 'polarity_all', ALL_POLARITY_FILENAME))
polarity_df['date'] = pd.to_datetime(polarity_df['date'])

# Ensure df has a 'date' column
if 'date' not in df.columns:
    df = df.reset_index()

# Outer-join on date, order chronologically, carry the last known value forward
merged_df = pd.merge(polarity_df, df[['date', 'price_usd']], on='date', how='outer').sort_values('date')
merged_df = merged_df.ffill()

# Calculate smoothened versions with 30-day window
window_size = 30
merged_df['smooth_compound'] = merged_df['compound'].rolling(window=window_size).mean()
merged_df['smooth_price'] = merged_df['price_usd'].rolling(window=window_size).mean()

# Drop only rows lacking one of the two analysed series. A bare dropna() would
# also discard rows (possibly all of them) because of NaNs in unrelated columns
# of the polarity file.
merged_df = merged_df.dropna(subset=['smooth_compound', 'smooth_price'])

print("Item name: " + ITEM)

# NOTE(review): both inputs are rolling means, so neighbouring observations
# share most of their window. The correlation p-values below assume independent
# samples and are therefore likely to overstate significance.

# 1. Spearman's Correlation with smoothened data
spearman_corr, spearman_p = spearmanr(merged_df['smooth_compound'], merged_df['smooth_price'])
print(f"Spearman's correlation (smoothened): {spearman_corr}, p-value: {spearman_p}")

# 2. Pearson's Correlation with smoothened data
pearson_corr, pearson_p = pearsonr(merged_df['smooth_compound'], merged_df['smooth_price'])
print(f"Pearson's correlation (smoothened): {pearson_corr}, p-value: {pearson_p}")

# 3. Stationarity Test (ADF) for both smoothened series
# adfuller's null hypothesis is a unit root (non-stationary series); index 0 of
# the result is the test statistic and index 1 the p-value.
adf_price = adfuller(merged_df['smooth_price'])
adf_compound = adfuller(merged_df['smooth_compound'])

print("\nStationarity Tests (smoothened data):")
print(f"Smoothed Price - ADF Statistic: {adf_price[0]}, p-value: {adf_price[1]}")
print(f"Smoothed Compound - ADF Statistic: {adf_compound[0]}, p-value: {adf_compound[1]}")

Item name: M4A1-S | Golden Coil (Factory New)


Spearman's correlation (smoothened): 0.2401658180306609, p-value: 1.408581617757981e-44
Pearson's correlation (smoothened): 0.02094553114412399, p-value: 0.22866035585008082

Stationarity Tests (smoothened data):
Smoothed Price - ADF Statistic: -0.19957005419895746, p-value: 0.9385728464220934
Smoothed Compound - ADF Statistic: -2.9423862191107277, p-value: 0.040636429648983015
