# 1 Imports

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 2 Data Loading

In [41]:
df = pd.read_csv('Data/augmento_btc.csv')
# convert date column to datetime
df['date'] = pd.to_datetime(df['date'])

In [42]:
print(df.shape)
print(df.columns)
df.head()

(71869, 281)
Index(['date', 'listing_close', 'twitter_hacks',
       'twitter_pessimistic_doubtful', 'twitter_banks', 'twitter_selling',
       'twitter_market_manipulation', 'twitter_de_centralisation',
       'twitter_angry', 'twitter_etf',
       ...
       'reddit_price', 'reddit_use_case_applications', 'reddit_rumor',
       'reddit_scam_fraud', 'reddit_airdrop', 'reddit_optimistic',
       'reddit_negative'],
      dtype='object', length=281)


Unnamed: 0,date,listing_close,twitter_hacks,twitter_pessimistic_doubtful,twitter_banks,twitter_selling,twitter_market_manipulation,twitter_de_centralisation,twitter_angry,twitter_etf,...,reddit_buying,reddit_warning,reddit_annoyed_frustrated,reddit_price,reddit_use_case_applications,reddit_rumor,reddit_scam_fraud,reddit_airdrop,reddit_optimistic,reddit_negative
0,2016-11-01 23:00:00,726.6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,6.0
1,2016-11-02 00:00:00,721.96,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,1.0,0.0,0.0,0.0,3.0,3.0
2,2016-11-02 01:00:00,722.49,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,3.0,12.0
3,2016-11-02 02:00:00,721.66,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,4.0
4,2016-11-02 03:00:00,724.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,1.0,0.0,0.0,2.0,5.0,0.0,1.0,0.0,2.0,11.0


In [47]:
df["date"].sort_values(ascending=True)

0       2016-11-01 23:00:00
1       2016-11-02 00:00:00
2       2016-11-02 01:00:00
3       2016-11-02 02:00:00
4       2016-11-02 03:00:00
                ...        
71864   2025-01-13 07:00:00
71865   2025-01-13 08:00:00
71866   2025-01-13 09:00:00
71867   2025-01-13 10:00:00
71868   2025-01-13 11:00:00
Name: date, Length: 71869, dtype: datetime64[ns]

In [51]:
df[df["date"] == "2024-01-01 04:00:00"][["date", "twitter_bearish"]]

Unnamed: 0,date,twitter_bearish
62789,2024-01-01 04:00:00,2.0


In [14]:
import requests
import pprint
import json
import os

if os.path.exists('Data/API_Summary/augmento_sentiment_topics.json'):
    print("Directory 'Data/API_Summary/augmento_sentiment_topics.json' exists")
else:
    print("Directory does not exist. Requesting new info from API and creating summary as json file.")
    r_sentiment_topics = requests.request("GET", "https://api.augmento.ai/v0.1/topics")
    with open('Data/API_Summary/augmento_sentiment_topics.json', 'w') as f:
        json.dump(r_sentiment_topics.json(), f, indent=4)
    
if os.path.exists('Data/API_Summary/augmento_coins.json'):
    print("Directory 'Data/API_Summary/augmento_coins.json' exists")
else:
    print("Directory does not exist. Requesting new info from API and creating summary as json file.")
    r_coins = requests.request("GET", "https://api.augmento.ai/v0.1/coins")
    with open('Data/API_Summary/augmento_coins.json', 'w') as f:
        json.dump(r_coins.json(), f, indent=4)

Directory 'Data/API_Summary/augmento_sentiment_topics.json' exists
Directory 'Data/API_Summary/augmento_coins.json' exists


In [None]:
augmento_sentiment_categories = {
    "negative": {
        "bearish": ["twitter_bearish", "reddit_bearish", "news_bearish"],
        "fear": ["twitter_fear", "reddit_fear", "news_fear"],
        "panic": ["twitter_panic", "reddit_panic", "news_panic"],
        "uncertainty": ["twitter_uncertainty", "reddit_uncertainty", "news_uncertainty"],
    },
    "positive": {

    }
}

# 🔍 How Augmento.ai Strategy Works (Overview)

Augmento uses a **topic-based-sentiment-analysis model** that breaks down online conversations across three major platforms (**Twitter, Bitcointalk, Reddit**).
They then break down online conversations into specific **sentiment topics** as listed in the following table:

## Sentiment Topics:

<img src="Assets/augmento_topics.png" alt="Alt text" width="600" />

## How it works

1. **Social Sentiment Monitoring**: Augmento scrapes and processes data from sources like Reddit, Twitter, Bitcointalk, etc., to capture what people are saying about Bitcoin.

2. **Topic Detection**: Each post or message is categorized into one or more of the **90+ fine-grained topics** (from “Bullish” and “Hodling” to “Hacks” or “FUD_theme”).

3. **Real-Time Trend Analysis**: It monitors how frequently these topics appear over time and how they correlate with market movements.

4. **Market Signals**: The assumption is that increased activity in some topics precedes market trends, e.g., a spike in "FOMO", "Buying", or "Bullish" topics might signal a short-term rally.


# 🪐 Development Strategy Outline

Based on the provided information, we will now proceed to create the following code:

1. Class named **AugmentoAPIClient** to build a dataset and structure the information retrieved from Augmento useful to recreate the dataset from scratch.  
2. Class Named **AugmentoVisualizer** to create basic visuals for the retrieved data supporting the Time Series Analysis (TSA)
3. Class Named **AugmentoExperimentModeller** to build a pipeline to experiment with the data and train and evaluate different models.
4. Class Named **AugmentoPreprocessor** to preprocess the dataset from the AugmentoAPI client in order to perform time series classifications.
5. Class Named **AugmentoModelInterpretor** able to perform different model interpretation strategies.
6. **AugmentoStreamlitApp** to build an App that showcases the work.



# 📈🧠 Time Series Analysis Summary

Outline:
Time series data, multivariate time series in particular consists of multiple variables recorded over time.
It can have high dimensionality and temporal correlation (e.g. stock prices or weather patterns). Classic characteristics of time series are **temporal dependencies**, **seasonality** or **autocorrelation** which might hide underlying patterns or trends within the data. 
In a time series, individual time points (samples) may be highly influenced by previous samples referred to as **time-dependency**. 
**Autocorrelation** referrs to datapoints not only being related to the previous one but also correlated to multiple past other points within the series.

#### 🎯 Goal: 
Understand the intricate characteristics of the time series to understand useful Feature Engineering steps for the time series classifier and potentially other experiments. 

#### 📥 Input: 
Sentiment and Market related features  as time series 
- engineered features from the previous step
- bitcoin price information
- sentiment features

#### 📋 List of analysis steps:

0. Data Wrangling (Creating the Dataset that will be used in the Analysis)
        
        Write and extend the DataWrangling Class to support the preprocessing of the data. 


1. Features and target behaviour over time (curve fitting)

        Line Chart over time
        Rolling statistics (moving average etc.)
        Zoom in/out (look at hourly values, daily, weekly etc.)
        Class wise visualization (Plot time series grouped by class to spot differentiating patterns.)

        Candlestick chart
        BoxPlot
        Scatterplot
        
        Autocorrelation plot

2. Descriptive analysis to detect TS specific characteristics & Stationarity Check (trends, cycles, seasonal variation)

        Statistical tests: Use the Augmented Dickey-Fuller (ADF) or KPSS test to check for stationarity.

        Differencing: If needed, difference the series to remove trend/seasonality.

        Seasonal Plot Decomposition: Apply additive or multiplicative decomposition (seasonal_decompose) to isolate trend, seasonality, and residuals.

        Fourier Transform or STL: For complex or non-fixed seasonal patterns.



3. Explanative analysis to understand the relationships within the data as well as cause and effect.



4. Autocorrelation 

        ACF/PACF plots: Detect autocorrelation and seasonality lags.

    Useful for understanding temporal dependencies that could be helpful for feature engineering or model choice.

5. Time Series Feature Selection & Engineering using [Featuretools](https://featuretools.alteryx.com/en/stable/guides/time_series.html)


6. Feature Distributions


7. [PCA for time series](https://medium.com/@heyamit10/principal-component-analysis-for-time-series-99a5d5eddac9): Helps to preserve temporal information while reducing the datasets complexity. In stock markets PCA can help to indentify dominant influencing the stocks price.  

        Sliding Window PCA
        Dynamic PCA (Recursive PCA, Dynamic Mode Decomposition (DMD))
        Frequency-Based PCA (Spectral PCA)

    PCA requires Standardization of the data, no-missing values, and removal of trends and seasonality (differencing or detrending).


8. Preprocessing of labels

        Creating the target labels
        Label Distribution & Balance

9. EDA for label distributions
        🧹 Anomaly Detection

10. Time Series Dataset creation

        Creating a statistical dataset for simple tabular classification
        Extract features such as:
        Statistical: mean, std, skew, kurtosis.
        Temporal: lag features, rolling windows.
        Frequency: FFT, wavelets.
        Shape-based: slope, area under curve, peak detection.
        Domain-specific patterns (if applicable).


        Simple + Complex Time Series dataset

10. Baseline Statistical dataset creation



#### 📁 Output: 
The Analysis with result in a variety of different datasets from the most basic dataset, over transformed  features up to category-specific datasets.

In [15]:
import plotly.graph_objects as go
import pandas as pd

def plot_btc_candlestick(df, date_column, start_date, end_date=None,
                          open_col='open', high_col='high', low_col='low', close_col='close'):
    # Ensure date column is in datetime format
    df = df.copy()
    df[date_column] = pd.to_datetime(df[date_column])

    # Filter by date range
    mask = (df[date_column] >= pd.to_datetime(start_date))
    if end_date:
        mask &= (df[date_column] <= pd.to_datetime(end_date))
    df_filtered = df[mask]

    # Plot
    fig = go.Figure()

    fig.add_trace(go.Candlestick(
        x=df_filtered[date_column],
        open=df_filtered[open_col],
        high=df_filtered[high_col],
        low=df_filtered[low_col],
        close=df_filtered[close_col],
        name='BTC Price'
    ))

    fig.update_layout(
        title=f'BTC Candlestick Chart from {start_date} to {end_date or "latest"}',
        xaxis_title='Date',
        yaxis_title='Price',
        xaxis_rangeslider_visible=False,
        template='plotly_dark',
        height=600
    )

    fig.show()


In [16]:
plot_btc_candlestick(
    df=df,
    date_column='date',
    open_col='open',
    high_col='high',
    low_col='low',
    close_col='close_binance',
    start_date='2024-05-30',
    #end_date='2022-12-01'
)


In [50]:
df.twitter_bullish.unique()

array([ 1.,  0.,  2.,  3.,  6.,  4.,  5., 11.,  7.,  8.,  9., 10., 16.,
       13., 12., 18., 17., 21., 15., 14., 20., 19., 30., 23., 29., 49.,
       34., 26., 22., 24., 36., 28., 25., 31., 35., 27., 52., 32., 33.,
       39., 37., 51., 41., 72., 46., 42., 50., 53., 54., 40., 55., 82.,
       65., 77., 45., 38., 48., 43., 58., 87., 70., 61.])

In [51]:
df.twitter_bearish.unique()

array([ 0.,  1.,  3.,  4.,  2.,  6.,  9.,  5.,  7., 10.,  8., 12., 14.,
       13., 11., 19., 16., 15., 23., 17., 18., 26., 20., 21., 28., 34.,
       24., 22., 29., 30., 25., 31., 33., 27.])

In [None]:
df.twitter_bearish.unique()

In [None]:
df.twitter_bearish.unique()

In [49]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

def plot_sentiment_dashboard(df, date_column, price_col, sentiment_score_col,
                              twitter_col, reddit_col, bitcointalk_col,
                              topic_col=None):
    df = df.copy()
    df[date_column] = pd.to_datetime(df[date_column])

    # Latest sentiment score
    current_sentiment = round(df[sentiment_score_col].iloc[-1], 3)

    # Prepare hover text if topics are available
    if topic_col and topic_col in df.columns:
        hover_text = df[topic_col]
    else:
        hover_text = None

    # Subplots layout
    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"type": "indicator"}, {"type": "xy"}],
               [None, {"type": "xy"}]],
        column_widths=[0.25, 0.75],
        row_heights=[0.6, 0.4],
        vertical_spacing=0.08
    )

    # Gauge chart
    fig.add_trace(go.Indicator(
        mode="gauge+number",
        value=current_sentiment,
        title={'text': "Bitcoin sentiment"},
        gauge={
            'axis': {'range': [0, 1]},
            'bar': {'color': "gold"},
            'bgcolor': "black"
        },
        number={'font': {'color': 'white'}},
    ), row=1, col=1)

    # BTC Price and Sentiment Score
    fig.add_trace(go.Scatter(
        x=df[date_column],
        y=df[price_col],
        name='XBTUSD',
        line=dict(color='white')
    ), row=1, col=2)

    fig.add_trace(go.Scatter(
        x=df[date_column],
        y=df[sentiment_score_col],
        name='Sentiment Score',
        line=dict(color='orange'),
        hovertext=hover_text,
        hoverinfo="text+y" if hover_text is not None else "y"
    ), row=1, col=2)

    # Sentiment sources
    fig.add_trace(go.Scatter(
        x=df[date_column],
        y=df[bitcointalk_col],
        name='Bitcointalk',
        line=dict(color='gold')
    ), row=2, col=2)

    fig.add_trace(go.Scatter(
        x=df[date_column],
        y=df[reddit_col],
        name='Reddit',
        line=dict(color='orangered')
    ), row=2, col=2)

    fig.add_trace(go.Scatter(
        x=df[date_column],
        y=df[twitter_col],
        name='Twitter',
        line=dict(color='deepskyblue')
    ), row=2, col=2)

    # Layout
    fig.update_layout(
        template="plotly_dark",
        height=750,
        title="Bitcoin Social Sentiment (Hourly Updates)",
        legend=dict(title="Click to isolate sources"),
    )

    fig.update_yaxes(title_text="XBTUSD / Sentiment", row=1, col=2)
    fig.update_yaxes(title_text="Sources", range=[0, 1], row=2, col=2)

    fig.show()


plot_sentiment_dashboard(
    df=df,
    date_column='date',
    price_col='close',
    sentiment_score_col='sentiment_score',
    twitter_col='twitter_sentiment',
    reddit_col='reddit_sentiment',
    bitcointalk_col='bitcointalk_sentiment',
    topic_col='trending_topics'  # Optional: column with topic text
)

KeyError: 'sentiment_score'

In [None]:
# plot sentiment for bullish and bearish
def plot_sentiment_curve(df):
    fig = go.Figure()

    # Bullish sentiment
    fig.add_trace(go.Scatter(
        x=df['date'],
        y=df['twitter_bullish'],
        name='Twitter Bullish',
        line=dict(color='gold')
    ))

    # Bearish sentiment
    fig.add_trace(go.Scatter(
        x=df['date'],
        y=df['twitter_bearish'],
        name='Twitter Bearish',
        line=dict(color='orangered')
    ))

    # Layout
    fig.update_layout(
        title="Twitter Sentiment (Bullish vs Bearish)",
        xaxis_title="Date",
        yaxis_title="Sentiment Score",
        template="plotly_dark",
        height=600
    )

    fig.show()
plot_sentiment_curve(df)

In [None]:
df["twitter_bullish"].describe()

In [21]:
df["twitter_bullish"].describe()

count    64470.000000
mean         0.067044
std          0.959156
min         -1.809292
25%         -0.506333
50%          0.014851
75%          0.579466
max          4.705504
Name: twitter_bullish, dtype: float64

In [23]:
df["Negative"].describe()

KeyError: 'Negative'

In [24]:
# read new data from Data/augmento_btc.csv
df_original = pd.read_csv('Data/augmento_btc.csv')
df_original['date'] = pd.to_datetime(df_original['date'])


In [35]:
df_original["twitter_uncertain"].describe()

count    71869.000000
mean         2.906997
std          2.783781
min          0.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         35.000000
Name: twitter_uncertain, dtype: float64

In [34]:
df["twitter_uncertain"].describe()

count    64470.000000
mean         0.014763
std          0.950134
min         -1.337792
25%         -0.547386
50%         -0.049723
75%          0.492622
max          7.356677
Name: twitter_uncertain, dtype: float64

In [37]:
# check df for negative values
# keep all ols except for date
dfcheck = df_original.copy()
dfcheck = dfcheck.drop(columns=["date"])



In [None]:
# check for negative values and show only the columns with negative values
dfcheck[dfcheck < 0] = None


listing_close                   0
twitter_hacks                   0
twitter_pessimistic_doubtful    0
twitter_banks                   0
twitter_selling                 0
                               ..
reddit_rumor                    0
reddit_scam_fraud               0
reddit_airdrop                  0
reddit_optimistic               0
reddit_negative                 0
Length: 280, dtype: int64

In [39]:
dfcheck[dfcheck < 0].count()

listing_close                   0
twitter_hacks                   0
twitter_pessimistic_doubtful    0
twitter_banks                   0
twitter_selling                 0
                               ..
reddit_rumor                    0
reddit_scam_fraud               0
reddit_airdrop                  0
reddit_optimistic               0
reddit_negative                 0
Length: 280, dtype: int64