# BVMT Exploratory Data Analysis

This notebook explores historical market data from the BVMT (Bourse des Valeurs Mobilières de Tunis, the Tunis Stock Exchange) and validates core modeling choices used in the Trading Assistant.

**Dataset**: BVMT historical quotes (CSV files)

**Objectives**:
- Understand data quality and coverage
- Identify liquid stocks and volatility profiles
- Compare forecasting baselines (Prophet vs Simple MA)
- Inspect anomaly patterns in 2022
- Demonstrate sentiment scoring logic
- Run a lightweight recommendation backtest

In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Silence every Python warning so library notices do not clutter cell output.
warnings.filterwarnings('ignore')
# Use seaborn's 'whitegrid' theme for all figures drawn below.
sns.set_theme(style='whitegrid')
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline

## Data Loading and Overview

In [None]:
from modules.shared.data_loader import load_full_dataset

# Build the working DataFrame through the project's shared loader
# (per the notebook intro, this is the BVMT historical quote data).
df = load_full_dataset()
# Preview the first five rows as the cell's rendered output.
df.head(n=5)

In [None]:
# Structural summary: dimensions and column names are printed, then the
# per-column dtypes are shown as the cell's output.
for label, value in (('Shape:', df.shape), ('Columns:', df.columns.tolist())):
    print(label, value)
df.dtypes

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns too.
df.describe(include='all')

## Data Quality Analysis

In [None]:
# Fraction of missing values per column, worst columns first.
missing = df.isnull().mean().sort_values(ascending=False)
# Show the ten columns with the highest missing ratio.
missing.iloc[:10]

In [None]:
# Rows whose recorded volume is exactly zero.
zero_volume = df.loc[df['volume'].eq(0)]
print('Zero-volume rows:', zero_volume.shape[0])

# Earliest and latest values found in the 'date' column.
date_range = (df['date'].min(), df['date'].max())
print('Date range:', date_range)

# Histogram of volumes with 50 bins; only the count (y) axis is log-scaled.
plt.figure(figsize=(8, 4))
ax = sns.histplot(df['volume'], bins=50, log_scale=(False, True))
ax.set(title='Volume Distribution (log scale)', xlabel='Volume', ylabel='Count')
plt.show()

## Stock Selection Analysis

In [None]:
# Per-stock summary: mean volume, mean close and number of rows (num_days).
stock_stats = df.groupby('stock_code').agg(
    volume=('volume', 'mean'),
    close=('close', 'mean'),
    num_days=('date', 'count'),
)

# Ten stocks with the highest average volume, used as the liquidity shortlist.
top10 = stock_stats.sort_values(by='volume', ascending=False).iloc[:10]
top10

In [None]:
# Bar chart of the ten highest average-volume stocks.
plt.figure(figsize=(10, 4))
ax = sns.barplot(x=top10.index, y=top10['volume'])
ax.set_title('Top 10 Stocks by Average Volume')
ax.set_ylabel('Avg Volume')
# Slant the stock codes so neighbouring labels do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()

# Volatility = standard deviation of each stock's close-to-close returns.
# pct_change works in row order, and the loader's ordering is not visible here,
# so sort by (stock_code, date) first - the other time-series cells in this
# notebook sort by date for the same reason.
ordered = df.sort_values(['stock_code', 'date'])
daily_returns = ordered.groupby('stock_code')['close'].pct_change()
volatility = daily_returns.groupby(ordered['stock_code']).std()
# Ten most volatile stocks; stocks with too few rows to produce a std are dropped.
vol_df = volatility.dropna().sort_values(ascending=False).head(10)
vol_df

In [None]:
# Pairwise correlation of daily returns for the six highest-volume stocks.
top6 = top10.index[:6].tolist()
is_top6 = df['stock_code'].isin(top6)
# Wide table: one row per date, one close-price column per stock.
pivot = df.loc[is_top6].pivot_table(values='close', index='date', columns='stock_code')
corr = pivot.pct_change().corr()
plt.figure(figsize=(6, 5))
# Fix the colour range to [-1, 1] so the palette is centred on zero correlation.
ax = sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
ax.set_title('Return Correlation (Top 6)')
plt.show()

## Forecasting Model Comparison

In [None]:
from math import sqrt

def simple_ma_forecast(series, window=5, horizon=5):
    """Forecast a flat line at the most recent moving average of ``series``.

    Args:
        series: pandas Series of observations in chronological order.
        window: Number of trailing observations to average.
        horizon: Number of future steps to forecast.

    Returns:
        numpy array of length ``horizon`` in which every element is the mean
        of the last ``window`` values (or of however many are available when
        the history is shorter than ``window``).

    Raises:
        IndexError: If ``series`` is empty.
    """
    # min_periods=1 keeps the forecast finite when the history is shorter than
    # ``window`` or the last window has gaps; the default (min_periods=window)
    # would silently produce an all-NaN forecast in those cases.
    ma = series.rolling(window=window, min_periods=1).mean()
    last_ma = ma.iloc[-1]
    return np.repeat(last_ma, horizon)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between two equal-length sequences.

    Both inputs are converted to float arrays first, so plain lists work and
    pandas Series are compared by position rather than aligned by index label.

    Args:
        y_true: Observed values (list, numpy array or Series).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The RMSE as a Python float.
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return sqrt(np.mean(errors ** 2))

def mae(y_true, y_pred):
    """Return the mean absolute error between two equal-length sequences.

    Both inputs are converted to float arrays first, so plain lists work and
    pandas Series are compared by position rather than aligned by index label.

    Args:
        y_true: Observed values (list, numpy array or Series).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The MAE as a numpy float.
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(errors))

# Prophet is an optional dependency. Any failure while importing it (missing
# package, broken backend) deliberately just disables the Prophet half of the
# comparison instead of aborting the notebook.
try:
    from prophet import Prophet
    prophet_available = True
except Exception:
    prophet_available = False

# Number of most recent rows per stock held out for evaluation.
holdout = 5

# Compare the models on the five highest average-volume stocks.
stocks = top10.index[:5].tolist()
results = []

for code in stocks:
    # Filter and sort once; both models are fitted and scored on this frame.
    history = df[df['stock_code'] == code].sort_values('date')
    if len(history) < holdout + 2:
        # Not enough rows to hold out the evaluation window and still fit a model.
        print(f'Skipping {code}: only {len(history)} rows')
        continue

    series = history['close']
    train = series[:-holdout]
    test = series[-holdout:]

    # Simple MA
    ma_pred = simple_ma_forecast(train, window=5, horizon=holdout)
    results.append({
        'stock_code': code,
        'model': 'Simple MA',
        'RMSE': rmse(test.values, ma_pred),
        'MAE': mae(test.values, ma_pred)
    })

    # Prophet
    if prophet_available:
        # Prophet expects the columns to be named 'ds' (date) and 'y' (value).
        tmp = history[['date', 'close']].rename(columns={'date': 'ds', 'close': 'y'})
        model = Prophet(daily_seasonality=False, weekly_seasonality=False, yearly_seasonality=True)
        model.fit(tmp.iloc[:-holdout])
        # Predict on the held-out rows' own dates. Extending the training
        # calendar by N consecutive days (freq='D') would score forecasts for
        # dates that differ from the test rows whenever the data has gaps
        # between rows (e.g. days without quotes).
        future = tmp.iloc[-holdout:][['ds']]
        forecast = model.predict(future)['yhat'].values
        results.append({
            'stock_code': code,
            'model': 'Prophet',
            'RMSE': rmse(test.values, forecast),
            'MAE': mae(test.values, forecast)
        })

# One row per (stock, model) with its error metrics.
res_df = pd.DataFrame(results)
res_df

In [None]:
# Grouped bars: one cluster per stock, one bar per model.
# Nothing is drawn when the comparison produced no rows.
if not res_df.empty:
    fig = px.bar(
        data_frame=res_df,
        x='stock_code',
        y='RMSE',
        color='model',
        barmode='group',
        title='RMSE Comparison: Prophet vs Simple MA',
    )
    fig.show()

## Anomaly Detection Examples

In [None]:
from modules.anomaly.detector import detect_anomalies

# Run the project's anomaly detector on the highest average-volume stock,
# looking back 30 days.
sample_code = top10.index[0]
anoms = detect_anomalies(sample_code, lookback_days=30)
# Preview at most five entries; an absent key is treated as "no anomalies".
detected = anoms.get('anomalies_detected', [])
detected[:5]

In [None]:
# Plot the sample stock's volume over time so spikes can be spotted by eye.
stock_df = df.loc[df['stock_code'].eq(sample_code)].sort_values(by='date')
plt.figure(figsize=(10, 4))
ax = plt.gca()
ax.plot(stock_df['date'], stock_df['volume'], label='Volume')
ax.set(title=f'Volume Series: {sample_code}', xlabel='Date', ylabel='Volume')
plt.show()

## Sentiment Analysis Demo

In [None]:
# Demo headlines for the keyword-based sentiment scorer.
# They are double-quoted because two of them contain an apostrophe, which
# would terminate a single-quoted literal and raise a SyntaxError.
headlines = [
    "Résultats solides pour la banque, croissance du bénéfice",
    "Baisse inattendue du chiffre d'affaires",
    "Nouvel investissement majeur annoncé",
    "Crainte d'inflation et pression sur les marges",
]
# Keywords are lower-case because headlines are lower-cased before matching.
positive_keywords = ['croissance', 'solides', 'hausse', 'investissement', 'bénéfice']
negative_keywords = ['baisse', 'crainte', 'pression', 'inflation']

def score_headline(text, positive=None, negative=None):
    """Score a headline by the sentiment keywords it contains.

    Matching is a case-insensitive substring test, so each keyword counts at
    most once per headline however often it occurs.

    Args:
        text: Headline to score.
        positive: Keywords worth +1 each; defaults to the module-level
            ``positive_keywords``.
        negative: Keywords worth -1 each; defaults to the module-level
            ``negative_keywords``.

    Returns:
        Number of positive keywords found minus number of negative ones.
    """
    # ``is None`` (not truthiness) so an explicitly empty list is respected.
    if positive is None:
        positive = positive_keywords
    if negative is None:
        negative = negative_keywords
    t = text.lower()
    gained = sum(1 for w in positive if w.lower() in t)
    lost = sum(1 for w in negative if w.lower() in t)
    return gained - lost

# Score every demo headline, then show headline/score pairs as a table.
scores = list(map(score_headline, headlines))
pd.DataFrame(list(zip(headlines, scores)), columns=['headline', 'score'])

In [None]:
# Distribution of the demo sentiment scores.
plt.figure(figsize=(6, 3))
ax = sns.histplot(scores, bins=5)
ax.set_title('Sentiment Score Distribution (Demo)')
plt.show()

## Recommendation Backtest (1 Month)

In [None]:
from modules.decision.engine import make_recommendation

def backtest_one_month(stock_code, capital=10000):
    """Simulate an all-in / all-out strategy over a stock's last 22 rows.

    For each of the final 22 rows (ordered by date) the recommendation engine
    is queried: BUY spends as much cash as possible on whole shares at that
    row's close, SELL liquidates the whole position, anything else holds.

    Args:
        stock_code: Value matched against ``df['stock_code']``.
        capital: Starting cash.

    Returns:
        A list with one portfolio value (cash + shares * close) per simulated
        row, or ``None`` when the stock has fewer than 25 rows.
    """
    stock_df = df[df['stock_code'] == stock_code].sort_values('date')
    if len(stock_df) < 25:
        return None
    cash = capital
    shares = 0
    equity_curve = []

    for price in stock_df['close'].tail(22):
        # NOTE(review): the engine is given no date, so it presumably returns
        # the same (latest) signal for every row, which would make this a
        # look-ahead backtest - confirm against make_recommendation's API.
        rec = make_recommendation(stock_code, 'moderate')
        action = rec['recommendation']
        # price > 0 also rejects NaN and guards the division below;
        # >= lets the last unit of cash buy a share priced exactly at it.
        if action == 'BUY' and price > 0 and cash >= price:
            bought = int(cash / price)
            # Add to the position: assigning would discard shares bought on an
            # earlier BUY while their cost had already been deducted.
            shares += bought
            cash -= bought * price
        elif action == 'SELL' and shares > 0:
            cash += shares * price
            shares = 0
        equity_curve.append(cash + shares * price)
    return equity_curve

# Backtest the highest average-volume stock and draw its equity curve.
bt_stock = top10.index[0]
curve = backtest_one_month(bt_stock)
# A None or empty curve means there is nothing to plot.
if curve:
    plt.figure(figsize=(8, 3))
    ax = plt.gca()
    ax.plot(curve, label='Strategy')
    ax.set_title(f'Equity Curve (1 Month) - {bt_stock}')
    ax.legend()
    plt.show()

## Conclusions

- Data quality is sufficient for basic forecasting and anomaly detection.
- Liquidity varies significantly across stocks; filters are essential.
- Simple MA provides a strong baseline; Prophet is more flexible when available.
- Anomalies appear around volume/price spikes and low liquidity.
- The recommendation system looks effective for ranking signals, but the one-month, single-stock backtest in this notebook is too short to confirm that; longer backtests are needed.