# Binance Price Tracking Data Analysis

This notebook analyzes how cryptocurrency prices move after a price alert is triggered.

## Dataset Overview
- **Source**: Automated tracking of price movements for 1 hour after alerts
- **Features**: Price data, volatility metrics, market behavior, timing features
- **Use Cases**: Predictive modeling, pattern recognition, strategy optimization

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings are hidden too — consider narrowing it while debugging.
warnings.filterwarnings('ignore')

# Import our custom data science utilities
# (project-local module; the `processor` object built from this class supplies
# the data used by the later cells)
from data_science_utils import TrackingDataProcessor

# Set plotting style
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Load and explore the dataset
# TrackingDataProcessor is constructed with no arguments, so where it reads the
# tracking data from is decided inside data_science_utils (not visible here).
processor = TrackingDataProcessor()
processor.print_dataset_summary()

# Load analytics DataFrame
# `df` is the shared table that the later analysis cells read from.
df = processor.to_analytics_dataframe()
print(f"\nDataFrame shape: {df.shape}")
# Trailing expression: the notebook renders the first rows as the cell output.
df.head()

## 1. Data Exploration

In [None]:
# Basic statistics: column dtypes / non-null counts, then numeric summaries
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nNumerical Features Summary:")
# Trailing expression: rendered by the notebook as the cell output.
df.describe().round(4)

In [None]:
# Distribution of key metrics, drawn as a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
ax_change, ax_volatility, ax_direction, ax_threshold = axes.flat

# Top-left: histogram of the final change, with a dashed marker at 0%
ax_change.hist(df['final_change_percent'], bins=30, alpha=0.7, color='skyblue')
ax_change.axvline(0, color='red', linestyle='--', alpha=0.7)
ax_change.set(
    title='Distribution of Final Price Changes',
    xlabel='Final Change (%)',
    ylabel='Frequency',
)

# Top-right: histogram of the per-session volatility
ax_volatility.hist(df['volatility_stdev'], bins=30, alpha=0.7, color='lightcoral')
ax_volatility.set(
    title='Distribution of Volatility',
    xlabel='Volatility (Standard Deviation %)',
    ylabel='Frequency',
)

# Bottom-left: one overlaid histogram of the final change per alert direction
direction_names = df['alert_direction_name']
for direction_name in direction_names.unique():
    ax_direction.hist(
        df.loc[direction_names == direction_name, 'final_change_percent'],
        bins=20,
        alpha=0.6,
        label=f'{direction_name.title()} alerts',
    )
ax_direction.axvline(0, color='red', linestyle='--', alpha=0.7)
ax_direction.set(
    title='Final Change by Alert Direction',
    xlabel='Final Change (%)',
    ylabel='Frequency',
)
ax_direction.legend()

# Bottom-right: alert threshold plotted against the final change
ax_threshold.scatter(df['threshold_value'], df['final_change_percent'], alpha=0.6)
ax_threshold.set(
    title='Alert Threshold vs Final Performance',
    xlabel='Alert Threshold (%)',
    ylabel='Final Change (%)',
)

plt.tight_layout()
plt.show()

## 2. Symbol Analysis

In [None]:
# Per-symbol summary: mean / spread / number of alerts for the final change,
# plus mean volatility and mean positive-move ratio, all rounded to 4 decimals.
symbol_stats = (
    df.groupby('symbol')
    .agg(
        avg_change=('final_change_percent', 'mean'),
        change_std=('final_change_percent', 'std'),
        alert_count=('final_change_percent', 'count'),
        avg_volatility=('volatility_stdev', 'mean'),
        positive_ratio=('positive_move_ratio', 'mean'),
    )
    .round(4)
)

# Keep only symbols backed by at least 3 alerts, ranked best average first
has_enough_alerts = symbol_stats['alert_count'] >= 3
symbol_stats = symbol_stats.loc[has_enough_alerts].sort_values('avg_change', ascending=False)

print("Symbol Performance (min 3 alerts):")
print(symbol_stats.head(10))

In [None]:
# Visualize top performing symbols: a ranked bar chart next to a
# volatility-vs-performance bubble chart.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Top 10 by average performance
top_10 = symbol_stats.head(10)
axes[0].barh(range(len(top_10)), top_10['avg_change'], color='green', alpha=0.7)
axes[0].set_yticks(range(len(top_10)))
axes[0].set_yticklabels(top_10.index)
# barh places position 0 at the bottom of the axis. symbol_stats is sorted by
# avg_change descending where it is built, so without flipping the axis the
# best symbol would be drawn last; invert so the chart reads as a ranking.
axes[0].invert_yaxis()
axes[0].set_xlabel('Average Final Change (%)')
axes[0].set_title('Top 10 Performing Symbols')
axes[0].axvline(0, color='red', linestyle='--', alpha=0.7)

# Volatility vs Performance scatter (marker area scales with the alert count)
axes[1].scatter(symbol_stats['avg_volatility'], symbol_stats['avg_change'],
                s=symbol_stats['alert_count'] * 10, alpha=0.6)
axes[1].set_xlabel('Average Volatility (%)')
axes[1].set_ylabel('Average Final Change (%)')
axes[1].set_title('Volatility vs Performance (bubble size = alert count)')
axes[1].axhline(0, color='red', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

## 3. Temporal Analysis

In [None]:
# Performance by time of day and day of week
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Hour of day analysis
hourly_performance = df.groupby('session_start_hour')['final_change_percent'].agg(['mean', 'count'])
hourly_performance = hourly_performance[hourly_performance['count'] >= 2]  # Min 2 alerts per hour

axes[0].bar(hourly_performance.index, hourly_performance['mean'], alpha=0.7, color='lightblue')
axes[0].axhline(0, color='red', linestyle='--', alpha=0.7)
axes[0].set_xlabel('Hour of Day (UTC)')
axes[0].set_ylabel('Average Final Change (%)')
axes[0].set_title('Performance by Hour of Day')
axes[0].set_xticks(range(0, 24, 2))

# Day of week analysis
weekday_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
weekly_performance = df.groupby('session_start_weekday')['final_change_percent'].agg(['mean', 'count'])
# The groupby only yields rows for weekdays that actually occur in the data.
# Plotting fewer than seven values against range(7) raises a shape-mismatch
# error, so align to all seven days first; days without sessions become NaN
# and are simply not drawn.
# NOTE(review): assumes session_start_weekday is encoded 0=Monday .. 6=Sunday,
# as the tick labels below imply — confirm against data_science_utils.
weekly_performance = weekly_performance.reindex(range(7))

axes[1].bar(range(7), weekly_performance['mean'], alpha=0.7, color='lightgreen')
axes[1].axhline(0, color='red', linestyle='--', alpha=0.7)
axes[1].set_xlabel('Day of Week')
axes[1].set_ylabel('Average Final Change (%)')
axes[1].set_title('Performance by Day of Week')
axes[1].set_xticks(range(7))
axes[1].set_xticklabels(weekday_names)

plt.tight_layout()
plt.show()

## 4. Correlation Analysis

In [None]:
# Correlation matrix of key features
correlation_features = [
    'threshold_value', 'alert_direction', 'monitoring_window_minutes',
    'final_change_percent', 'max_change_percent', 'min_change_percent',
    'volatility_stdev', 'positive_move_ratio', 'session_start_hour',
    'actual_duration_seconds', 'total_data_points'
]

# Only correlate the columns that this dataset actually contains
available_features = [f for f in correlation_features if f in df.columns]
corr_matrix = df[available_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0,
            square=True, fmt='.2f', cbar_kws={"shrink": .8})
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Key correlations with final performance
target_feature = 'final_change_percent'
if target_feature in corr_matrix.columns:
    # Drop the target's correlation with itself (always 1.0); otherwise it
    # occupies the top slot of the ranking without carrying any information.
    # The ranking is by absolute value, so the sign of each correlation is
    # not shown here — read it from the heatmap above.
    final_change_corr = (
        corr_matrix[target_feature]
        .drop(labels=target_feature)
        .abs()
        .sort_values(ascending=False)
    )
    print("\nStrongest correlations with final_change_percent:")
    print(final_change_corr.head(10))
else:
    # The target was filtered out of available_features above, so there is
    # nothing to rank (indexing corr_matrix would raise KeyError).
    print("\nColumn 'final_change_percent' not found; skipping correlation ranking.")

## 5. Time Series Analysis (Sample)

In [None]:
# Load time series data and analyze a few interesting sessions:
# top row    -> price path of the best and the worst session,
# bottom row -> average price path for 'up' and for 'down' alerts.
ts_df = processor.to_price_timeseries_dataframe()

if not ts_df.empty:
    # Find sessions with extreme performance (sorted ascending: worst first)
    session_stats = df.groupby('session_id')['final_change_percent'].first().sort_values()

    # Plot best and worst performing sessions
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Best performer
    best_session = session_stats.index[-1]
    best_data = ts_df[ts_df['session_id'] == best_session]
    if not best_data.empty:
        axes[0,0].plot(best_data['normalized_time'], best_data['change_from_initial'])
        axes[0,0].set_title(f'Best Performer: {best_data.iloc[0]["symbol"]} ({session_stats.iloc[-1]:.2f}%)')
        axes[0,0].set_xlabel('Normalized Time (0=start, 1=end)')
        axes[0,0].set_ylabel('Change from Initial (%)')
        axes[0,0].axhline(0, color='red', linestyle='--', alpha=0.5)
        axes[0,0].grid(True, alpha=0.3)

    # Worst performer
    worst_session = session_stats.index[0]
    worst_data = ts_df[ts_df['session_id'] == worst_session]
    if not worst_data.empty:
        axes[0,1].plot(worst_data['normalized_time'], worst_data['change_from_initial'], color='red')
        axes[0,1].set_title(f'Worst Performer: {worst_data.iloc[0]["symbol"]} ({session_stats.iloc[0]:.2f}%)')
        axes[0,1].set_xlabel('Normalized Time (0=start, 1=end)')
        axes[0,1].set_ylabel('Change from Initial (%)')
        axes[0,1].axhline(0, color='red', linestyle='--', alpha=0.5)
        axes[0,1].grid(True, alpha=0.3)

    # Average trajectory for up vs down alerts
    # NOTE(review): this filters on 'threshold_type' while the distribution
    # plots use 'alert_direction_name' — confirm that 'threshold_type' holds
    # the literal values 'up' / 'down'.
    time_bins = np.linspace(0, 1, 21)  # 20 equal-width intervals on [0, 1]
    time_centers = (time_bins[:-1] + time_bins[1:]) / 2
    last_bin = len(time_bins) - 2

    for idx, alert_type in enumerate(['up', 'down']):
        alert_sessions = df[df['threshold_type'] == alert_type]['session_id'].tolist()
        alert_ts_data = ts_df[ts_df['session_id'].isin(alert_sessions)]

        if not alert_ts_data.empty:
            times = alert_ts_data['normalized_time']
            binned_data = []

            for i, (lower, upper) in enumerate(zip(time_bins[:-1], time_bins[1:])):
                # Bins are half-open [lower, upper), except the last one,
                # which is closed so that samples at normalized_time == 1.0
                # (the end of a session) are not silently dropped.
                if i == last_bin:
                    mask = (times >= lower) & (times <= upper)
                else:
                    mask = (times >= lower) & (times < upper)

                if mask.any():
                    binned_data.append(alert_ts_data[mask]['change_from_initial'].mean())
                else:
                    # Keep the slot so binned_data stays aligned with time_centers
                    binned_data.append(np.nan)

            axes[1,idx].plot(time_centers, binned_data, marker='o',
                             label=f'Average trajectory ({len(alert_sessions)} sessions)')
            axes[1,idx].set_title(f'Average Trajectory: {alert_type.upper()} Alerts')
            axes[1,idx].set_xlabel('Normalized Time')
            axes[1,idx].set_ylabel('Average Change from Initial (%)')
            axes[1,idx].axhline(0, color='red', linestyle='--', alpha=0.5)
            axes[1,idx].grid(True, alpha=0.3)
            axes[1,idx].legend()

    plt.tight_layout()
    plt.show()
else:
    print("No time series data available")

## 6. Machine Learning Preparation

In [None]:
# Prepare data for machine learning: fit a baseline random forest that
# predicts final_change_percent and report which features it relies on.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Smallest dataset for which a train/test split is attempted
MIN_SAMPLES = 10

# Get feature matrix and target
X, y = processor.get_feature_matrix_for_ml(target_column='final_change_percent')

# NOTE(review): the fallback feature names below include max/min change and
# volatility, which look like they are measured over the same post-alert window
# as the target. If so, the model sees information that is unavailable at alert
# time (target leakage) — confirm before trusting the scores.

if len(X) >= MIN_SAMPLES:  # ">=" so the check matches the "at least" message below
    print(f"Feature matrix shape: {X.shape}")
    print(f"Target vector shape: {y.shape}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features (fit on the training split only, then reuse on the test split)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train a simple model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Predictions
    y_pred = model.predict(X_test_scaled)

    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("\nModel Performance:")
    print(f"MSE: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")

    # Feature importance
    n_features = X.shape[1]
    if hasattr(X, 'columns'):
        # Prefer the real column names so labels cannot drift out of sync
        # with the column order of the feature matrix.
        feature_names = [str(column) for column in X.columns]
    else:
        # Fallback labels for a plain array.
        # NOTE(review): these assume a fixed column order — verify against
        # get_feature_matrix_for_ml.
        feature_names = [
            'start_hour', 'weekday', 'duration', 'threshold', 'alert_dir', 'window_min',
            'initial_price', 'max_change', 'min_change', 'volatility', 'variance',
            'avg_abs_change', 'positive_ratio', 'data_points', 'data_freq'
        ][:n_features]
        # Pad with generic names when there are more columns than labels;
        # otherwise the DataFrame below fails on mismatched lengths.
        feature_names += [f'feature_{i}' for i in range(len(feature_names), n_features)]

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    top_features = importance_df.head(10)
    print("\nTop 10 Most Important Features:")
    print(top_features)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    # barh puts position 0 at the bottom; flip so the most important feature is on top
    plt.gca().invert_yaxis()
    plt.xlabel('Feature Importance')
    plt.title('Top 10 Feature Importances for Predicting Final Change')
    plt.tight_layout()
    plt.show()

else:
    print(f"Insufficient data for ML analysis. Found {len(X)} samples, need at least {MIN_SAMPLES}.")

## 7. Export Data for External Analysis

In [None]:
# Write the processed datasets out as CSV via the project processor
export_dir = "analysis_exports"
processor.export_to_csv(export_dir)

# Summary for the reader. The file names are the ones this notebook lists;
# they are not checked against what export_to_csv actually wrote.
print(
    "\n📁 Data exported to 'analysis_exports/' directory",
    "Files created:",
    "- tracking_analytics.csv: Main analytics dataset",
    "- price_timeseries.csv: Detailed price movements",
    "- symbol_statistics.csv: Summary stats by symbol",
    "\nThese files can be used in other tools like Excel, R, or other Python environments.",
    sep="\n",
)

## Conclusions and Next Steps

This analysis provides insights into:
1. **Symbol Performance**: Which cryptocurrencies perform better after alerts
2. **Temporal Patterns**: Best times of day/week for alert effectiveness
3. **Volatility Analysis**: Relationship between volatility and performance
4. **Predictive Features**: Which factors are most important for predicting outcomes

### Potential Extensions:
- **Deep Learning**: LSTM models for time series prediction
- **Clustering**: Group similar price movement patterns
- **Anomaly Detection**: Identify unusual market behaviors
- **Strategy Optimization**: Use ML to optimize alert thresholds
- **Real-time Prediction**: Deploy models for live trading decisions