# Global Financial Market Data Harmonization Project

## Problem Statement
Despite post-2008 financial crisis reforms in derivatives trading and reporting, financial regulators face significant challenges in obtaining a comprehensive view of market activity due to:
- Inconsistent data formats across different jurisdictions
- Varying reporting standards between exchanges
- Multiple currency denominations
- Lack of standardized data elements
- Difficulties in cross-border data sharing and analysis

## Objective
Develop a machine learning solution to:
1. Harmonize financial market data across different exchanges
2. Create standardized reporting formats
3. Enable real-time risk assessment across global markets
4. Facilitate regulatory oversight through unified data analysis



1. RSI → Momentum and Reversal Points
2. MACD → Trend Changes and Strength
3. Volume → Confirmation of Price Moves
4. Volatility → Risk Levels
5. Bollinger Bands → Price Channels and Extremes

In [5]:
!pip install keras tensorflow --upgrade



In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')

In [46]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# ML models and preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, IsolationForest
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

class MarketAnalyzer:
    def __init__(self, csv_path):
        """Initialize with CSV file path"""
        self.csv_path = csv_path
        self.df = None
        self.models = {}
        self.predictions = {}
        self.scaler = MinMaxScaler()

    def load_data(self):
        """Load and prepare CSV data"""
        try:
            # Read CSV file
            self.df = pd.read_csv(self.csv_path)
            print(f"Successfully loaded data with shape: {self.df.shape}")
            print("\nColumns found:", self.df.columns.tolist())

            # Convert Date column
            if 'Date' in self.df.columns:
                self.df['Date'] = pd.to_datetime(self.df['Date'])
            else:
                # Try to find date column with different case
                date_col = [col for col in self.df.columns if col.lower() == 'date']
                if date_col:
                    self.df['Date'] = pd.to_datetime(self.df[date_col[0]])
                    self.df = self.df.rename(columns={date_col[0]: 'Date'})

            # Sort by date
            self.df = self.df.sort_values('Date')
            print("Data sorted by date.")
            return self.df

        except Exception as e:
            print(f"Error loading CSV file: {str(e)}")
            raise

    def create_features(self):
        """Create technical indicators for analysis"""
        df = self.df.copy()

        # Basic price features
        df['Returns'] = df['Close'].pct_change()
        df['Log_Returns'] = np.log(df['Close']/df['Close'].shift(1))

        # Moving averages
        for window in [5, 20, 50]:
            df[f'MA{window}'] = df['Close'].rolling(window=window).mean()

        # Volatility
        df['Volatility'] = df['Returns'].rolling(window=20).std()

        # RSI
        delta = df['Close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        df['RSI'] = 100 - (100 / (1 + rs))

        # MACD
        exp1 = df['Close'].ewm(span=12, adjust=False).mean()
        exp2 = df['Close'].ewm(span=26, adjust=False).mean()
        df['MACD'] = exp1 - exp2

        # Bollinger Bands
        df['BB_middle'] = df['Close'].rolling(window=20).mean()
        df['BB_upper'] = df['BB_middle'] + 2*df['Close'].rolling(window=20).std()
        df['BB_lower'] = df['BB_middle'] - 2*df['Close'].rolling(window=20).std()

        # Volume features
        df['Volume_MA'] = df['Volume'].rolling(window=20).mean()
        df['Volume_Rate'] = df['Volume']/df['Volume_MA']

        # Drop NaN values
        self.df = df.dropna()

        print(f"Features created. Data now has {self.df.shape[0]} rows and {self.df.shape[1]} columns.")
        print(self.df.head())  # Inspecting the data after feature creation

        return self.df

    def prepare_sequences(self, sequence_length=10):
        """Prepare sequences for LSTM"""
        features = ['Returns', 'Log_Returns', 'MA5', 'MA20', 'MA50',
                    'Volatility', 'RSI', 'MACD', 'Volume_Rate']

        # Scale features
        scaled_data = self.scaler.fit_transform(self.df[features])
        print(f"Scaled data shape: {scaled_data.shape}")

        X = []
        y = []

        for i in range(len(scaled_data) - sequence_length):
            X.append(scaled_data[i:(i + sequence_length)])
            y.append(self.df['Returns'].values[i + sequence_length])

        X, y = np.array(X), np.array(y)
        print(f"Prepared {X.shape[0]} sequences for LSTM model.")
        return X, y

    def build_lstm_model(self, input_shape):
        """Build LSTM model"""
        model = Sequential([
            LSTM(100, return_sequences=True, input_shape=input_shape),
            Dropout(0.2),
            LSTM(50, return_sequences=False),
            Dropout(0.2),
            Dense(25),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mse', metrics=['mae'])
        return model

    def train_models(self, X_lstm, y):
        """Train multiple ML models"""
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X_lstm, y, test_size=0.2, shuffle=False)

        # Train LSTM
        lstm_model = self.build_lstm_model(X_lstm.shape[1:])
        lstm_history = lstm_model.fit(
            X_train, y_train,
            epochs=50,
            batch_size=32,
            validation_split=0.1,
            verbose=1
        )
        self.models['LSTM'] = lstm_model
        self.predictions['LSTM'] = lstm_model.predict(X_test)

        # Train traditional ML models
        features = ['Returns', 'Log_Returns', 'MA5', 'MA20', 'Volatility', 'RSI', 'MACD']
        X_trad = self.df[features].values[len(X_lstm):]
        y_trad = y

        models_config = {
            'RF': RandomForestRegressor(n_estimators=100, random_state=42),
            'XGB': XGBRegressor(n_estimators=100, learning_rate=0.1),
            'SVR': SVR(kernel='rbf'),
            'GB': GradientBoostingRegressor(n_estimators=100)
        }

        for name, model in models_config.items():
            model.fit(X_trad[:-len(X_test)], y_trad[:-len(X_test)])
            self.models[name] = model
            self.predictions[name] = model.predict(X_trad[-len(X_test):])

        return y_test, lstm_history

    def detect_anomalies(self):
        """Detect market anomalies"""
        features = ['Returns', 'Volatility', 'Volume_Rate']
        iso_forest = IsolationForest(contamination=0.1, random_state=42)
        return iso_forest.fit_predict(self.df[features]) == -1

    def plot_results(self, y_test, anomalies, history):
        """Create visualizations"""
        plt.style.use('seaborn')
        fig = plt.figure(figsize=(20, 15))

        # Model predictions
        ax1 = plt.subplot(321)
        ax1.plot(y_test, label='Actual', color='black')
        for name, pred in self.predictions.items():
            ax1.plot(pred, label=name, alpha=0.7)
        ax1.set_title('Model Predictions Comparison')
        ax1.legend()

        # Performance metrics
        ax2 = plt.subplot(322)
        metrics = {name: {
            'MSE': mean_squared_error(y_test, pred),
            'R2': r2_score(y_test, pred)
        } for name, pred in self.predictions.items()}

        model_names = list(metrics.keys())
        mse_scores = [m['MSE'] for m in metrics.values()]
        r2_scores = [m['R2'] for m in metrics.values()]

        x = np.arange(len(model_names))
        width = 0.35
        ax2.bar(x - width/2, mse_scores, width, label='MSE')
        ax2.bar(x + width/2, r2_scores, width, label='R2')
        ax2.set_xticks(x)
        ax2.set_xticklabels(model_names, rotation=45)
        ax2.set_title('Model Performance Metrics')
        ax2.legend()

        # Anomaly detection
        ax3 = plt.subplot(323)
        ax3.scatter(self.df.index[anomalies],
                   self.df['Close'][anomalies],
                   color='red',
                   label='Anomaly')
        ax3.plot(self.df.index, self.df['Close'],
                color='blue',
                alpha=0.7,
                label='Price')
        ax3.set_title('Anomaly Detection')
        ax3.legend()

        # Feature correlations
        ax4 = plt.subplot(324)
        features = ['Returns', 'Volatility', 'RSI', 'MACD', 'Volume_Rate']
        sns.heatmap(self.df[features].corr(), annot=True, cmap='coolwarm', ax=ax4)
        ax4.set_title('Feature Correlations')

        plt.tight_layout()
        plt.show()

# Example of calling plot_results after model training
def main(csv_path):
    """Main execution function"""
    print(f"Starting analysis of {csv_path}")

    # Initialize analyzer
    analyzer = MarketAnalyzer(csv_path)

    # Load and process data
    print("Loading data...")
    analyzer.load_data()

    print("Creating features...")
    analyzer.create_features()

    # Prepare sequences for LSTM model
    X_lstm, y = analyzer.prepare_sequences()

    # Train models
    y_test, lstm_history = analyzer.train_models(X_lstm, y)

    # Detect anomalies
    anomalies = analyzer.detect_anomalies()

    # Plot results after model training
    analyzer.plot_results(y_test, anomalies, lstm_history)

if __name__ == "__main__":
    # Specify your CSV file path
    csv_file = "/indexProcessed.csv"  # Replace with your CSV file path

    # Run analysis
    main(csv_file)


if __name__ == "__main__":
    # Specify your CSV file path
    csv_file = "/indexProcessed.csv"  # Replace with your CSV file path

    # Run analysis
    analyzer = main(csv_file)


Starting analysis of /indexProcessed.csv
Loading data...
Successfully loaded data with shape: (104224, 9)

Columns found: ['Index', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'CloseUSD']
Data sorted by date.
Creating features...
Features created. Data now has 89866 rows and 22 columns.
        Index       Date         Open         High          Low        Close  \
25896    IXIC 1984-10-11   244.199997   244.699997   244.199997   244.699997   
13206     NYA 1984-10-11   993.609985   993.609985   993.609985   993.609985   
25897    IXIC 1984-10-12   245.500000   246.199997   245.500000   246.199997   
13207     NYA 1984-10-12  1002.280029  1002.280029  1002.280029  1002.280029   
67345  GSPTSE 1984-10-12  2370.600098  2388.100098  2370.600098  2386.699951   

         Adj Close      Volume     CloseUSD   Returns  ...         MA20  \
25896   244.699997  62860000.0   244.699997 -0.896572  ...  3464.469513   
13206   993.609985         0.0   993.609985  3.060523  ...  34

ValueError: Found array with 0 sample(s) (shape=(0, 7)) while a minimum of 1 is required by RandomForestRegressor.